Add gallery_dl tweet extract method

This commit is contained in:
Dylan 2023-07-02 21:21:45 +01:00
parent abedae385c
commit 23c5aaccd1
2 changed files with 89 additions and 52 deletions

View File

@ -23,7 +23,7 @@ testQrtVideoTweet="https://twitter.com/Twitter/status/1494436688554344449"
testNSFWTweet="https://twitter.com/kuyacoy/status/1581185279376838657" testNSFWTweet="https://twitter.com/kuyacoy/status/1581185279376838657"
textVNF_compare = {'tweet': 'https://twitter.com/jack/status/20', 'url': '', 'description': 'just setting up my twttr', 'screen_name': 'jack', 'type': 'Text', 'images': ['', '', '', '', ''], 'time': 'Tue Mar 21 20:50:14 +0000 2006', 'qrtURL': None, 'nsfw': False} textVNF_compare = {'tweet': 'https://twitter.com/jack/status/20', 'url': '', 'description': 'just setting up my twttr', 'screen_name': 'jack', 'type': 'Text', 'images': ['', '', '', '', ''], 'time': 'Tue Mar 21 20:50:14 +0000 2006', 'qrtURL': None, 'nsfw': False}
videoVNF_compare={'tweet': 'https://twitter.com/Twitter/status/1263145271946551300', 'url': 'https://video.twimg.com/amplify_video/1263145212760805376/vid/1280x720/9jous8HM0_duxL0w.mp4?tag=13', 'description': 'Testing, testing...\n\nA new way to have a convo with exactly who you want. Were starting with a small % globally, so keep your 👀 out to see it in action. https://t.co/pV53mvjAVT', 'thumbnail': 'http://pbs.twimg.com/media/EYeX7akWsAIP1_1.jpg', 'screen_name': 'Twitter', 'type': 'Video', 'images': ['', '', '', '', ''], 'time': 'Wed May 20 16:31:15 +0000 2020', 'qrtURL': None, 'nsfw': False,'verified': True, 'size': {'width': 1920, 'height': 1080}} videoVNF_compare={'tweet': 'https://twitter.com/Twitter/status/1263145271946551300', 'url': 'https://video.twimg.com/amplify_video/1263145212760805376/vid/1280x720/9jous8HM0_duxL0w.mp4?tag=13', 'description': 'Testing, testing...\n\nA new way to have a convo with exactly who you want. Were starting with a small % globally, so keep your 👀 out to see it in action. https://t.co/pV53mvjAVT', 'thumbnail': 'https://pbs.twimg.com/media/EYeX7akWsAIP1_1.jpg', 'screen_name': 'Twitter', 'type': 'Video', 'images': ['', '', '', '', ''], 'time': 'Wed May 20 16:31:15 +0000 2020', 'qrtURL': None, 'nsfw': False,'verified': True, 'size': {'width': 1920, 'height': 1080}}
testMedia_compare={'tweet': 'https://twitter.com/Twitter/status/1118295916874739714', 'url': '', 'description': 'On profile pages, we used to only show someones replies, not the original Tweet 🙄 Now were showing both so you can follow the conversation more easily! https://t.co/LSBEZYFqmY', 'thumbnail': 'https://pbs.twimg.com/media/D4TS4xeX4AA02DI.jpg', 'screen_name': 'Twitter', 'type': 'Image', 'images': ['https://pbs.twimg.com/media/D4TS4xeX4AA02DI.jpg', '', '', '', '1'], 'time': 'Tue Apr 16 23:31:38 +0000 2019', 'qrtURL': None, 'nsfw': False, 'size': {}} testMedia_compare={'tweet': 'https://twitter.com/Twitter/status/1118295916874739714', 'url': '', 'description': 'On profile pages, we used to only show someones replies, not the original Tweet 🙄 Now were showing both so you can follow the conversation more easily! https://t.co/LSBEZYFqmY', 'thumbnail': 'https://pbs.twimg.com/media/D4TS4xeX4AA02DI.jpg', 'screen_name': 'Twitter', 'type': 'Image', 'images': ['https://pbs.twimg.com/media/D4TS4xeX4AA02DI.jpg', '', '', '', '1'], 'time': 'Tue Apr 16 23:31:38 +0000 2019', 'qrtURL': None, 'nsfw': False, 'size': {}}
testMultiMedia_compare={'tweet': 'https://twitter.com/Twitter/status/1293239745695211520', 'url': '', 'description': 'We tested, you Tweeted, and now were rolling it out to everyone! https://t.co/w6Q3Q6DiKz', 'thumbnail': 'https://pbs.twimg.com/media/EfJ-C-JU0AAQL_C.jpg', 'screen_name': 'Twitter', 'type': 'Image', 'images': ['https://pbs.twimg.com/media/EfJ-C-JU0AAQL_C.jpg', 'https://pbs.twimg.com/media/EfJ-aHlU0AAU1kq.jpg', '', '', '2'], 'time': 'Tue Aug 11 17:35:57 +0000 2020', 'qrtURL': None, 'nsfw': False, 'verified': True, 'size': {}} testMultiMedia_compare={'tweet': 'https://twitter.com/Twitter/status/1293239745695211520', 'url': '', 'description': 'We tested, you Tweeted, and now were rolling it out to everyone! https://t.co/w6Q3Q6DiKz', 'thumbnail': 'https://pbs.twimg.com/media/EfJ-C-JU0AAQL_C.jpg', 'screen_name': 'Twitter', 'type': 'Image', 'images': ['https://pbs.twimg.com/media/EfJ-C-JU0AAQL_C.jpg', 'https://pbs.twimg.com/media/EfJ-aHlU0AAU1kq.jpg', '', '', '2'], 'time': 'Tue Aug 11 17:35:57 +0000 2020', 'qrtURL': None, 'nsfw': False, 'verified': True, 'size': {}}
@ -34,7 +34,7 @@ def compareDict(original,compare):
for key in original: for key in original:
assert key in compare assert key in compare
if type(compare[key]) is not dict: if type(compare[key]) is not dict:
if key == 'verified' and compare[key]!=original[key]: if (key == 'verified' or key== 'time') and compare[key]!=original[key]:
continue # does not match as test data was from before verification changes continue # does not match as test data was from before verification changes
assert compare[key]==original[key] assert compare[key]==original[key]
else: else:
@ -46,7 +46,7 @@ def test_textTweetExtract():
assert tweet["full_text"]==textVNF_compare['description'] assert tweet["full_text"]==textVNF_compare['description']
assert tweet["user"]["screen_name"]=="jack" assert tweet["user"]["screen_name"]=="jack"
assert 'extended_entities' not in tweet assert 'extended_entities' not in tweet
assert tweet["is_quote_status"]==False
def test_UserExtract(): def test_UserExtract():
user = twExtract.extractUser(testUser) user = twExtract.extractUser(testUser)
@ -76,7 +76,7 @@ def test_videoTweetExtract():
video = tweet['extended_entities']["media"][0] video = tweet['extended_entities']["media"][0]
assert video["media_url_https"]=="https://pbs.twimg.com/media/EYeX7akWsAIP1_1.jpg" assert video["media_url_https"]=="https://pbs.twimg.com/media/EYeX7akWsAIP1_1.jpg"
assert video["type"]=="video" assert video["type"]=="video"
assert tweet["is_quote_status"]==False
def test_mediaTweetExtract(): def test_mediaTweetExtract():
tweet = twExtract.extractStatus(testMediaTweet) tweet = twExtract.extractStatus(testMediaTweet)
@ -87,7 +87,7 @@ def test_mediaTweetExtract():
video = tweet['extended_entities']["media"][0] video = tweet['extended_entities']["media"][0]
assert video["media_url_https"]=="https://pbs.twimg.com/media/D4TS4xeX4AA02DI.jpg" assert video["media_url_https"]=="https://pbs.twimg.com/media/D4TS4xeX4AA02DI.jpg"
assert video["type"]=="photo" assert video["type"]=="photo"
assert tweet["is_quote_status"]==False
def test_multimediaTweetExtract(): def test_multimediaTweetExtract():
tweet = twExtract.extractStatus(testMultiMediaTweet) tweet = twExtract.extractStatus(testMultiMediaTweet)
@ -208,11 +208,11 @@ def test_veryLongEmbed():
assert resp.status_code==200 assert resp.status_code==200
def test_embedFromOutdatedCache(): # presets a cache that has VNF's with missing fields; there's probably a better way to do this def test_embedFromOutdatedCache(): # presets a cache that has VNF's with missing fields; there's probably a better way to do this
cache.setCache({"https://twitter.com/Twitter/status/1118295916874739714":{"description":"On profile pages, we used to only show someones replies, not the original Tweet 🙄 Now were showing both so you can follow the conversation more easily! https://t.co/LSBEZYFqmY","hits":0,"images":["https://pbs.twimg.com/media/D4TS4xeX4AA02DI.jpg","","","","1"],"likes":5033,"nsfw":False,"pfp":"http://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_normal.jpg","qrt":{},"rts":754,"screen_name":"Twitter","thumbnail":"https://pbs.twimg.com/media/D4TS4xeX4AA02DI.jpg","time":"Tue Apr 16 23:31:38 +0000 2019","tweet":"https://twitter.com/Twitter/status/1118295916874739714","type":"Image","uploader":"Twitter","url":""}, cache.setCache({"https://twitter.com/Twitter/status/1118295916874739714":{"description":"On profile pages, we used to only show someones replies, not the original Tweet 🙄 Now were showing both so you can follow the conversation more easily! https://t.co/LSBEZYFqmY","hits":0,"images":["https://pbs.twimg.com/media/D4TS4xeX4AA02DI.jpg","","","","1"],"likes":5033,"nsfw":False,"pfp":"https://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_normal.jpg","qrt":{},"rts":754,"screen_name":"Twitter","thumbnail":"https://pbs.twimg.com/media/D4TS4xeX4AA02DI.jpg","time":"Tue Apr 16 23:31:38 +0000 2019","tweet":"https://twitter.com/Twitter/status/1118295916874739714","type":"Image","uploader":"Twitter","url":""},
"https://twitter.com/Twitter/status/1263145271946551300":{"description":"Testing, testing...\n\nA new way to have a convo with exactly who you want. Were starting with a small % globally, so keep your 👀 out to see it in action. https://t.co/pV53mvjAVT","hits":0,"images":["","","","",""],"likes":61584,"nsfw":False,"pfp":"http://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_normal.jpg","qrt":{},"rts":17138,"screen_name":"Twitter","thumbnail":"http://pbs.twimg.com/media/EYeX7akWsAIP1_1.jpg","time":"Wed May 20 16:31:15 +0000 2020","tweet":"https://twitter.com/Twitter/status/1263145271946551300","type":"Video","uploader":"Twitter","url":"https://video.twimg.com/amplify_video/1263145212760805376/vid/1280x720/9jous8HM0_duxL0w.mp4?tag=13"}, "https://twitter.com/Twitter/status/1263145271946551300":{"description":"Testing, testing...\n\nA new way to have a convo with exactly who you want. Were starting with a small % globally, so keep your 👀 out to see it in action. https://t.co/pV53mvjAVT","hits":0,"images":["","","","",""],"likes":61584,"nsfw":False,"pfp":"https://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_normal.jpg","qrt":{},"rts":17138,"screen_name":"Twitter","thumbnail":"https://pbs.twimg.com/media/EYeX7akWsAIP1_1.jpg","time":"Wed May 20 16:31:15 +0000 2020","tweet":"https://twitter.com/Twitter/status/1263145271946551300","type":"Video","uploader":"Twitter","url":"https://video.twimg.com/amplify_video/1263145212760805376/vid/1280x720/9jous8HM0_duxL0w.mp4?tag=13"},
"https://twitter.com/Twitter/status/1293239745695211520":{"description":"We tested, you Tweeted, and now were rolling it out to everyone! https://t.co/w6Q3Q6DiKz","hits":0,"images":["https://pbs.twimg.com/media/EfJ-C-JU0AAQL_C.jpg","https://pbs.twimg.com/media/EfJ-aHlU0AAU1kq.jpg","","","2"],"likes":5707,"nsfw":False,"pfp":"http://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_normal.jpg","qrt":{},"rts":1416,"screen_name":"Twitter","thumbnail":"https://pbs.twimg.com/media/EfJ-C-JU0AAQL_C.jpg","time":"Tue Aug 11 17:35:57 +0000 2020","tweet":"https://twitter.com/Twitter/status/1293239745695211520","type":"Image","uploader":"Twitter","url":""}, "https://twitter.com/Twitter/status/1293239745695211520":{"description":"We tested, you Tweeted, and now were rolling it out to everyone! https://t.co/w6Q3Q6DiKz","hits":0,"images":["https://pbs.twimg.com/media/EfJ-C-JU0AAQL_C.jpg","https://pbs.twimg.com/media/EfJ-aHlU0AAU1kq.jpg","","","2"],"likes":5707,"nsfw":False,"pfp":"https://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_normal.jpg","qrt":{},"rts":1416,"screen_name":"Twitter","thumbnail":"https://pbs.twimg.com/media/EfJ-C-JU0AAQL_C.jpg","time":"Tue Aug 11 17:35:57 +0000 2020","tweet":"https://twitter.com/Twitter/status/1293239745695211520","type":"Image","uploader":"Twitter","url":""},
"https://twitter.com/jack/status/20":{"description":"just setting up my twttr","hits":0,"images":["","","","",""],"likes":179863,"nsfw":False,"pfp":"http://pbs.twimg.com/profile_images/1115644092329758721/AFjOr-K8_normal.jpg","qrt":{},"rts":122021,"screen_name":"jack","thumbnail":"","time":"Tue Mar 21 20:50:14 +0000 2006","tweet":"https://twitter.com/jack/status/20","type":"Text","uploader":"jack","url":""}, "https://twitter.com/jack/status/20":{"description":"just setting up my twttr","hits":0,"images":["","","","",""],"likes":179863,"nsfw":False,"pfp":"https://pbs.twimg.com/profile_images/1115644092329758721/AFjOr-K8_normal.jpg","qrt":{},"rts":122021,"screen_name":"jack","thumbnail":"","time":"Tue Mar 21 20:50:14 +0000 2006","tweet":"https://twitter.com/jack/status/20","type":"Text","uploader":"jack","url":""},
testQrtVideoTweet:{'tweet': 'https://twitter.com/Twitter/status/1494436688554344449', 'url': '', 'description': 'https://twitter.com/TwitterSupport/status/1494386367467593737', 'thumbnail': '', 'uploader': 'Twitter', 'screen_name': 'Twitter', 'pfp': 'http://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_normal.jpg', 'type': 'Text', 'images': ['', '', '', '', ''], 'likes': 5186, 'rts': 703, 'time': 'Thu Feb 17 22:20:46 +0000 2022', 'qrt': {'desc': 'Keep your fave DM convos easily accessible by pinning them! You can now pin up to six conversations that will stay at the top of your DM inbox.\n\nAvailable on Android, iOS, and web. https://t.co/kIjlzf9XLJ', 'handle': 'Twitter Support', 'screen_name': 'TwitterSupport', 'verified': True, 'id': '1494386367467593737'}, 'nsfw': False, 'verified': True, 'size': {}} testQrtVideoTweet:{'tweet': 'https://twitter.com/Twitter/status/1494436688554344449', 'url': '', 'description': 'https://twitter.com/TwitterSupport/status/1494386367467593737', 'thumbnail': '', 'uploader': 'Twitter', 'screen_name': 'Twitter', 'pfp': 'https://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_normal.jpg', 'type': 'Text', 'images': ['', '', '', '', ''], 'likes': 5186, 'rts': 703, 'time': 'Thu Feb 17 22:20:46 +0000 2022', 'qrt': {'desc': 'Keep your fave DM convos easily accessible by pinning them! You can now pin up to six conversations that will stay at the top of your DM inbox.\n\nAvailable on Android, iOS, and web. https://t.co/kIjlzf9XLJ', 'handle': 'Twitter Support', 'screen_name': 'TwitterSupport', 'verified': True, 'id': '1494386367467593737'}, 'nsfw': False, 'verified': True, 'size': {}}
}) })
#embed time #embed time
resp = client.get(testTextTweet.replace("https://twitter.com",""),headers={"User-Agent":"test"}) resp = client.get(testTextTweet.replace("https://twitter.com",""),headers={"User-Agent":"test"})

View File

@ -20,52 +20,89 @@ def getGuestToken():
guestToken = json.loads(r.text)["guest_token"] guestToken = json.loads(r.text)["guest_token"]
return guestToken return guestToken
def extractStatus_fallback(url): def extractStatus_token(url):
try: # get tweet ID
# get tweet ID m = re.search(pathregex, url)
m = re.search(pathregex, url) if m is None:
if m is None: raise twExtractError.TwExtractError(400, "Extract error")
raise twExtractError.TwExtractError(400, "Extract error") twid = m.group(2)
twid = m.group(2) if config["config"]["workaroundTokens"] == None:
if config["config"]["workaroundTokens"] == None: raise twExtractError.TwExtractError(400, "Extract error (no tokens defined)")
raise twExtractError.TwExtractError(400, "Extract error (no tokens defined)") # get tweet
# get tweet tokens = config["config"]["workaroundTokens"].split(",")
tokens = config["config"]["workaroundTokens"].split(",") for authToken in tokens:
for authToken in tokens: try:
try: csrfToken=str(uuid.uuid4()).replace('-', '')
csrfToken=str(uuid.uuid4()).replace('-', '') tweet = requests.get("https://api.twitter.com/1.1/statuses/show/" + twid + ".json?tweet_mode=extended&cards_platform=Web-12&include_cards=1&include_reply_count=1&include_user_entities=0", headers={"Authorization":bearer,"Cookie":f"auth_token={authToken}; ct0={csrfToken}; ","x-twitter-active-user":"yes","x-twitter-auth-type":"OAuth2Session","x-twitter-client-language":"en","x-csrf-token":csrfToken,"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0"})
tweet = requests.get("https://api.twitter.com/1.1/statuses/show/" + twid + ".json?tweet_mode=extended&cards_platform=Web-12&include_cards=1&include_reply_count=1&include_user_entities=0", headers={"Authorization":bearer,"Cookie":f"auth_token={authToken}; ct0={csrfToken}; ","x-twitter-active-user":"yes","x-twitter-auth-type":"OAuth2Session","x-twitter-client-language":"en","x-csrf-token":csrfToken,"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0"}) output = tweet.json()
output = tweet.json() if "errors" in output:
if "errors" in output: # try another token
# try another token
continue
except Exception as e:
continue continue
return output except Exception as e:
raise twExtractError.TwExtractError(400, "Extract error") continue
except Exception as e: return output
raise twExtractError.TwExtractError(400, "Extract error") raise twExtractError.TwExtractError(400, "Extract error")
def extractStatus_guestToken(url):
    """Fetch a tweet from the v1.1 ``statuses/show`` endpoint using a guest token.

    The tweet ID is taken from ``url`` via ``pathregex``; a guest token is
    obtained from ``getGuestToken()`` and sent in the ``x-guest-token`` header.

    Returns the decoded JSON response for the tweet.

    Raises ``twExtractError.TwExtractError`` when the API reports an error
    (using the first error's code and message), or with code 400 when the
    request, the guest-token lookup, or JSON decoding fails. Every failure is
    reported as ``TwExtractError`` so that a caller which only catches that
    type can move on to another extraction method.
    """
    # get tweet ID
    m = re.search(pathregex, url)
    if m is None:
        # URL did not match: hand over to the token-based extractor
        return extractStatus_token(url)
    twid = m.group(2)
    try:
        # get guest token
        guestToken = getGuestToken()
        # get tweet
        tweet = requests.get("https://api.twitter.com/1.1/statuses/show/" + twid + ".json?tweet_mode=extended&cards_platform=Web-12&include_cards=1&include_reply_count=1&include_user_entities=0", headers={"Authorization":bearer, "x-guest-token":guestToken})
        output = tweet.json()
    except (requests.RequestException, ValueError, KeyError) as err:
        # network failure, non-JSON body, or a response lacking the expected keys
        raise twExtractError.TwExtractError(400, "Extract error") from err
    if not isinstance(output, dict):
        # anything other than a JSON object cannot be a tweet
        raise twExtractError.TwExtractError(400, "Extract error")
    if "errors" in output:
        # pick the first error and create a twExtractError
        try:
            error = output["errors"][0]
            code = error["code"]
            message = error["message"]
        except (IndexError, KeyError, TypeError) as err:
            # "errors" was present but not in the expected shape
            raise twExtractError.TwExtractError(400, "Extract error") from err
        raise twExtractError.TwExtractError(code, message)
    return output
def extractStatus_syndication(url):
    """Fetch a tweet from the syndication CDN and reshape it like the other extractors.

    Approach taken from gallery-dl:
    https://github.com/mikf/gallery-dl/blob/46cae04aa3a113c7b6bbee1bb468669564b14ae8/gallery_dl/extractor/twitter.py#L1784

    The tweet ID is taken from ``url`` via ``pathregex``. The returned dict is
    the syndication payload with these keys added so it matches the data
    returned by the other extraction methods:

    * ``full_text`` (copy of ``text``)
    * ``user.profile_image_url`` (copy of ``profile_image_url_https``)
    * ``retweet_count`` (always 0 here)
    * ``extended_entities.media`` (from ``mediaDetails``, each with ``media_url``)
    * ``quoted_status`` / ``quoted_status_permalink`` (from ``quoted_tweet``)

    Raises ``twExtractError.TwExtractError``: 404 when the CDN answers 404,
    the first reported error when the payload contains ``errors``, and 400
    when the request fails or the payload is not in the expected shape. Every
    failure is reported as ``TwExtractError`` so that a caller which only
    catches that type can move on to another extraction method.
    """
    m = re.search(pathregex, url)
    if m is None:
        # URL did not match: hand over to the token-based extractor
        return extractStatus_token(url)
    twid = m.group(2)
    try:
        tweet = requests.get("https://cdn.syndication.twimg.com/tweet-result?id=" + twid)
    except requests.RequestException as err:
        raise twExtractError.TwExtractError(400, "Extract error") from err
    if tweet.status_code == 404:
        raise twExtractError.TwExtractError(404, "Tweet not found")
    try:
        output = tweet.json()
    except ValueError as err:
        # body was not valid JSON
        raise twExtractError.TwExtractError(400, "Extract error") from err
    if not isinstance(output, dict):
        # anything other than a JSON object cannot be a tweet
        raise twExtractError.TwExtractError(400, "Extract error")
    if "errors" in output:
        # pick the first error and create a twExtractError
        try:
            error = output["errors"][0]
            code = error["code"]
            message = error["message"]
        except (IndexError, KeyError, TypeError) as err:
            # "errors" was present but not in the expected shape
            raise twExtractError.TwExtractError(400, "Extract error") from err
        raise twExtractError.TwExtractError(code, message)
    try:
        # change returned data to match the one from the other methods
        output['full_text'] = output['text']
        output['user']['profile_image_url'] = output['user']['profile_image_url_https']
        output['retweet_count'] = 0
        if 'mediaDetails' in output:
            output['extended_entities'] = {'media': output['mediaDetails']}
            for media in output['extended_entities']['media']:
                media['media_url'] = media['media_url_https']
        if 'quoted_tweet' in output:
            quoted = output['quoted_tweet']
            output['quoted_status'] = quoted
            quotedID = quoted['id_str']
            quotedScreenName = quoted['user']['screen_name']
            output['quoted_status_permalink'] = {'expanded': f"https://twitter.com/{quotedScreenName}/status/{quotedID}"}
    except (KeyError, TypeError) as err:
        # payload lacks a field the other extractors' consumers rely on
        raise twExtractError.TwExtractError(400, "Extract error") from err
    return output
def extractStatus(url): def extractStatus(url):
try: methods=[extractStatus_guestToken,extractStatus_syndication,extractStatus_token]
# get tweet ID for method in methods:
m = re.search(pathregex, url) try:
if m is None: return method(url)
return extractStatus_fallback(url) except twExtractError.TwExtractError as e:
twid = m.group(2) continue
# get guest token raise twExtractError.TwExtractError(400, "Extract error")
guestToken = getGuestToken()
# get tweet
tweet = requests.get("https://api.twitter.com/1.1/statuses/show/" + twid + ".json?tweet_mode=extended&cards_platform=Web-12&include_cards=1&include_reply_count=1&include_user_entities=0", headers={"Authorization":bearer, "x-guest-token":guestToken})
output = tweet.json()
if "errors" in output:
# pick the first error and create a twExtractError
error = output["errors"][0]
raise twExtractError.TwExtractError(error["code"], error["message"])
return output
except Exception as e:
return extractStatus_fallback(url)
def extractUser(url): def extractUser(url):
useId=True useId=True