Commit 2d638de0 authored by Himanshu Dabas, committed by GitHub

fix for deprecation of v1.1 endpoints (#944)

parent 421a155a
@@ -309,3 +309,6 @@ def run_as_command():
sys.exit(0)
main()
if __name__ == '__main__':
main()
@@ -81,3 +81,5 @@ class Config:
TranslateDest: str = "en"
Backoff_exponent: float = 3.0
Min_wait_time: int = 0
Bearer_token: str = None
Guest_token: str = None
@@ -4,32 +4,39 @@ from json import loads
import logging as logme
class NoMoreTweetsException(Exception):
def __init__(self, msg):
super().__init__(msg)
def Follow(response):
logme.debug(__name__+':Follow')
logme.debug(__name__ + ':Follow')
soup = BeautifulSoup(response, "html.parser")
follow = soup.find_all("td", "info fifty screenname")
cursor = soup.find_all("div", "w-button-more")
try:
cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
except IndexError:
logme.critical(__name__+':Follow:IndexError')
logme.critical(__name__ + ':Follow:IndexError')
return follow, cursor
def Mobile(response):
logme.debug(__name__+':Mobile')
logme.debug(__name__ + ':Mobile')
soup = BeautifulSoup(response, "html.parser")
tweets = soup.find_all("span", "metadata")
max_id = soup.find_all("div", "w-button-more")
try:
max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
except Exception as e:
logme.critical(__name__+':Mobile:' + str(e))
logme.critical(__name__ + ':Mobile:' + str(e))
return tweets, max_id
def MobileFav(response):
soup = BeautifulSoup(response, "html.parser")
tweets = soup.find_all("table", "tweet")
max_id = soup.find_all("div", "w-button-more")
@@ -40,8 +47,9 @@ def MobileFav(response):
return tweets, max_id
def profile(response):
logme.debug(__name__+':profile')
logme.debug(__name__ + ':profile')
json_response = loads(response)
html = json_response["items_html"]
soup = BeautifulSoup(html, "html.parser")
@@ -49,10 +57,54 @@ def profile(response):
return feed, feed[-1]["data-item-id"]
def Json(response):
logme.debug(__name__+':Json')
logme.debug(__name__ + ':Json')
json_response = loads(response)
html = json_response["items_html"]
soup = BeautifulSoup(html, "html.parser")
feed = soup.find_all("div", "tweet")
return feed, json_response["min_position"]
def search_v2(response):
# TODO need to implement this
response = loads(response)
if len(response['globalObjects']['tweets']) == 0:
msg = 'No more data. finished scraping!!'
raise NoMoreTweetsException(msg)
# need to modify things at the function call end
# timeline = response['timeline']['instructions'][0]['addEntries']['entries']
feed = []
feed_set = set()
# here we need to remove the quoted and `to-reply` tweets from the list as they may or may not contain the
# for _id in response['globalObjects']['tweets']:
# if 'quoted_status_id_str' in response['globalObjects']['tweets'][_id] or \
# response['globalObjects']['tweets'][_id]['in_reply_to_status_id_str']:
# try:
# feed_set.add(response['globalObjects']['tweets'][_id]['quoted_status_id_str'])
# except KeyError:
# feed_set.add(response['globalObjects']['tweets'][_id]['in_reply_to_status_id_str'])
# i = 1
# for _id in response['globalObjects']['tweets']:
# if _id not in feed_set:
# temp_obj = response['globalObjects']['tweets'][_id]
# temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']]
# feed.append(temp_obj)
for timeline_entry in response['timeline']['instructions'][0]['addEntries']['entries']:
# this will handle the cases when the timeline entry is a tweet
if timeline_entry['entryId'].find('sq-I-t-') == 0:
_id = timeline_entry['content']['item']['content']['tweet']['id']
temp_obj = response['globalObjects']['tweets'][_id]
temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']]
feed.append(temp_obj)
try:
next_cursor = response['timeline']['instructions'][0]['addEntries']['entries'][-1]['content'][
'operation']['cursor']['value']
except KeyError:
# this is needed because after the first request location of cursor is changed
next_cursor = response['timeline']['instructions'][-1]['replaceEntry']['entry']['content']['operation'][
'cursor']['value']
return feed, next_cursor
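For orientation, a minimal hand-built sketch of the payload shape that search_v2 expects; the ids and values below are placeholders, not real API data:

from json import dumps

_sample = {
    'globalObjects': {
        'tweets': {'1234': {'id_str': '1234', 'user_id_str': '42', 'full_text': 'hello'}},
        'users': {'42': {'screen_name': 'example'}},
    },
    'timeline': {'instructions': [{'addEntries': {'entries': [
        {'entryId': 'sq-I-t-1234',
         'content': {'item': {'content': {'tweet': {'id': '1234'}}}}},
        {'entryId': 'sq-cursor-bottom',
         'content': {'operation': {'cursor': {'value': 'scroll:placeholder'}}}},
    ]}}]},
}
# feed: list of tweet dicts with 'user_data' attached; cursor: value for the next request
feed, cursor = search_v2(dumps(_sample))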
@@ -37,8 +37,9 @@ def Tweet(config, t):
logme.debug(__name__+':Tweet:notFormat')
output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} "
if t.retweet:
output += "RT "
# TODO: someone who is familiar with this code needs to take a look at what this is <also see tweet.py>
# if t.retweet:
# output += "RT "
output += f"<{t.username}> {t.tweet}"
...
@@ -8,28 +8,40 @@ from fake_useragent import UserAgent
import asyncio
import concurrent.futures
import random
from json import loads
from json import loads, dumps
from aiohttp_socks import ProxyConnector, ProxyType
from urllib.parse import quote
from . import url
from .output import Tweets, Users
from .user import inf
from .token import TokenExpiryException
import logging as logme
httpproxy = None
user_agent_list = [
#'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
#'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
#'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
#'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
#'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
#'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
#'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
#'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
#'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
#'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/60.0.3112.113 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/60.0.3112.90 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/60.0.3112.90 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/60.0.3112.90 Safari/537.36',
# 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/44.0.2403.157 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/60.0.3112.113 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/57.0.2987.133 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/57.0.2987.133 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/55.0.2883.87 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
@@ -42,11 +54,19 @@ user_agent_list = [
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET '
'CLR 3.5.30729)',
]
# convert a python `dict` to JSON and then percent-encode it so it can be passed in the url as a parameter
# some endpoints require this format
def dict_to_url(dct):
return quote(dumps(dct))
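A quick illustration of what dict_to_url produces, using hypothetical GraphQL variables:

# illustrative only: GraphQL `variables` encoded for use inside a URL
_variables = {'userId': '783214', 'withHighlightedLabel': False}
print(dict_to_url(_variables))
# -> %7B%22userId%22%3A%20%22783214%22%2C%20%22withHighlightedLabel%22%3A%20false%7D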
def get_connector(config):
logme.debug(__name__+':get_connector')
logme.debug(__name__ + ':get_connector')
_connector = None
if config.Proxy_host:
if config.Proxy_host.lower() == "tor":
@@ -73,82 +93,92 @@ def get_connector(config):
port=config.Proxy_port,
rdns=True)
else:
logme.critical(__name__+':get_connector:proxy-port-type-error')
logme.critical(__name__ + ':get_connector:proxy-port-type-error')
print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
sys.exit(1)
else:
if config.Proxy_port or config.Proxy_type:
logme.critical(__name__+':get_connector:proxy-host-arg-error')
logme.critical(__name__ + ':get_connector:proxy-host-arg-error')
print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
sys.exit(1)
return _connector
async def RequestUrl(config, init, headers = []):
logme.debug(__name__+':RequestUrl')
async def RequestUrl(config, init, headers=[]):
logme.debug(__name__ + ':RequestUrl')
_connector = get_connector(config)
_serialQuery = ""
params = []
_url = ""
_headers = {}
# TODO : do this later
if config.Profile:
if config.Profile_full:
logme.debug(__name__+':RequestUrl:Profile_full')
logme.debug(__name__ + ':RequestUrl:Profile_full')
_url = await url.MobileProfile(config.Username, init)
else:
logme.debug(__name__+':RequestUrl:notProfile_full')
logme.debug(__name__ + ':RequestUrl:notProfile_full')
_url = await url.Profile(config.Username, init)
_serialQuery = _url
elif config.TwitterSearch:
logme.debug(__name__+':RequestUrl:TwitterSearch')
logme.debug(__name__ + ':RequestUrl:TwitterSearch')
_url, params, _serialQuery = await url.Search(config, init)
_headers = [("authorization", config.Bearer_token), ("x-guest-token", config.Guest_token)]
else:
if config.Following:
logme.debug(__name__+':RequestUrl:Following')
logme.debug(__name__ + ':RequestUrl:Following')
_url = await url.Following(config.Username, init)
elif config.Followers:
logme.debug(__name__+':RequestUrl:Followers')
logme.debug(__name__ + ':RequestUrl:Followers')
_url = await url.Followers(config.Username, init)
else:
logme.debug(__name__+':RequestUrl:Favorites')
logme.debug(__name__ + ':RequestUrl:Favorites')
_url = await url.Favorites(config.Username, init)
_serialQuery = _url
response = await Request(_url, params=params, connector=_connector, headers=headers)
response = await Request(_url, params=params, connector=_connector, headers=_headers)
if config.Debug:
print(_serialQuery, file=open("twint-request_urls.log", "a", encoding="utf-8"))
return response
def ForceNewTorIdentity(config):
logme.debug(__name__+':ForceNewTorIdentity')
logme.debug(__name__ + ':ForceNewTorIdentity')
try:
tor_c = socket.create_connection(('127.0.0.1', config.Tor_control_port))
tor_c.send('AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(config.Tor_control_password).encode())
response = tor_c.recv(1024)
if response != b'250 OK\r\n250 OK\r\n':
sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(response))
logme.critical(__name__+':ForceNewTorIdentity:unexpectedResponse')
logme.critical(__name__ + ':ForceNewTorIdentity:unexpectedResponse')
except Exception as e:
logme.debug(__name__+':ForceNewTorIdentity:errorConnectingTor')
logme.debug(__name__ + ':ForceNewTorIdentity:errorConnectingTor')
sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(e)))
sys.stderr.write('If you want to rotate Tor ports automatically - enable Tor control port\n')
async def Request(url, connector=None, params=[], headers=[]):
logme.debug(__name__+':Request:Connector')
async def Request(_url, connector=None, params=None, headers=None):
logme.debug(__name__ + ':Request:Connector')
async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
return await Response(session, url, params)
return await Response(session, _url, params)
async def Response(session, url, params=[]):
logme.debug(__name__+':Response')
async def Response(session, _url, params=None):
logme.debug(__name__ + ':Response')
with timeout(120):
async with session.get(url, ssl=True, params=params, proxy=httpproxy) as response:
return await response.text()
async with session.get(_url, ssl=True, params=params, proxy=httpproxy) as response:
resp = await response.text()
if response.status == 429: # 429 implies Too many requests i.e. Rate Limit Exceeded
raise TokenExpiryException(loads(resp)['errors'][0]['message'])
return resp
async def RandomUserAgent(wa=None):
logme.debug(__name__+':RandomUserAgent')
logme.debug(__name__ + ':RandomUserAgent')
try:
if wa:
return "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
@@ -156,43 +186,61 @@ async def RandomUserAgent(wa=None):
except:
return random.choice(user_agent_list)
async def Username(_id):
logme.debug(__name__+':Username')
url = f"https://twitter.com/intent/user?user_id={_id}&lang=en"
r = await Request(url, headers={"X-Requested-With": "XMLHttpRequest"})
soup = BeautifulSoup(r, "html.parser")
return soup.find("a", "fn url alternate-context")["href"].replace("/", "")
async def Username(_id, bearer_token, guest_token):
logme.debug(__name__ + ':Username')
_dct = {'userId': _id, 'withHighlightedLabel': False}
_url = "https://api.twitter.com/graphql/B9FuNQVmyx32rdbIPEZKag/UserByRestId?variables={}".format(dict_to_url(_dct))
_headers = {
'authorization': bearer_token,
'x-guest-token': guest_token,
}
r = await Request(_url, headers=_headers)
j_r = loads(r)
username = j_r['data']['user']['legacy']['screen_name']
return username
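The parsing above assumes a UserByRestId payload roughly of this shape (a hedged sketch with placeholder values, trimmed to the fields the code reads):

_sample_payload = {
    'data': {
        'user': {
            'rest_id': '783214',  # placeholder id
            'legacy': {'screen_name': 'example', 'name': 'Example'},
        }
    }
}
# loads(r)['data']['user']['legacy']['screen_name'] -> 'example'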
async def Tweet(url, config, conn):
logme.debug(__name__+':Tweet')
logme.debug(__name__ + ':Tweet')
try:
response = await Request(url)
soup = BeautifulSoup(response, "html.parser")
tweets = soup.find_all("div", "tweet")
await Tweets(tweets, config, conn, url)
except Exception as e:
logme.critical(__name__+':Tweet:' + str(e))
logme.critical(__name__ + ':Tweet:' + str(e))
async def User(url, config, conn, user_id = False):
logme.debug(__name__+':User')
_connector = get_connector(config)
async def User(username, config, conn, bearer_token, guest_token, user_id=False):
logme.debug(__name__ + ':User')
_dct = {'screen_name': username, 'withHighlightedLabel': False}
_url = 'https://api.twitter.com/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables={}'\
.format(dict_to_url(_dct))
_headers = {
'authorization': bearer_token,
'x-guest-token': guest_token,
}
try:
response = await Request(url, connector=_connector, headers={"X-Requested-With": "XMLHttpRequest"})
soup = BeautifulSoup(response, "html.parser")
response = await Request(_url, headers=_headers)
j_r = loads(response)
if user_id:
return int(inf(soup, "id"))
await Users(soup, config, conn)
_id = j_r['data']['user']['rest_id']
return _id
await Users(j_r, config, conn)
except Exception as e:
logme.critical(__name__+':User:' + str(e))
logme.critical(__name__ + ':User:' + str(e))
raise
def Limit(Limit, count):
logme.debug(__name__+':Limit')
logme.debug(__name__ + ':Limit')
if Limit is not None and count >= int(Limit):
return True
async def Multi(feed, config, conn):
logme.debug(__name__+':Multi')
logme.debug(__name__ + ':Multi')
count = 0
try:
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
@@ -201,27 +249,27 @@ async def Multi(feed, config, conn):
for tweet in feed:
count += 1
if config.Favorites or config.Profile_full:
logme.debug(__name__+':Multi:Favorites-profileFull')
logme.debug(__name__ + ':Multi:Favorites-profileFull')
link = tweet.find("a")["href"]
url = f"https://twitter.com{link}&lang=en"
elif config.User_full:
logme.debug(__name__+':Multi:userFull')
logme.debug(__name__ + ':Multi:userFull')
username = tweet.find("a")["name"]
url = f"http://twitter.com/{username}?lang=en"
else:
logme.debug(__name__+':Multi:else-url')
logme.debug(__name__ + ':Multi:else-url')
link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"]
url = f"https://twitter.com{link}?lang=en"
if config.User_full:
logme.debug(__name__+':Multi:user-full-Run')
logme.debug(__name__ + ':Multi:user-full-Run')
futures.append(loop.run_in_executor(executor, await User(url,
config, conn)))
config, conn)))
else:
logme.debug(__name__+':Multi:notUser-full-Run')
logme.debug(__name__ + ':Multi:notUser-full-Run')
futures.append(loop.run_in_executor(executor, await Tweet(url,
config, conn)))
logme.debug(__name__+':Multi:asyncioGather')
config, conn)))
logme.debug(__name__ + ':Multi:asyncioGather')
await asyncio.gather(*futures)
except Exception as e:
# TODO: fix error not error
...
@@ -17,19 +17,22 @@ author_list.pop()
# used by Pandas
_follows_object = {}
def _formatDateTime(datetimestamp):
try:
return int(datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S").timestamp())
except ValueError:
return int(datetime.strptime(datetimestamp, "%Y-%m-%d").timestamp())
def _clean_follow_list():
logme.debug(__name__+':clean_follow_list')
logme.debug(__name__ + ':clean_follow_list')
global _follows_object
_follows_object = {}
def clean_lists():
logme.debug(__name__+':clean_lists')
logme.debug(__name__ + ':clean_lists')
global follows_list
global tweets_list
global users_list
@@ -37,55 +40,61 @@ def clean_lists():
tweets_list = []
users_list = []
def datecheck(datetimestamp, config):
logme.debug(__name__+':datecheck')
logme.debug(__name__ + ':datecheck')
if config.Since:
logme.debug(__name__+':datecheck:SinceTrue')
logme.debug(__name__ + ':datecheck:SinceTrue')
d = _formatDateTime(datetimestamp)
s = _formatDateTime(config.Since)
if d < s:
return False
return False
if config.Until:
logme.debug(__name__+':datecheck:UntilTrue')
logme.debug(__name__ + ':datecheck:UntilTrue')
d = _formatDateTime(datetimestamp)
s = _formatDateTime(config.Until)
if d > s:
return False
logme.debug(__name__+':datecheck:dateRangeFalse')
return False
logme.debug(__name__ + ':datecheck:dateRangeFalse')
return True
# TODO In this method we need to delete the quoted tweets, because twitter also sends the quoted tweets in the
# `tweets` list along with the other tweets
def is_tweet(tw):
try:
tw["data-item-id"]
logme.debug(__name__+':is_tweet:True')
logme.debug(__name__ + ':is_tweet:True')
return True
except:
logme.critical(__name__+':is_tweet:False')
logme.critical(__name__ + ':is_tweet:False')
return False
def _output(obj, output, config, **extra):
logme.debug(__name__+':_output')
logme.debug(__name__ + ':_output')
if config.Lowercase:
if isinstance(obj, str):
logme.debug(__name__+':_output:Lowercase:username')
logme.debug(__name__ + ':_output:Lowercase:username')
obj = obj.lower()
elif obj.__class__.__name__ == "user":
logme.debug(__name__+':_output:Lowercase:user')
logme.debug(__name__ + ':_output:Lowercase:user')
pass
elif obj.__class__.__name__ == "tweet":
logme.debug(__name__+':_output:Lowercase:tweet')
logme.debug(__name__ + ':_output:Lowercase:tweet')
obj.username = obj.username.lower()
author_list.update({obj.username})
for i in range(len(obj.mentions)):
obj.mentions[i] = obj.mentions[i].lower()
for i in range(len(obj.hashtags)):
obj.hashtags[i] = obj.hashtags[i].lower()
for i in range(len(obj.cashtags)):
obj.cashtags[i] = obj.cashtags[i].lower()
# TODO : don't know what cashtags are, <also modify in tweet.py>
# for i in range(len(obj.cashtags)):
# obj.cashtags[i] = obj.cashtags[i].lower()
else:
logme.info('_output:Lowercase:hiddenTweetFound')
print("[x] Hidden tweet found, account suspended due to violation of TOS")
@@ -94,93 +103,95 @@ def _output(obj, output, config, **extra):
if config.Store_csv:
try:
write.Csv(obj, config)
logme.debug(__name__+':_output:CSV')
logme.debug(__name__ + ':_output:CSV')
except Exception as e:
logme.critical(__name__+':_output:CSV:Error:' + str(e))
logme.critical(__name__ + ':_output:CSV:Error:' + str(e))
print(str(e) + " [x] output._output")
elif config.Store_json:
write.Json(obj, config)
logme.debug(__name__+':_output:JSON')
logme.debug(__name__ + ':_output:JSON')
else:
write.Text(output, config.Output)
logme.debug(__name__+':_output:Text')
logme.debug(__name__ + ':_output:Text')
if config.Elasticsearch:
logme.debug(__name__+':_output:Elasticsearch')
logme.debug(__name__ + ':_output:Elasticsearch')
print("", end=".", flush=True)
else:
if not config.Hide_output:
try:
print(output.replace('\n', ' '))
except UnicodeEncodeError:
logme.critical(__name__+':_output:UnicodeEncodeError')
logme.critical(__name__ + ':_output:UnicodeEncodeError')
print("unicode error [x] output._output")
async def checkData(tweet, config, conn):
logme.debug(__name__+':checkData')
copyright = tweet.find("div", "StreamItemContent--withheld")
if copyright is None and is_tweet(tweet):
tweet = Tweet(tweet, config)
logme.debug(__name__ + ':checkData')
if not tweet.datestamp:
logme.critical(__name__+':checkData:hiddenTweetFound')
print("[x] Hidden tweet found, account suspended due to violation of TOS")
return
tweet = Tweet(tweet, config)
if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
output = format.Tweet(config, tweet)
if not tweet.datestamp:
logme.critical(__name__ + ':checkData:hiddenTweetFound')
print("[x] Hidden tweet found, account suspended due to violation of TOS")
return
if config.Database:
logme.debug(__name__+':checkData:Database')
db.tweets(conn, tweet, config)
if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
output = format.Tweet(config, tweet)
if config.Pandas:
logme.debug(__name__+':checkData:Pandas')
panda.update(tweet, config)
if config.Database:
logme.debug(__name__ + ':checkData:Database')
db.tweets(conn, tweet, config)
if config.Store_object:
logme.debug(__name__+':checkData:Store_object')
if hasattr(config.Store_object_tweets_list, 'append'):
config.Store_object_tweets_list.append(tweet)
else:
tweets_list.append(tweet)
if config.Pandas:
logme.debug(__name__ + ':checkData:Pandas')
panda.update(tweet, config)
if config.Elasticsearch:
logme.debug(__name__+':checkData:Elasticsearch')
elasticsearch.Tweet(tweet, config)
if config.Store_object:
logme.debug(__name__ + ':checkData:Store_object')
if hasattr(config.Store_object_tweets_list, 'append'):
config.Store_object_tweets_list.append(tweet)
else:
tweets_list.append(tweet)
if config.Elasticsearch:
logme.debug(__name__ + ':checkData:Elasticsearch')
elasticsearch.Tweet(tweet, config)
_output(tweet, output, config)
# else:
# logme.critical(__name__+':checkData:copyrightedTweet')
_output(tweet, output, config)
else:
logme.critical(__name__+':checkData:copyrightedTweet')
async def Tweets(tweets, config, conn, url=''):
logme.debug(__name__+':Tweets')
logme.debug(__name__ + ':Tweets')
if config.Favorites or config.Profile_full or config.Location:
logme.debug(__name__+':Tweets:fav+full+loc')
logme.debug(__name__ + ':Tweets:fav+full+loc')
for tw in tweets:
if tw['data-item-id'] == url.split('?')[0].split('/')[-1]:
await checkData(tw, config, conn)
elif config.TwitterSearch:
logme.debug(__name__+':Tweets:TwitterSearch')
logme.debug(__name__ + ':Tweets:TwitterSearch')
await checkData(tweets, config, conn)
else:
logme.debug(__name__+':Tweets:else')
logme.debug(__name__ + ':Tweets:else')
if int(tweets["data-user-id"]) == config.User_id or config.Retweets:
await checkData(tweets, config, conn)
async def Users(u, config, conn):
logme.debug(__name__+':User')
logme.debug(__name__ + ':User')
global users_list
user = User(u)
output = format.User(config.Format, user)
if config.Database:
logme.debug(__name__+':User:Database')
logme.debug(__name__ + ':User:Database')
db.user(conn, config, user)
if config.Elasticsearch:
logme.debug(__name__+':User:Elasticsearch')
logme.debug(__name__ + ':User:Elasticsearch')
_save_date = user.join_date
_save_time = user.join_time
user.join_date = str(datetime.strptime(user.join_date, "%d %b %Y")).split()[0]
@@ -190,49 +201,50 @@ async def Users(u, config, conn):
user.join_time = _save_time
if config.Store_object:
logme.debug(__name__+':User:Store_object')
logme.debug(__name__ + ':User:Store_object')
if hasattr(config.Store_object_follow_list, 'append'):
config.Store_object_follow_list.append(user)
elif hasattr(config.Store_object_users_list, 'append'):
config.Store_object_users_list.append(user)
else:
users_list.append(user) # twint.user.user
users_list.append(user) # twint.user.user
if config.Pandas:
logme.debug(__name__+':User:Pandas+user')
logme.debug(__name__ + ':User:Pandas+user')
panda.update(user, config)
_output(user, output, config)
async def Username(username, config, conn):
logme.debug(__name__+':Username')
logme.debug(__name__ + ':Username')
global _follows_object
global follows_list
follow_var = config.Following*"following" + config.Followers*"followers"
follow_var = config.Following * "following" + config.Followers * "followers"
if config.Database:
logme.debug(__name__+':Username:Database')
logme.debug(__name__ + ':Username:Database')
db.follow(conn, config.Username, config.Followers, username)
if config.Elasticsearch:
logme.debug(__name__+':Username:Elasticsearch')
logme.debug(__name__ + ':Username:Elasticsearch')
elasticsearch.Follow(username, config)
if config.Store_object:
if hasattr(config.Store_object_follow_list, 'append'):
config.Store_object_follow_list.append(username)
else:
follows_list.append(username) # twint.user.user
follows_list.append(username) # twint.user.user
if config.Pandas:
logme.debug(__name__+':Username:object+pandas')
logme.debug(__name__ + ':Username:object+pandas')
try:
_ = _follows_object[config.Username][follow_var]
except KeyError:
_follows_object.update({config.Username: {follow_var: []}})
_follows_object[config.Username][follow_var].append(username)
if config.Pandas_au:
logme.debug(__name__+':Username:object+pandas+au')
logme.debug(__name__ + ':Username:object+pandas+au')
panda.update(_follows_object[config.Username], config)
_output(username, username, config)
import sys, os, time, datetime
import sys, os, datetime
from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop
from . import datelock, feed, get, output, verbose, storage
from .token import TokenExpiryException
from . import token
from .storage import db
from .feed import NoMoreTweetsException
import logging as logme
import time
bearer = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs' \
'%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
class Twint:
def __init__(self, config):
logme.debug(__name__+':Twint:__init__')
logme.debug(__name__ + ':Twint:__init__')
if config.Resume is not None and (config.TwitterSearch or config.Followers or config.Following):
logme.debug(__name__+':Twint:__init__:Resume')
logme.debug(__name__ + ':Twint:__init__:Resume')
self.init = self.get_resume(config.Resume)
else:
self.init = '-1'
@@ -21,16 +28,21 @@ class Twint:
self.count = 0
self.user_agent = ""
self.config = config
self.config.Bearer_token = bearer
# TODO might have to make some adjustments for it to work with multi-treading
# USAGE : to get a new guest token simply do `self.token.refresh()`
self.token = token.Token(config)
self.token.refresh()
self.conn = db.Conn(config.Database)
self.d = datelock.Set(self.config.Until, self.config.Since)
verbose.Elastic(config.Elasticsearch)
if self.config.Store_object:
logme.debug(__name__+':Twint:__init__:clean_follow_list')
logme.debug(__name__ + ':Twint:__init__:clean_follow_list')
output._clean_follow_list()
if self.config.Pandas_clean:
logme.debug(__name__+':Twint:__init__:pandas_clean')
logme.debug(__name__ + ':Twint:__init__:pandas_clean')
storage.panda.clean()
def get_resume(self, resumeFile):
@@ -41,10 +53,17 @@ class Twint:
return _init
async def Feed(self):
logme.debug(__name__+':Twint:Feed')
logme.debug(__name__ + ':Twint:Feed')
consecutive_errors_count = 0
while True:
response = await get.RequestUrl(self.config, self.init, headers=[("User-Agent", self.user_agent)])
# this will receive a JSON string, parse it into a `dict` and do the required stuff
try:
response = await get.RequestUrl(self.config, self.init, headers=[("User-Agent", self.user_agent)])
except TokenExpiryException as e:
logme.debug(__name__ + ':Twint:Feed:' + str(e))
self.token.refresh()
response = await get.RequestUrl(self.config, self.init, headers=[("User-Agent", self.user_agent)])
if self.config.Debug:
print(response, file=open("twint-last-request.log", "w", encoding="utf-8"))
@@ -75,29 +94,36 @@ class Twint:
else:
self.feed, self.init = feed.profile(response)
elif self.config.TwitterSearch:
self.feed, self.init = feed.Json(response)
try:
self.feed, self.init = feed.search_v2(response)
except NoMoreTweetsException as e:
logme.debug(__name__ + ':Twint:Feed:' + str(e))
print(e, 'Is it though? Sometimes Twitter lies.')
break
except TimeoutError as e:
if self.config.Proxy_host.lower() == "tor":
print("[?] Timed out, changing Tor identity...")
if self.config.Tor_control_password is None:
logme.critical(__name__+':Twint:Feed:tor-password')
logme.critical(__name__ + ':Twint:Feed:tor-password')
sys.stderr.write("Error: config.Tor_control_password must be set for proxy autorotation!\r\n")
sys.stderr.write("Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors-controller-interface-directly\r\n")
sys.stderr.write(
"Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors-controller-interface-directly\r\n")
break
else:
get.ForceNewTorIdentity(self.config)
continue
else:
logme.critical(__name__+':Twint:Feed:' + str(e))
logme.critical(__name__ + ':Twint:Feed:' + str(e))
print(str(e))
break
except Exception as e:
if self.config.Profile or self.config.Favorites:
print("[!] Twitter does not return more data, scrape stops here.")
break
logme.critical(__name__+':Twint:Feed:noData' + str(e))
logme.critical(__name__ + ':Twint:Feed:noData' + str(e))
# Sometimes Twitter says there is no data. But it's a lie.
# raise
consecutive_errors_count += 1
if consecutive_errors_count < self.config.Retries_count:
# skip to the next iteration if wait time does not satisfy limit constraints
@@ -111,9 +137,10 @@ class Twint:
time.sleep(delay)
self.user_agent = await get.RandomUserAgent(wa=True)
continue
logme.critical(__name__+':Twint:Feed:Tweets_known_error:' + str(e))
logme.critical(__name__ + ':Twint:Feed:Tweets_known_error:' + str(e))
sys.stderr.write(str(e) + " [x] run.Feed")
sys.stderr.write("[!] if get this error but you know for sure that more tweets exist, please open an issue and we will investigate it!")
sys.stderr.write(
"[!] If you get this error but you know for sure that more tweets exist, please open an issue and we will investigate it!")
break
if self.config.Resume:
print(self.init, file=open(self.config.Resume, "a", encoding="utf-8"))
@@ -121,17 +148,17 @@ class Twint:
async def follow(self):
await self.Feed()
if self.config.User_full:
logme.debug(__name__+':Twint:follow:userFull')
logme.debug(__name__ + ':Twint:follow:userFull')
self.count += await get.Multi(self.feed, self.config, self.conn)
else:
logme.debug(__name__+':Twint:follow:notUserFull')
logme.debug(__name__ + ':Twint:follow:notUserFull')
for user in self.feed:
self.count += 1
username = user.find("a")["name"]
await output.Username(username, self.config, self.conn)
async def favorite(self):
logme.debug(__name__+':Twint:favorite')
logme.debug(__name__ + ':Twint:favorite')
await self.Feed()
favorited_tweets_list = []
for tweet in self.feed:
@@ -182,21 +209,22 @@ class Twint:
async def profile(self):
await self.Feed()
if self.config.Profile_full:
logme.debug(__name__+':Twint:profileFull')
logme.debug(__name__ + ':Twint:profileFull')
self.count += await get.Multi(self.feed, self.config, self.conn)
else:
logme.debug(__name__+':Twint:notProfileFull')
logme.debug(__name__ + ':Twint:notProfileFull')
for tweet in self.feed:
self.count += 1
await output.Tweets(tweet, self.config, self.conn)
async def tweets(self):
await self.Feed()
# TODO : need to take care of this later
if self.config.Location:
logme.debug(__name__+':Twint:tweets:location')
logme.debug(__name__ + ':Twint:tweets:location')
self.count += await get.Multi(self.feed, self.config, self.conn)
else:
logme.debug(__name__+':Twint:tweets:notLocation')
logme.debug(__name__ + ':Twint:tweets:notLocation')
for tweet in self.feed:
self.count += 1
await output.Tweets(tweet, self.config, self.conn)
@@ -217,75 +245,82 @@ class Twint:
self.user_agent = await get.RandomUserAgent()
if self.config.User_id is not None and self.config.Username is None:
logme.debug(__name__+':Twint:main:user_id')
self.config.Username = await get.Username(self.config.User_id)
logme.debug(__name__ + ':Twint:main:user_id')
self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
self.config.Guest_token)
if self.config.Username is not None and self.config.User_id is None:
logme.debug(__name__+':Twint:main:username')
url = f"https://twitter.com/{self.config.Username}?lang=en"
self.config.User_id = await get.User(url, self.config, self.conn, True)
logme.debug(__name__ + ':Twint:main:username')
self.config.User_id = await get.User(self.config.Username, self.config, self.conn,
self.config.Bearer_token,
self.config.Guest_token, True)
if self.config.User_id is None:
raise ValueError("Cannot find twitter account with name = " + self.config.Username)
# TODO : will need to modify it to work with the new endpoints
if self.config.TwitterSearch and self.config.Since and self.config.Until:
logme.debug(__name__+':Twint:main:search+since+until')
logme.debug(__name__ + ':Twint:main:search+since+until')
while self.d._since < self.d._until:
self.config.Since = str(self.d._since)
self.config.Until = str(self.d._until)
if len(self.feed) > 0:
await self.tweets()
else:
logme.debug(__name__+':Twint:main:gettingNewTweets')
logme.debug(__name__ + ':Twint:main:gettingNewTweets')
break
if get.Limit(self.config.Limit, self.count):
break
else:
logme.debug(__name__+':Twint:main:not-search+since+until')
logme.debug(__name__ + ':Twint:main:not-search+since+until')
while True:
if len(self.feed) > 0:
if self.config.Followers or self.config.Following:
logme.debug(__name__+':Twint:main:follow')
logme.debug(__name__ + ':Twint:main:follow')
await self.follow()
elif self.config.Favorites:
logme.debug(__name__+':Twint:main:favorites')
logme.debug(__name__ + ':Twint:main:favorites')
await self.favorite()
elif self.config.Profile:
logme.debug(__name__+':Twint:main:profile')
logme.debug(__name__ + ':Twint:main:profile')
await self.profile()
elif self.config.TwitterSearch:
logme.debug(__name__+':Twint:main:twitter-search')
logme.debug(__name__ + ':Twint:main:twitter-search')
await self.tweets()
else:
logme.debug(__name__+':Twint:main:no-more-tweets')
logme.debug(__name__ + ':Twint:main:no-more-tweets')
break
#logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
# logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
if get.Limit(self.config.Limit, self.count):
logme.debug(__name__+':Twint:main:reachedLimit')
logme.debug(__name__ + ':Twint:main:reachedLimit')
break
if self.config.Count:
verbose.Count(self.count, self.config)
def run(config, callback=None):
logme.debug(__name__+':run')
logme.debug(__name__ + ':run')
try:
get_event_loop()
except RuntimeError as e:
if "no current event loop" in str(e):
set_event_loop(new_event_loop())
else:
logme.exception(__name__+':Lookup:Unexpected exception while handling an expected RuntimeError.')
logme.exception(__name__ + ':run:Unexpected exception while handling an expected RuntimeError.')
raise
except Exception as e:
logme.exception(__name__+':Lookup:Unexpected exception occured while attempting to get or create a new event loop.')
logme.exception(
__name__ + ':run:Unexpected exception occurred while attempting to get or create a new event loop.')
raise
get_event_loop().run_until_complete(Twint(config).main(callback))
def Favorites(config):
logme.debug(__name__+':Favorites')
logme.debug(__name__ + ':Favorites')
config.Favorites = True
config.Following = False
config.Followers = False
@@ -296,8 +331,9 @@ def Favorites(config):
if config.Pandas_au:
storage.panda._autoget("tweet")
def Followers(config):
logme.debug(__name__+':Followers')
logme.debug(__name__ + ':Followers')
config.Followers = True
config.Following = False
config.Profile = False
@@ -310,11 +346,12 @@ def Followers(config):
if config.User_full:
storage.panda._autoget("user")
if config.Pandas_clean and not config.Store_object:
#storage.panda.clean()
# storage.panda.clean()
output._clean_follow_list()
def Following(config):
logme.debug(__name__+':Following')
logme.debug(__name__ + ':Following')
config.Following = True
config.Followers = False
config.Profile = False
@@ -327,11 +364,12 @@ def Following(config):
if config.User_full:
storage.panda._autoget("user")
if config.Pandas_clean and not config.Store_object:
#storage.panda.clean()
# storage.panda.clean()
output._clean_follow_list()
def Lookup(config):
logme.debug(__name__+':Lookup')
logme.debug(__name__ + ':Lookup')
try:
get_event_loop()
@@ -339,15 +377,16 @@ def Lookup(config):
if "no current event loop" in str(e):
set_event_loop(new_event_loop())
else:
logme.exception(__name__+':Lookup:Unexpected exception while handling an expected RuntimeError.')
logme.exception(__name__ + ':Lookup:Unexpected exception while handling an expected RuntimeError.')
raise
except Exception as e:
logme.exception(__name__+':Lookup:Unexpected exception occured while attempting to get or create a new event loop.')
logme.exception(
__name__ + ':Lookup:Unexpected exception occurred while attempting to get or create a new event loop.')
raise
try:
if config.User_id is not None:
logme.debug(__name__+':Twint:Lookup:user_id')
logme.debug(__name__ + ':Twint:Lookup:user_id')
config.Username = get_event_loop().run_until_complete(get.Username(config.User_id))
url = f"https://mobile.twitter.com/{config.Username}?prefetchTimestamp=" + str(int(time.time() * 1000))
@@ -357,15 +396,16 @@ def Lookup(config):
storage.panda._autoget("user")
except RuntimeError as e:
if "no current event loop" in str(e):
logme.exception(__name__+':Lookup:Previous attempt to to create an event loop failed.')
logme.exception(__name__ + ':Lookup:Previous attempt to create an event loop failed.')
raise
except Exception as e:
logme.exception(__name__+':Lookup:Unexpected exception occured.')
logme.exception(__name__ + ':Lookup:Unexpected exception occurred.')
raise
def Profile(config):
logme.debug(__name__+':Profile')
logme.debug(__name__ + ':Profile')
config.Profile = True
config.Favorites = False
config.Following = False
@@ -375,8 +415,9 @@ def Profile(config):
if config.Pandas_au:
storage.panda._autoget("tweet")
def Search(config, callback=None):
logme.debug(__name__+':Search')
logme.debug(__name__ + ':Search')
config.TwitterSearch = True
config.Favorites = False
config.Following = False
...
import re
import time
import requests
import logging as logme
class TokenExpiryException(Exception):
def __init__(self, msg):
super().__init__(msg)
class RefreshTokenException(Exception):
def __init__(self, msg):
super().__init__(msg)
class Token:
def __init__(self, config):
self._session = requests.Session()
self.config = config
self._retries = 5
self._timeout = 10
self.url = 'https://twitter.com'
def _request(self):
for attempt in range(self._retries + 1):
# The request is newly prepared on each retry because of potential cookie updates.
req = self._session.prepare_request(requests.Request('GET', self.url))
logme.debug(f'Retrieving {req.url}')
try:
r = self._session.send(req, allow_redirects=True, timeout=self._timeout)
except requests.exceptions.RequestException as exc:
if attempt < self._retries:
retrying = ', retrying'
level = logme.WARNING
else:
retrying = ''
level = logme.ERROR
logme.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
else:
success, msg = (True, None)
msg = f': {msg}' if msg else ''
if success:
logme.debug(f'{req.url} retrieved successfully{msg}')
return r
if attempt < self._retries:
# TODO : might wanna tweak this back-off timer
sleep_time = 2.0 * 2 ** attempt
logme.info(f'Waiting {sleep_time:.0f} seconds')
time.sleep(sleep_time)
else:
msg = f'{self._retries + 1} requests to {self.url} failed, giving up.'
logme.fatal(msg)
self.config.Guest_token = None
raise RefreshTokenException(msg)
def refresh(self):
logme.debug('Retrieving guest token')
res = self._request()
match = re.search(r'\("gt=(\d+);', res.text)
if match:
logme.debug('Found guest token in HTML')
self.config.Guest_token = str(match.group(1))
else:
self.config.Guest_token = None
raise RefreshTokenException('Could not find the Guest token in HTML')
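A minimal usage sketch; the _Cfg stand-in below is hypothetical, twint normally passes its own Config object (see run.py):

class _Cfg:
    Guest_token = None  # the only attribute refresh() writes

cfg = _Cfg()
tok = Token(cfg)
tok.refresh()            # GETs https://twitter.com and scrapes "gt=<digits>" from the HTML
print(cfg.Guest_token)   # set as a side effect; later sent as the x-guest-token header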
from time import strftime, localtime
from datetime import datetime
from datetime import datetime, timezone
import json
import logging as logme
@@ -9,6 +9,7 @@ from googletransx import Translator
# - https://github.com/x0rzkov/py-googletrans#basic-usage
translator = Translator()
class tweet:
"""Define Tweet class
"""
@@ -17,52 +18,63 @@ class tweet:
def __init__(self):
pass
def utc_to_local(utc_dt):
return utc_dt.replace(tzinfo=timezone.utc).astimezone(tz=None)
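For example, feeding utc_to_local a timestamp in Twitter's created_at format (the same parsing Tweet() does below; the sample string is illustrative):

_raw = 'Wed Oct 10 20:19:24 +0000 2018'  # illustrative created_at value
_dt = datetime.strptime(_raw, '%a %b %d %H:%M:%S %z %Y')
print(utc_to_local(_dt).strftime('%d-%m-%Y %H:%M:%S %Z'))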
def getMentions(tw):
"""Extract ment from tweet
"""Extract mentions from tweet
"""
logme.debug(__name__+':getMentions')
logme.debug(__name__ + ':getMentions')
mentions = []
try:
mentions = tw["data-mentions"].split(" ")
except:
for mention in tw['entities']['user_mentions']:
mentions.append(mention['screen_name'])
except KeyError:
mentions = []
return mentions
def getQuoteURL(tw):
"""Extract quote from tweet
"""
logme.debug(__name__+':getQuoteURL')
logme.debug(__name__ + ':getQuoteURL')
base_twitter = "https://twitter.com"
quote_url = ""
try:
quote = tw.find("div","QuoteTweet-innerContainer")
quote = tw.find("div", "QuoteTweet-innerContainer")
quote_url = base_twitter + quote.get("href")
except:
quote_url = ""
return quote_url
def getText(tw):
"""Replace some text
"""
logme.debug(__name__+':getText')
text = tw.find("p", "tweet-text").text
text = text.replace("http", " http")
text = text.replace("pic.twitter", " pic.twitter")
return text
# def getText(tw):
# """Replace some text
# """
# logme.debug(__name__ + ':getText')
# text = tw.find("p", "tweet-text").text
# text = text.replace("http", " http")
# text = text.replace("pic.twitter", " pic.twitter")
#
# return text
def getStat(tw, _type):
"""Get stats about Tweet
"""
logme.debug(__name__+':getStat')
logme.debug(__name__ + ':getStat')
st = f"ProfileTweet-action--{_type} u-hiddenVisually"
return tw.find("span", st).find("span")["data-tweet-stat-count"]
def getRetweet(tw, _config):
"""Get Retweet
"""
logme.debug(__name__+':getRetweet')
logme.debug(__name__ + ':getRetweet')
if _config.Profile:
if int(tw["data-user-id"]) != _config.User_id:
return _config.User_id, _config.Username
@@ -71,63 +83,159 @@ def getRetweet(tw, _config):
if _rt_object:
_rt_id = _rt_object.find('a')['data-user-id']
_rt_username = _rt_object.find('a')['href'][1:]
return _rt_id, _rt_username
return _rt_id, _rt_username
return '', ''
def getThumbnail(tw):
"""Get Thumbnail
"""
divs = tw.find_all("div","PlayableMedia-player")
thumb = ""
for div in divs:
thumb = div.attrs["style"].split("url('")[-1]
thumb = thumb.replace("')","")
return thumb
# def getThumbnail(tw):
# """Get Thumbnail
# """
# divs = tw.find_all("div", "PlayableMedia-player")
# thumb = ""
# for div in divs:
# thumb = div.attrs["style"].split("url('")[-1]
# thumb = thumb.replace("')", "")
# return thumb
# def Tweet(tw, config):
# """Create Tweet object
# """
# logme.debug(__name__+':Tweet')
# t = tweet()
# t.id = int(tw["data-item-id"])
# t.id_str = tw["data-item-id"]
# t.conversation_id = tw["data-conversation-id"]
# t.datetime = int(tw.find("span", "_timestamp")["data-time-ms"])
# t.datestamp = strftime("%Y-%m-%d", localtime(t.datetime/1000.0))
# t.timestamp = strftime("%H:%M:%S", localtime(t.datetime/1000.0))
# t.user_id = int(tw["data-user-id"])
# t.user_id_str = tw["data-user-id"]
# t.username = tw["data-screen-name"]
# t.name = tw["data-name"]
# t.place = tw.find("a","js-geo-pivot-link").text.strip() if tw.find("a","js-geo-pivot-link") else ""
# t.timezone = strftime("%z", localtime())
# for img in tw.findAll("img", "Emoji Emoji--forText"):
# img.replaceWith(img["alt"])
# t.mentions = getMentions(tw)
# t.urls = [link.attrs["data-expanded-url"] for link in tw.find_all('a',{'class':'twitter-timeline-link'}) if link.has_attr("data-expanded-url")]
# t.photos = [photo_node.attrs['data-image-url'] for photo_node in tw.find_all("div", "AdaptiveMedia-photoContainer")]
# t.video = 1 if tw.find_all("div", "AdaptiveMedia-video") != [] else 0
# t.thumbnail = getThumbnail(tw)
# t.tweet = getText(tw)
# t.lang = tw.find('p', 'tweet-text')['lang']
# t.hashtags = [hashtag.text for hashtag in tw.find_all("a","twitter-hashtag")]
# t.cashtags = [cashtag.text for cashtag in tw.find_all("a", "twitter-cashtag")]
# t.replies_count = getStat(tw, "reply")
# t.retweets_count = getStat(tw, "retweet")
# t.likes_count = getStat(tw, "favorite")
# t.link = f"https://twitter.com/{t.username}/status/{t.id}"
# t.user_rt_id, t.user_rt = getRetweet(tw, config)
# t.retweet = True if t.user_rt else False
# t.retweet_id = ''
# t.retweet_date = ''
# if not config.Profile:
# t.retweet_id = tw['data-retweet-id'] if t.user_rt else ''
# t.retweet_date = datetime.fromtimestamp(((int(t.retweet_id) >> 22) + 1288834974657)/1000.0).strftime("%Y-%m-%d %H:%M:%S") if t.user_rt else ''
# t.quote_url = getQuoteURL(tw)
# t.near = config.Near if config.Near else ""
# t.geo = config.Geo if config.Geo else ""
# t.source = config.Source if config.Source else ""
# t.reply_to = [{'user_id': t['id_str'], 'username': t['screen_name']} for t in json.loads(tw["data-reply-to-users-json"])]
# t.translate = ''
# t.trans_src = ''
# t.trans_dest = ''
# if config.Translate == True:
# try:
# ts = translator.translate(text=t.tweet, dest=config.TranslateDest)
# t.translate = ts.text
# t.trans_src = ts.src
# t.trans_dest = ts.dest
# # ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
# except ValueError as e:
# raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
# logme.debug(__name__+':Tweet:translator.translate:'+str(e))
# return t
def Tweet(tw, config):
"""Create Tweet object
"""
logme.debug(__name__+':Tweet')
logme.debug(__name__ + ':Tweet')
t = tweet()
t.id = int(tw["data-item-id"])
t.id_str = tw["data-item-id"]
t.conversation_id = tw["data-conversation-id"]
t.datetime = int(tw.find("span", "_timestamp")["data-time-ms"])
t.datestamp = strftime("%Y-%m-%d", localtime(t.datetime/1000.0))
t.timestamp = strftime("%H:%M:%S", localtime(t.datetime/1000.0))
t.user_id = int(tw["data-user-id"])
t.user_id_str = tw["data-user-id"]
t.username = tw["data-screen-name"]
t.name = tw["data-name"]
t.place = tw.find("a","js-geo-pivot-link").text.strip() if tw.find("a","js-geo-pivot-link") else ""
t.id = int(tw['id_str'])
t.id_str = tw["id_str"]
t.conversation_id = tw["conversation_id_str"]
# parsing date to user-friendly format
_dt = tw['created_at']
_dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
_dt = utc_to_local(_dt)
t.datetime = str(_dt.strftime('%d-%m-%Y %H:%M:%S %Z'))
# datestamp is of the format day-month-year
t.datestamp = _dt.strftime('%d-%m-%Y')
t.timestamp = _dt.strftime('%H:%M:%S')
t.user_id = int(tw["user_id_str"])
t.user_id_str = tw["user_id_str"]
t.username = tw["user_data"]['screen_name']
t.name = tw["user_data"]['name']
t.place = tw['geo'] if tw['geo'] else ""
t.timezone = strftime("%z", localtime())
for img in tw.findAll("img", "Emoji Emoji--forText"):
img.replaceWith(img["alt"])
t.mentions = getMentions(tw)
t.urls = [link.attrs["data-expanded-url"] for link in tw.find_all('a',{'class':'twitter-timeline-link'}) if link.has_attr("data-expanded-url")]
t.photos = [photo_node.attrs['data-image-url'] for photo_node in tw.find_all("div", "AdaptiveMedia-photoContainer")]
t.video = 1 if tw.find_all("div", "AdaptiveMedia-video") != [] else 0
t.thumbnail = getThumbnail(tw)
t.tweet = getText(tw)
t.lang = tw.find('p', 'tweet-text')['lang']
t.hashtags = [hashtag.text for hashtag in tw.find_all("a","twitter-hashtag")]
t.cashtags = [cashtag.text for cashtag in tw.find_all("a", "twitter-cashtag")]
t.replies_count = getStat(tw, "reply")
t.retweets_count = getStat(tw, "retweet")
t.likes_count = getStat(tw, "favorite")
# for img in tw.findAll("img", "Emoji Emoji--forText"):
# img.replaceWith(img["alt"])
try:
t.mentions = [_mention['screen_name'] for _mention in tw['entities']['user_mentions']]
except KeyError:
t.mentions = []
try:
t.urls = [_url['expanded_url'] for _url in tw['entities']['urls']]
except KeyError:
t.urls = []
try:
t.photos = [_img['media_url_https'] for _img in tw['entities']['media'] if _img['type'] == 'photo' and
_img['expanded_url'].find('/photo/') != -1]
except KeyError:
t.photos = []
try:
t.video = 1 if len(tw['extended_entities']['media']) else 0
except KeyError:
t.video = 0
try:
t.thumbnail = tw['extended_entities']['media'][0]['media_url_https']
except KeyError:
t.thumbnail = ''
t.tweet = tw['full_text']
t.lang = tw['lang']
try:
t.hashtags = [hashtag['text'] for hashtag in tw['entities']['hashtags']]
except KeyError:
t.hashtags = []
# don't know what this is
# t.cashtags = [cashtag.text for cashtag in tw.find_all("a", "twitter-cashtag")]
t.replies_count = tw['reply_count']
t.retweets_count = tw['retweet_count']
t.likes_count = tw['favorite_count']
t.link = f"https://twitter.com/{t.username}/status/{t.id}"
t.user_rt_id, t.user_rt = getRetweet(tw, config)
t.retweet = True if t.user_rt else False
t.retweet_id = ''
t.retweet_date = ''
if not config.Profile:
t.retweet_id = tw['data-retweet-id'] if t.user_rt else ''
t.retweet_date = datetime.fromtimestamp(((int(t.retweet_id) >> 22) + 1288834974657)/1000.0).strftime("%Y-%m-%d %H:%M:%S") if t.user_rt else ''
t.quote_url = getQuoteURL(tw)
# TODO: someone who is familiar with this code needs to take a look at what this is
# t.user_rt_id, t.user_rt = getRetweet(tw, config)
# t.retweet = True if t.user_rt else False
# t.retweet_id = ''
# t.retweet_date = ''
# if not config.Profile:
# t.retweet_id = tw['data-retweet-id'] if t.user_rt else ''
# t.retweet_date = datetime.fromtimestamp(((int(t.retweet_id) >> 22) + 1288834974657) / 1000.0).strftime(
# "%Y-%m-%d %H:%M:%S") if t.user_rt else ''
try:
t.quote_url = tw['quoted_status_permalink']['expanded'] if tw['is_quote_status'] else ''
except KeyError:
# means that the quoted tweet has been deleted
t.quote_url = 0
t.near = config.Near if config.Near else ""
t.geo = config.Geo if config.Geo else ""
t.source = config.Source if config.Source else ""
t.reply_to = [{'user_id': t['id_str'], 'username': t['screen_name']} for t in json.loads(tw["data-reply-to-users-json"])]
# TODO: check whether we need the list of all users this tweet replies to,
# or only the user id of the immediate parent
t.reply_to = {'user_id': tw['in_reply_to_user_id_str'], 'username': tw['in_reply_to_screen_name']}
t.translate = ''
t.trans_src = ''
t.trans_dest = ''
@@ -140,5 +248,5 @@ def Tweet(tw, config):
# ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
except ValueError as e:
raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
logme.debug(__name__+':Tweet:translator.translate:'+str(e))
logme.debug(__name__ + ':Tweet:translator.translate:' + str(e))
return t
import datetime
from sys import platform
import logging as logme
from urllib.parse import urlencode
from urllib.parse import quote
mobile = "https://mobile.twitter.com"
base = "https://twitter.com/i"
# base = "https://twitter.com/i"
base = "https://api.twitter.com/2/search/adaptive.json"
def _sanitizeQuery(base,params):
def _sanitizeQuery(_url, params):
_serialQuery = ""
for p in params:
_serialQuery += p[0]+"="+p[1]+"&"
_serialQuery = base + "?" + _serialQuery[:-1].replace(":", "%3A").replace(" ", "%20")
_serialQuery = urlencode(params, quote_via=quote)
_serialQuery = _url + "?" + _serialQuery
return _serialQuery
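For instance, with hypothetical search parameters:

# the serialized query is only used for logging/debugging the request
print(_sanitizeQuery(base, [('q', 'from:example since:2020-01-01'), ('count', '100')]))
# -> https://api.twitter.com/2/search/adaptive.json?q=from%3Aexample%20since%3A2020-01-01&count=100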
def _formatDate(date):
if "win" in platform:
return f'\"{date.split()[0]}\"'
@@ -20,8 +24,9 @@ def _formatDate(date):
except ValueError:
return int(datetime.datetime.strptime(date, "%Y-%m-%d").timestamp())
async def Favorites(username, init):
logme.debug(__name__+':Favorites')
logme.debug(__name__ + ':Favorites')
url = f"{mobile}/{username}/favorites?lang=en"
if init != '-1':
@@ -29,8 +34,9 @@ async def Favorites(username, init):
return url
async def Followers(username, init):
logme.debug(__name__+':Followers')
logme.debug(__name__ + ':Followers')
url = f"{mobile}/{username}/followers?lang=en"
if init != '-1':
@@ -38,8 +44,9 @@ async def Followers(username, init):
return url
async def Following(username, init):
logme.debug(__name__+':Following')
logme.debug(__name__ + ':Following')
url = f"{mobile}/{username}/following?lang=en"
if init != '-1':
@@ -47,8 +54,9 @@ async def Following(username, init):
return url
async def MobileProfile(username, init):
logme.debug(__name__+':MobileProfile')
logme.debug(__name__ + ':MobileProfile')
url = f"{mobile}/{username}?lang=en"
if init != '-1':
@@ -56,8 +64,9 @@ async def MobileProfile(username, init):
return url
async def Profile(username, init):
logme.debug(__name__+':Profile')
logme.debug(__name__ + ':Profile')
url = f"{base}/profiles/show/{username}/timeline/tweets?include_"
url += "available_features=1&lang=en&include_entities=1"
url += "&include_new_items_bar=true"
@@ -67,17 +76,38 @@ async def Profile(username, init):
return url
async def Search(config, init):
logme.debug(__name__+':Search')
url = f"{base}/search/timeline"
logme.debug(__name__ + ':Search')
url = base
tweet_count = 100
q = ""
params = [
('vertical', 'default'),
('src', 'unkn'),
('include_available_features', '1'),
('include_entities', '1'),
('max_position', str(init)),
('reset_error_state', 'false'),
# ('include_blocking', '1'),
# ('include_blocked_by', '1'),
# ('include_followed_by', '1'),
# ('include_want_retweets', '1'),
# ('include_mute_edge', '1'),
# ('include_can_dm', '1'),
('include_can_media_tag', '1'),
# ('skip_status', '1'),
# ('include_cards', '1'),
('include_ext_alt_text', 'true'),
('include_quote_count', 'true'),
('include_reply_count', '1'),
('tweet_mode', 'extended'),
('include_entities', 'true'),
('include_user_entities', 'true'),
('include_ext_media_availability', 'true'),
('send_error_codes', 'true'),
('simple_quoted_tweet', 'true'),
('count', tweet_count),
# ('query_source', 'typed_query'),
# ('pc', '1'),
('cursor', str(init)),
('spelling_corrections', '1'),
('ext', 'mediaStats%2ChighlightedLabel'),
('tweet_search_mode', 'live'), # this can be handled better, maybe take an argument and set it then
]
if not config.Popular_tweets:
params.append(('f', 'tweets'))
@@ -92,7 +122,8 @@ async def Search(config, init):
config.Geo = config.Geo.replace(" ", "")
q += f" geocode:{config.Geo}"
if config.Search:
q += f" {config.Search}"
q += f"{config.Search}"
if config.Year:
q += f" until:{config.Year}-1-1"
if config.Since:
@@ -120,6 +151,7 @@ async def Search(config, init):
q += " filter:media"
if config.Replies:
q += " filter:replies"
# although this filter can still be used, it was broken in my preliminary testing; needs more testing
if config.Native_retweets:
q += " filter:nativeretweets"
if config.Min_likes:
@@ -144,3 +176,43 @@ async def Search(config, init):
params.append(("q", q))
_serialQuery = _sanitizeQuery(url, params)
return url, params, _serialQuery
# maybe don't need this
async def SearchProfile(config, init=None):
logme.debug(__name__ + ':SearchProfile')
_url = 'https://api.twitter.com/2/timeline/profile/{}.json?'
q = ""
params = [
('include_profile_interstitial_type', '1'),
('include_blocking', '1'),
('include_blocked_by', '1'),
('include_followed_by', '1'),
('include_want_retweets', '1'),
('include_mute_edge', '1'),
('include_can_dm', '1'),
('include_can_media_tag', '1'),
('skip_status', '1'),
('cards_platform', 'Web - 12'),
('include_cards', '1'),
('include_ext_alt_text', 'true'),
('include_quote_count', 'true'),
('include_reply_count', '1'),
('tweet_mode', 'extended'),
('include_entities', 'true'),
('include_user_entities', 'true'),
('include_ext_media_color', 'true'),
('include_ext_media_availability', 'true'),
('send_error_codes', 'true'),
('simple_quoted_tweet', 'true'),
('include_tweet_replies', 'false'),
('count', '50'),
('userId', '1934388686'),
('ext', 'mediaStats,highlightedLabel'),
]
if init:
params.append(('cursor', init))
_serialQuery = _sanitizeQuery(_url, params)
return _url, params, _serialQuery
pass
import datetime
import logging as logme
class user:
class User:
type = "user"
def __init__(self):
pass
def inf(ur, _type):
logme.debug(__name__+':inf')
try:
group = ur.find("div", "profile")
if group == None:
group = ur.find("div", "user-actions btn-group not-following")
if group == None:
group = ur.find("div", "user-actions btn-group not-following protected")
except Exception as e:
print("Error: " + str(e))
if _type == "id":
screen_name = group.find("span", "screen-name").text
ret = ur.find("a", {"data-screenname": screen_name})
ret = ret.get('data-mentioned-user-id') if ret is not None else None
ret = "" if ret is None else ret
elif _type == "name":
ret = group.find("div", "fullname").text.split('\n')[0]
elif _type == "username":
ret = group.find("span", "screen-name").text
elif _type == "private":
ret = group.find("div","protected")
if ret:
ret = 1
else:
ret = 0
return ret
def card(ur, _type):
logme.debug(__name__+':card')
if _type == "bio":
try:
ret = ur.find("div", "bio").text.replace("\n", " ").strip()
except:
ret = ""
elif _type == "location":
try:
ret = ur.find("div", "location").text
except:
ret = ""
elif _type == "url":
try:
ret = ur.find("link")["href"]
except:
ret = ""
return ret
def join(ur):
try:
logme.debug(__name__+':join')
jd = ur.find("span", "ProfileHeaderCard-joinDateText js-tooltip u-dir")["title"]
return jd.split(" - ")
except:
return ["", ""]
def convertToInt(x):
logme.debug(__name__+':convertToInt')
multDict = {
"k" : 1000,
"m" : 1000000,
"b" : 1000000000,
}
try:
if ',' in x:
x = x.replace(',', '')
y = int(x)
return y
except:
pass
try:
y = float(str(x)[:-1])
y = y * multDict[str(x)[-1:].lower()]
return int(y)
except:
pass
return 0
def stat(ur, _type):
logme.debug(__name__+':stat')
stats = ur.find('table', 'profile-stats')
stat_dict = {}
for stat in stats.find_all('td', 'stat'):
statnum, statlabel = stat.text.replace('\n', '').replace(',', '').split(' ')[:2]
stat_dict[statlabel.lower()] = int(statnum.replace(',', ''))
try :
return stat_dict[_type]
except AttributeError:
return 0
def media(ur):
logme.debug(__name__+':media')
try:
media_count = ur.find("a", "PhotoRail-headingWithCount js-nav").text.strip().split(" ")[0]
return convertToInt(media_count)
except:
return 0
def verified(ur):
logme.debug(__name__+':verified')
try:
is_verified = ur.find("img", {"alt": "Verified Account"})['alt']
if "Verified Account" in is_verified:
is_verified = 1
else:
is_verified = 0
except:
is_verified = 0
return is_verified
# the ur object must be the parsed JSON from the https://api.twitter.com/graphql endpoint
def User(ur):
logme.debug(__name__+':User')
u = user()
for img in ur.findAll("img", "Emoji Emoji--forText"):
img.replaceWith(img["alt"])
u.id = inf(ur, "id")
u.name = inf(ur, "name")
u.username = inf(ur, "username")
u.bio = card(ur, "bio")
u.location = card(ur, "location")
u.url = card(ur, "url")
u.join_date = join(ur)[1]
u.join_time = join(ur)[0]
u.tweets = stat(ur, "tweets")
u.following = stat(ur, "following")
u.followers = stat(ur, "followers")
u.likes = "" # stat(ur, "favorites")
u.media_count = "" # media(ur)
u.is_private = inf(ur, "private")
u.is_verified = verified(ur)
u.avatar = ur.find("img", {"alt": u.name})["src"]
#u.background_image = ur.find('div',{'class':'ProfileCanopy-headerBg'}).find('img').get('src')
return u
logme.debug(__name__ + ':User')
if 'data' not in ur or 'user' not in ur['data']:
msg = 'malformed json! cannot be parsed to get user data'
logme.fatal(msg)
raise KeyError(msg)
_usr = User()
_usr.id = ur['data']['user']['rest_id']
_usr.name = ur['data']['user']['legacy']['name']
_usr.username = ur['data']['user']['legacy']['screen_name']
_usr.bio = ur['data']['user']['legacy']['description']
_usr.location = ur['data']['user']['legacy']['location']
_usr.url = ur['data']['user']['legacy']['url']
# parsing date to user-friendly format
_dt = ur['data']['user']['legacy']['created_at']
_dt = datetime.datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
# join_date is of the format day-month-year
_usr.join_date = _dt.strftime('%d-%m-%Y')
_usr.join_time = _dt.strftime('%H:%M:%S %Z')
# :type `int`
_usr.tweets = int(ur['data']['user']['legacy']['statuses_count'])
_usr.following = int(ur['data']['user']['legacy']['friends_count'])
_usr.followers = int(ur['data']['user']['legacy']['followers_count'])
_usr.likes = int(ur['data']['user']['legacy']['favourites_count'])
_usr.media_count = int(ur['data']['user']['legacy']['media_count'])
_usr.is_private = ur['data']['user']['legacy']['protected']
_usr.is_verified = ur['data']['user']['legacy']['verified']
_usr.avatar = ur['data']['user']['legacy']['profile_image_url_https']
_usr.background_image = ur['data']['user']['legacy']['profile_banner_url']
# TODO : future implementation
# legacy_extended_profile is also available in some cases which can be used to get DOB of user
return _usr
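For reference, a trimmed sketch of the GraphQL user payload this parser consumes (placeholder values; only the fields read above are listed):

_sample_ur = {'data': {'user': {
    'rest_id': '783214',
    'legacy': {
        'name': 'Example', 'screen_name': 'example', 'description': '',
        'location': '', 'url': '',
        'created_at': 'Tue Feb 20 14:35:54 +0000 2007',
        'statuses_count': 0, 'friends_count': 0, 'followers_count': 0,
        'favourites_count': 0, 'media_count': 0,
        'protected': False, 'verified': False,
        'profile_image_url_https': '', 'profile_banner_url': '',
    },
}}}
# User(_sample_ur) returns a populated User instance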