Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
T
Twint
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Locked Files
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Security & Compliance
Security & Compliance
Dependency List
License Compliance
Packages
Packages
List
Container Registry
Analytics
Analytics
CI / CD
Code Review
Insights
Issues
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
nanahira
Twint
Commits
2d638de0
Commit
2d638de0
authored
Oct 09, 2020
by
Himanshu Dabas
Committed by
GitHub
Oct 09, 2020
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix for deprecation of v1.1 endpoints (#944)
parent
421a155a
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
712 additions
and
409 deletions
+712
-409
twint/cli.py
twint/cli.py
+3
-0
twint/config.py
twint/config.py
+2
-0
twint/feed.py
twint/feed.py
+59
-7
twint/format.py
twint/format.py
+3
-2
twint/get.py
twint/get.py
+110
-62
twint/output.py
twint/output.py
+80
-68
twint/run.py
twint/run.py
+93
-52
twint/token.py
twint/token.py
+63
-0
twint/tweet.py
twint/tweet.py
+171
-63
twint/url.py
twint/url.py
+91
-19
twint/user.py
twint/user.py
+37
-136
No files found.
twint/cli.py
View file @
2d638de0
...
...
@@ -309,3 +309,6 @@ def run_as_command():
sys
.
exit
(
0
)
main
()
if
__name__
==
'__main__'
:
main
()
twint/config.py
View file @
2d638de0
...
...
@@ -81,3 +81,5 @@ class Config:
TranslateDest
:
str
=
"en"
Backoff_exponent
:
float
=
3.0
Min_wait_time
:
int
=
0
Bearer_token
:
str
=
None
Guest_token
:
str
=
None
twint/feed.py
View file @
2d638de0
...
...
@@ -4,32 +4,39 @@ from json import loads
import
logging
as
logme
class
NoMoreTweetsException
(
Exception
):
def
__init__
(
self
,
msg
):
super
()
.
__init__
(
msg
)
def
Follow
(
response
):
logme
.
debug
(
__name__
+
':Follow'
)
logme
.
debug
(
__name__
+
':Follow'
)
soup
=
BeautifulSoup
(
response
,
"html.parser"
)
follow
=
soup
.
find_all
(
"td"
,
"info fifty screenname"
)
cursor
=
soup
.
find_all
(
"div"
,
"w-button-more"
)
try
:
cursor
=
findall
(
r'cursor=(.*?)">'
,
str
(
cursor
))[
0
]
except
IndexError
:
logme
.
critical
(
__name__
+
':Follow:IndexError'
)
logme
.
critical
(
__name__
+
':Follow:IndexError'
)
return
follow
,
cursor
def
Mobile
(
response
):
logme
.
debug
(
__name__
+
':Mobile'
)
logme
.
debug
(
__name__
+
':Mobile'
)
soup
=
BeautifulSoup
(
response
,
"html.parser"
)
tweets
=
soup
.
find_all
(
"span"
,
"metadata"
)
max_id
=
soup
.
find_all
(
"div"
,
"w-button-more"
)
try
:
max_id
=
findall
(
r'max_id=(.*?)">'
,
str
(
max_id
))[
0
]
except
Exception
as
e
:
logme
.
critical
(
__name__
+
':Mobile:'
+
str
(
e
))
logme
.
critical
(
__name__
+
':Mobile:'
+
str
(
e
))
return
tweets
,
max_id
def
MobileFav
(
response
):
def
MobileFav
(
response
):
soup
=
BeautifulSoup
(
response
,
"html.parser"
)
tweets
=
soup
.
find_all
(
"table"
,
"tweet"
)
max_id
=
soup
.
find_all
(
"div"
,
"w-button-more"
)
...
...
@@ -40,8 +47,9 @@ def MobileFav(response):
return
tweets
,
max_id
def
profile
(
response
):
logme
.
debug
(
__name__
+
':profile'
)
logme
.
debug
(
__name__
+
':profile'
)
json_response
=
loads
(
response
)
html
=
json_response
[
"items_html"
]
soup
=
BeautifulSoup
(
html
,
"html.parser"
)
...
...
@@ -49,10 +57,54 @@ def profile(response):
return
feed
,
feed
[
-
1
][
"data-item-id"
]
def
Json
(
response
):
logme
.
debug
(
__name__
+
':Json'
)
logme
.
debug
(
__name__
+
':Json'
)
json_response
=
loads
(
response
)
html
=
json_response
[
"items_html"
]
soup
=
BeautifulSoup
(
html
,
"html.parser"
)
feed
=
soup
.
find_all
(
"div"
,
"tweet"
)
return
feed
,
json_response
[
"min_position"
]
def
search_v2
(
response
):
# TODO need to implement this
response
=
loads
(
response
)
if
len
(
response
[
'globalObjects'
][
'tweets'
])
==
0
:
msg
=
'No more data. finished scraping!!'
raise
NoMoreTweetsException
(
msg
)
# need to modify things at the function call end
# timeline = response['timeline']['instructions'][0]['addEntries']['entries']
feed
=
[]
feed_set
=
set
()
# here we need to remove the quoted and `to-reply` tweets from the list as they may or may not contain the
# for _id in response['globalObjects']['tweets']:
# if 'quoted_status_id_str' in response['globalObjects']['tweets'][_id] or \
# response['globalObjects']['tweets'][_id]['in_reply_to_status_id_str']:
# try:
# feed_set.add(response['globalObjects']['tweets'][_id]['quoted_status_id_str'])
# except KeyError:
# feed_set.add(response['globalObjects']['tweets'][_id]['in_reply_to_status_id_str'])
# i = 1
# for _id in response['globalObjects']['tweets']:
# if _id not in feed_set:
# temp_obj = response['globalObjects']['tweets'][_id]
# temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']]
# feed.append(temp_obj)
for
timeline_entry
in
response
[
'timeline'
][
'instructions'
][
0
][
'addEntries'
][
'entries'
]:
# this will handle the cases when the timeline entry is a tweet
if
timeline_entry
[
'entryId'
]
.
find
(
'sq-I-t-'
)
==
0
:
_id
=
timeline_entry
[
'content'
][
'item'
][
'content'
][
'tweet'
][
'id'
]
temp_obj
=
response
[
'globalObjects'
][
'tweets'
][
_id
]
temp_obj
[
'user_data'
]
=
response
[
'globalObjects'
][
'users'
][
temp_obj
[
'user_id_str'
]]
feed
.
append
(
temp_obj
)
try
:
next_cursor
=
response
[
'timeline'
][
'instructions'
][
0
][
'addEntries'
][
'entries'
][
-
1
][
'content'
][
'operation'
][
'cursor'
][
'value'
]
except
KeyError
:
# this is needed because after the first request location of cursor is changed
next_cursor
=
response
[
'timeline'
][
'instructions'
][
-
1
][
'replaceEntry'
][
'entry'
][
'content'
][
'operation'
][
'cursor'
][
'value'
]
return
feed
,
next_cursor
twint/format.py
View file @
2d638de0
...
...
@@ -37,8 +37,9 @@ def Tweet(config, t):
logme
.
debug
(
__name__
+
':Tweet:notFormat'
)
output
=
f
"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} "
if
t
.
retweet
:
output
+=
"RT "
# TODO: someone who is familiar with this code, needs to take a look at what this is <also see tweet.py>
# if t.retweet:
# output += "RT "
output
+=
f
"<{t.username}> {t.tweet}"
...
...
twint/get.py
View file @
2d638de0
...
...
@@ -8,28 +8,40 @@ from fake_useragent import UserAgent
import
asyncio
import
concurrent.futures
import
random
from
json
import
loads
from
json
import
loads
,
dumps
from
aiohttp_socks
import
ProxyConnector
,
ProxyType
from
urllib.parse
import
quote
from
.
import
url
from
.output
import
Tweets
,
Users
from
.
user
import
inf
from
.
token
import
TokenExpiryException
import
logging
as
logme
httpproxy
=
None
user_agent_list
=
[
#'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
#'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
#'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
#'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
#'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
#'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
#'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
#'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
#'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
#'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/60.0.3112.113 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/60.0.3112.90 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/60.0.3112.90 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/60.0.3112.90 Safari/537.36',
# 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/44.0.2403.157 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/60.0.3112.113 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/57.0.2987.133 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/57.0.2987.133 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/55.0.2883.87 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
# ' Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)'
,
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
,
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'
,
...
...
@@ -42,11 +54,19 @@ user_agent_list = [
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko'
,
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
,
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
,
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET '
'CLR 3.5.30729)'
,
]
# function to convert python `dict` to json and then encode it to be passed in the url as a parameter
# some urls require this format
def
dict_to_url
(
dct
):
return
quote
(
dumps
(
dct
))
def
get_connector
(
config
):
logme
.
debug
(
__name__
+
':get_connector'
)
logme
.
debug
(
__name__
+
':get_connector'
)
_connector
=
None
if
config
.
Proxy_host
:
if
config
.
Proxy_host
.
lower
()
==
"tor"
:
...
...
@@ -73,82 +93,92 @@ def get_connector(config):
port
=
config
.
Proxy_port
,
rdns
=
True
)
else
:
logme
.
critical
(
__name__
+
':get_connector:proxy-port-type-error'
)
logme
.
critical
(
__name__
+
':get_connector:proxy-port-type-error'
)
print
(
"Error: Please specify --proxy-host, --proxy-port, and --proxy-type"
)
sys
.
exit
(
1
)
else
:
if
config
.
Proxy_port
or
config
.
Proxy_type
:
logme
.
critical
(
__name__
+
':get_connector:proxy-host-arg-error'
)
logme
.
critical
(
__name__
+
':get_connector:proxy-host-arg-error'
)
print
(
"Error: Please specify --proxy-host, --proxy-port, and --proxy-type"
)
sys
.
exit
(
1
)
return
_connector
async
def
RequestUrl
(
config
,
init
,
headers
=
[]):
logme
.
debug
(
__name__
+
':RequestUrl'
)
async
def
RequestUrl
(
config
,
init
,
headers
=
[]):
logme
.
debug
(
__name__
+
':RequestUrl'
)
_connector
=
get_connector
(
config
)
_serialQuery
=
""
params
=
[]
_url
=
""
_headers
=
{}
# TODO : do this later
if
config
.
Profile
:
if
config
.
Profile_full
:
logme
.
debug
(
__name__
+
':RequestUrl:Profile_full'
)
logme
.
debug
(
__name__
+
':RequestUrl:Profile_full'
)
_url
=
await
url
.
MobileProfile
(
config
.
Username
,
init
)
else
:
logme
.
debug
(
__name__
+
':RequestUrl:notProfile_full'
)
logme
.
debug
(
__name__
+
':RequestUrl:notProfile_full'
)
_url
=
await
url
.
Profile
(
config
.
Username
,
init
)
_serialQuery
=
_url
elif
config
.
TwitterSearch
:
logme
.
debug
(
__name__
+
':RequestUrl:TwitterSearch'
)
logme
.
debug
(
__name__
+
':RequestUrl:TwitterSearch'
)
_url
,
params
,
_serialQuery
=
await
url
.
Search
(
config
,
init
)
_headers
=
[(
"authorization"
,
config
.
Bearer_token
),
(
"x-guest-token"
,
config
.
Guest_token
)]
else
:
if
config
.
Following
:
logme
.
debug
(
__name__
+
':RequestUrl:Following'
)
logme
.
debug
(
__name__
+
':RequestUrl:Following'
)
_url
=
await
url
.
Following
(
config
.
Username
,
init
)
elif
config
.
Followers
:
logme
.
debug
(
__name__
+
':RequestUrl:Followers'
)
logme
.
debug
(
__name__
+
':RequestUrl:Followers'
)
_url
=
await
url
.
Followers
(
config
.
Username
,
init
)
else
:
logme
.
debug
(
__name__
+
':RequestUrl:Favorites'
)
logme
.
debug
(
__name__
+
':RequestUrl:Favorites'
)
_url
=
await
url
.
Favorites
(
config
.
Username
,
init
)
_serialQuery
=
_url
response
=
await
Request
(
_url
,
params
=
params
,
connector
=
_connector
,
headers
=
headers
)
response
=
await
Request
(
_url
,
params
=
params
,
connector
=
_connector
,
headers
=
_
headers
)
if
config
.
Debug
:
print
(
_serialQuery
,
file
=
open
(
"twint-request_urls.log"
,
"a"
,
encoding
=
"utf-8"
))
return
response
def
ForceNewTorIdentity
(
config
):
logme
.
debug
(
__name__
+
':ForceNewTorIdentity'
)
logme
.
debug
(
__name__
+
':ForceNewTorIdentity'
)
try
:
tor_c
=
socket
.
create_connection
((
'127.0.0.1'
,
config
.
Tor_control_port
))
tor_c
.
send
(
'AUTHENTICATE "{}"
\r\n
SIGNAL NEWNYM
\r\n
'
.
format
(
config
.
Tor_control_password
)
.
encode
())
response
=
tor_c
.
recv
(
1024
)
if
response
!=
b
'250 OK
\r\n
250 OK
\r\n
'
:
sys
.
stderr
.
write
(
'Unexpected response from Tor control port: {}
\n
'
.
format
(
response
))
logme
.
critical
(
__name__
+
':ForceNewTorIdentity:unexpectedResponse'
)
logme
.
critical
(
__name__
+
':ForceNewTorIdentity:unexpectedResponse'
)
except
Exception
as
e
:
logme
.
debug
(
__name__
+
':ForceNewTorIdentity:errorConnectingTor'
)
logme
.
debug
(
__name__
+
':ForceNewTorIdentity:errorConnectingTor'
)
sys
.
stderr
.
write
(
'Error connecting to Tor control port: {}
\n
'
.
format
(
repr
(
e
)))
sys
.
stderr
.
write
(
'If you want to rotate Tor ports automatically - enable Tor control port
\n
'
)
async
def
Request
(
url
,
connector
=
None
,
params
=
[],
headers
=
[]):
logme
.
debug
(
__name__
+
':Request:Connector'
)
async
def
Request
(
_url
,
connector
=
None
,
params
=
None
,
headers
=
None
):
logme
.
debug
(
__name__
+
':Request:Connector'
)
async
with
aiohttp
.
ClientSession
(
connector
=
connector
,
headers
=
headers
)
as
session
:
return
await
Response
(
session
,
url
,
params
)
return
await
Response
(
session
,
_url
,
params
)
async
def
Response
(
session
,
url
,
params
=
[]
):
logme
.
debug
(
__name__
+
':Response'
)
async
def
Response
(
session
,
_url
,
params
=
None
):
logme
.
debug
(
__name__
+
':Response'
)
with
timeout
(
120
):
async
with
session
.
get
(
url
,
ssl
=
True
,
params
=
params
,
proxy
=
httpproxy
)
as
response
:
return
await
response
.
text
()
async
with
session
.
get
(
_url
,
ssl
=
True
,
params
=
params
,
proxy
=
httpproxy
)
as
response
:
resp
=
await
response
.
text
()
if
response
.
status
==
429
:
# 429 implies Too many requests i.e. Rate Limit Exceeded
raise
TokenExpiryException
(
loads
(
resp
)[
'errors'
][
0
][
'message'
])
return
resp
async
def
RandomUserAgent
(
wa
=
None
):
logme
.
debug
(
__name__
+
':RandomUserAgent'
)
logme
.
debug
(
__name__
+
':RandomUserAgent'
)
try
:
if
wa
:
return
"Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
...
...
@@ -156,43 +186,61 @@ async def RandomUserAgent(wa=None):
except
:
return
random
.
choice
(
user_agent_list
)
async
def
Username
(
_id
):
logme
.
debug
(
__name__
+
':Username'
)
url
=
f
"https://twitter.com/intent/user?user_id={_id}&lang=en"
r
=
await
Request
(
url
,
headers
=
{
"X-Requested-With"
:
"XMLHttpRequest"
})
soup
=
BeautifulSoup
(
r
,
"html.parser"
)
return
soup
.
find
(
"a"
,
"fn url alternate-context"
)[
"href"
]
.
replace
(
"/"
,
""
)
async
def
Username
(
_id
,
bearer_token
,
guest_token
):
logme
.
debug
(
__name__
+
':Username'
)
_dct
=
{
'userId'
:
_id
,
'withHighlightedLabel'
:
False
}
_url
=
"https://api.twitter.com/graphql/B9FuNQVmyx32rdbIPEZKag/UserByRestId?variables={}"
.
format
(
dict_to_url
(
_dct
))
_headers
=
{
'authorization'
:
bearer_token
,
'x-guest-token'
:
guest_token
,
}
r
=
await
Request
(
_url
,
headers
=
_headers
)
j_r
=
loads
(
r
)
username
=
j_r
[
'data'
][
'user'
][
'legacy'
][
'screen_name'
]
return
username
async
def
Tweet
(
url
,
config
,
conn
):
logme
.
debug
(
__name__
+
':Tweet'
)
logme
.
debug
(
__name__
+
':Tweet'
)
try
:
response
=
await
Request
(
url
)
soup
=
BeautifulSoup
(
response
,
"html.parser"
)
tweets
=
soup
.
find_all
(
"div"
,
"tweet"
)
await
Tweets
(
tweets
,
config
,
conn
,
url
)
except
Exception
as
e
:
logme
.
critical
(
__name__
+
':Tweet:'
+
str
(
e
))
logme
.
critical
(
__name__
+
':Tweet:'
+
str
(
e
))
async
def
User
(
url
,
config
,
conn
,
user_id
=
False
):
logme
.
debug
(
__name__
+
':User'
)
_connector
=
get_connector
(
config
)
async
def
User
(
username
,
config
,
conn
,
bearer_token
,
guest_token
,
user_id
=
False
):
logme
.
debug
(
__name__
+
':User'
)
_dct
=
{
'screen_name'
:
username
,
'withHighlightedLabel'
:
False
}
_url
=
'https://api.twitter.com/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables={}'
\
.
format
(
dict_to_url
(
_dct
))
_headers
=
{
'authorization'
:
bearer_token
,
'x-guest-token'
:
guest_token
,
}
try
:
response
=
await
Request
(
url
,
connector
=
_connector
,
headers
=
{
"X-Requested-With"
:
"XMLHttpRequest"
}
)
soup
=
BeautifulSoup
(
response
,
"html.parser"
)
response
=
await
Request
(
_url
,
headers
=
_headers
)
j_r
=
loads
(
response
)
if
user_id
:
return
int
(
inf
(
soup
,
"id"
))
await
Users
(
soup
,
config
,
conn
)
_id
=
j_r
[
'data'
][
'user'
][
'rest_id'
]
return
_id
await
Users
(
j_r
,
config
,
conn
)
except
Exception
as
e
:
logme
.
critical
(
__name__
+
':User:'
+
str
(
e
))
logme
.
critical
(
__name__
+
':User:'
+
str
(
e
))
raise
def
Limit
(
Limit
,
count
):
logme
.
debug
(
__name__
+
':Limit'
)
logme
.
debug
(
__name__
+
':Limit'
)
if
Limit
is
not
None
and
count
>=
int
(
Limit
):
return
True
async
def
Multi
(
feed
,
config
,
conn
):
logme
.
debug
(
__name__
+
':Multi'
)
logme
.
debug
(
__name__
+
':Multi'
)
count
=
0
try
:
with
concurrent
.
futures
.
ThreadPoolExecutor
(
max_workers
=
20
)
as
executor
:
...
...
@@ -201,27 +249,27 @@ async def Multi(feed, config, conn):
for
tweet
in
feed
:
count
+=
1
if
config
.
Favorites
or
config
.
Profile_full
:
logme
.
debug
(
__name__
+
':Multi:Favorites-profileFull'
)
logme
.
debug
(
__name__
+
':Multi:Favorites-profileFull'
)
link
=
tweet
.
find
(
"a"
)[
"href"
]
url
=
f
"https://twitter.com{link}&lang=en"
elif
config
.
User_full
:
logme
.
debug
(
__name__
+
':Multi:userFull'
)
logme
.
debug
(
__name__
+
':Multi:userFull'
)
username
=
tweet
.
find
(
"a"
)[
"name"
]
url
=
f
"http://twitter.com/{username}?lang=en"
else
:
logme
.
debug
(
__name__
+
':Multi:else-url'
)
logme
.
debug
(
__name__
+
':Multi:else-url'
)
link
=
tweet
.
find
(
"a"
,
"tweet-timestamp js-permalink js-nav js-tooltip"
)[
"href"
]
url
=
f
"https://twitter.com{link}?lang=en"
if
config
.
User_full
:
logme
.
debug
(
__name__
+
':Multi:user-full-Run'
)
logme
.
debug
(
__name__
+
':Multi:user-full-Run'
)
futures
.
append
(
loop
.
run_in_executor
(
executor
,
await
User
(
url
,
config
,
conn
)))
config
,
conn
)))
else
:
logme
.
debug
(
__name__
+
':Multi:notUser-full-Run'
)
logme
.
debug
(
__name__
+
':Multi:notUser-full-Run'
)
futures
.
append
(
loop
.
run_in_executor
(
executor
,
await
Tweet
(
url
,
config
,
conn
)))
logme
.
debug
(
__name__
+
':Multi:asyncioGather'
)
config
,
conn
)))
logme
.
debug
(
__name__
+
':Multi:asyncioGather'
)
await
asyncio
.
gather
(
*
futures
)
except
Exception
as
e
:
# TODO: fix error not error
...
...
twint/output.py
View file @
2d638de0
...
...
@@ -17,19 +17,22 @@ author_list.pop()
# used by Pandas
_follows_object
=
{}
def
_formatDateTime
(
datetimestamp
):
try
:
return
int
(
datetime
.
strptime
(
datetimestamp
,
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
.
timestamp
())
except
ValueError
:
return
int
(
datetime
.
strptime
(
datetimestamp
,
"
%
Y-
%
m-
%
d"
)
.
timestamp
())
def
_clean_follow_list
():
logme
.
debug
(
__name__
+
':clean_follow_list'
)
logme
.
debug
(
__name__
+
':clean_follow_list'
)
global
_follows_object
_follows_object
=
{}
def
clean_lists
():
logme
.
debug
(
__name__
+
':clean_lists'
)
logme
.
debug
(
__name__
+
':clean_lists'
)
global
follows_list
global
tweets_list
global
users_list
...
...
@@ -37,55 +40,61 @@ def clean_lists():
tweets_list
=
[]
users_list
=
[]
def
datecheck
(
datetimestamp
,
config
):
logme
.
debug
(
__name__
+
':datecheck'
)
logme
.
debug
(
__name__
+
':datecheck'
)
if
config
.
Since
:
logme
.
debug
(
__name__
+
':datecheck:SinceTrue'
)
logme
.
debug
(
__name__
+
':datecheck:SinceTrue'
)
d
=
_formatDateTime
(
datetimestamp
)
s
=
_formatDateTime
(
config
.
Since
)
if
d
<
s
:
return
False
return
False
if
config
.
Until
:
logme
.
debug
(
__name__
+
':datecheck:UntilTrue'
)
logme
.
debug
(
__name__
+
':datecheck:UntilTrue'
)
d
=
_formatDateTime
(
datetimestamp
)
s
=
_formatDateTime
(
config
.
Until
)
if
d
>
s
:
return
False
logme
.
debug
(
__name__
+
':datecheck:dateRangeFalse'
)
return
False
logme
.
debug
(
__name__
+
':datecheck:dateRangeFalse'
)
return
True
# TODO In this method we need to delete the quoted tweets, because twitter also sends the quoted tweets in the
# `tweets` list along with the other tweets
def
is_tweet
(
tw
):
try
:
tw
[
"data-item-id"
]
logme
.
debug
(
__name__
+
':is_tweet:True'
)
logme
.
debug
(
__name__
+
':is_tweet:True'
)
return
True
except
:
logme
.
critical
(
__name__
+
':is_tweet:False'
)
logme
.
critical
(
__name__
+
':is_tweet:False'
)
return
False
def
_output
(
obj
,
output
,
config
,
**
extra
):
logme
.
debug
(
__name__
+
':_output'
)
logme
.
debug
(
__name__
+
':_output'
)
if
config
.
Lowercase
:
if
isinstance
(
obj
,
str
):
logme
.
debug
(
__name__
+
':_output:Lowercase:username'
)
logme
.
debug
(
__name__
+
':_output:Lowercase:username'
)
obj
=
obj
.
lower
()
elif
obj
.
__class__
.
__name__
==
"user"
:
logme
.
debug
(
__name__
+
':_output:Lowercase:user'
)
logme
.
debug
(
__name__
+
':_output:Lowercase:user'
)
pass
elif
obj
.
__class__
.
__name__
==
"tweet"
:
logme
.
debug
(
__name__
+
':_output:Lowercase:tweet'
)
logme
.
debug
(
__name__
+
':_output:Lowercase:tweet'
)
obj
.
username
=
obj
.
username
.
lower
()
author_list
.
update
({
obj
.
username
})
for
i
in
range
(
len
(
obj
.
mentions
)):
obj
.
mentions
[
i
]
=
obj
.
mentions
[
i
]
.
lower
()
for
i
in
range
(
len
(
obj
.
hashtags
)):
obj
.
hashtags
[
i
]
=
obj
.
hashtags
[
i
]
.
lower
()
for
i
in
range
(
len
(
obj
.
cashtags
)):
obj
.
cashtags
[
i
]
=
obj
.
cashtags
[
i
]
.
lower
()
# TODO : dont know what cashtags are, <also modify in tweet.py>
# for i in range(len(obj.cashtags)):
# obj.cashtags[i] = obj.cashtags[i].lower()
else
:
logme
.
info
(
'_output:Lowercase:hiddenTweetFound'
)
print
(
"[x] Hidden tweet found, account suspended due to violation of TOS"
)
...
...
@@ -94,93 +103,95 @@ def _output(obj, output, config, **extra):
if
config
.
Store_csv
:
try
:
write
.
Csv
(
obj
,
config
)
logme
.
debug
(
__name__
+
':_output:CSV'
)
logme
.
debug
(
__name__
+
':_output:CSV'
)
except
Exception
as
e
:
logme
.
critical
(
__name__
+
':_output:CSV:Error:'
+
str
(
e
))
logme
.
critical
(
__name__
+
':_output:CSV:Error:'
+
str
(
e
))
print
(
str
(
e
)
+
" [x] output._output"
)
elif
config
.
Store_json
:
write
.
Json
(
obj
,
config
)
logme
.
debug
(
__name__
+
':_output:JSON'
)
logme
.
debug
(
__name__
+
':_output:JSON'
)
else
:
write
.
Text
(
output
,
config
.
Output
)
logme
.
debug
(
__name__
+
':_output:Text'
)
logme
.
debug
(
__name__
+
':_output:Text'
)
if
config
.
Elasticsearch
:
logme
.
debug
(
__name__
+
':_output:Elasticsearch'
)
logme
.
debug
(
__name__
+
':_output:Elasticsearch'
)
print
(
""
,
end
=
"."
,
flush
=
True
)
else
:
if
not
config
.
Hide_output
:
try
:
print
(
output
.
replace
(
'
\n
'
,
' '
))
except
UnicodeEncodeError
:
logme
.
critical
(
__name__
+
':_output:UnicodeEncodeError'
)
logme
.
critical
(
__name__
+
':_output:UnicodeEncodeError'
)
print
(
"unicode error [x] output._output"
)
async
def
checkData
(
tweet
,
config
,
conn
):
logme
.
debug
(
__name__
+
':checkData'
)
copyright
=
tweet
.
find
(
"div"
,
"StreamItemContent--withheld"
)
if
copyright
is
None
and
is_tweet
(
tweet
):
tweet
=
Tweet
(
tweet
,
config
)
logme
.
debug
(
__name__
+
':checkData'
)
if
not
tweet
.
datestamp
:
logme
.
critical
(
__name__
+
':checkData:hiddenTweetFound'
)
print
(
"[x] Hidden tweet found, account suspended due to violation of TOS"
)
return
tweet
=
Tweet
(
tweet
,
config
)
if
datecheck
(
tweet
.
datestamp
+
" "
+
tweet
.
timestamp
,
config
):
output
=
format
.
Tweet
(
config
,
tweet
)
if
not
tweet
.
datestamp
:
logme
.
critical
(
__name__
+
':checkData:hiddenTweetFound'
)
print
(
"[x] Hidden tweet found, account suspended due to violation of TOS"
)
return
if
config
.
Database
:
logme
.
debug
(
__name__
+
':checkData:Database'
)
db
.
tweets
(
conn
,
tweet
,
config
)
if
datecheck
(
tweet
.
datestamp
+
" "
+
tweet
.
timestamp
,
config
):
output
=
format
.
Tweet
(
config
,
tweet
)
if
config
.
Pandas
:
logme
.
debug
(
__name__
+
':checkData:Pandas
'
)
panda
.
update
(
tweet
,
config
)
if
config
.
Database
:
logme
.
debug
(
__name__
+
':checkData:Database
'
)
db
.
tweets
(
conn
,
tweet
,
config
)
if
config
.
Store_object
:
logme
.
debug
(
__name__
+
':checkData:Store_object'
)
if
hasattr
(
config
.
Store_object_tweets_list
,
'append'
):
config
.
Store_object_tweets_list
.
append
(
tweet
)
else
:
tweets_list
.
append
(
tweet
)
if
config
.
Pandas
:
logme
.
debug
(
__name__
+
':checkData:Pandas'
)
panda
.
update
(
tweet
,
config
)
if
config
.
Elasticsearch
:
logme
.
debug
(
__name__
+
':checkData:Elasticsearch'
)
elasticsearch
.
Tweet
(
tweet
,
config
)
if
config
.
Store_object
:
logme
.
debug
(
__name__
+
':checkData:Store_object'
)
if
hasattr
(
config
.
Store_object_tweets_list
,
'append'
):
config
.
Store_object_tweets_list
.
append
(
tweet
)
else
:
tweets_list
.
append
(
tweet
)
if
config
.
Elasticsearch
:
logme
.
debug
(
__name__
+
':checkData:Elasticsearch'
)
elasticsearch
.
Tweet
(
tweet
,
config
)
_output
(
tweet
,
output
,
config
)
# else:
# logme.critical(__name__+':checkData:copyrightedTweet')
_output
(
tweet
,
output
,
config
)
else
:
logme
.
critical
(
__name__
+
':checkData:copyrightedTweet'
)
async
def
Tweets
(
tweets
,
config
,
conn
,
url
=
''
):
logme
.
debug
(
__name__
+
':Tweets'
)
logme
.
debug
(
__name__
+
':Tweets'
)
if
config
.
Favorites
or
config
.
Profile_full
or
config
.
Location
:
logme
.
debug
(
__name__
+
':Tweets:fav+full+loc'
)
logme
.
debug
(
__name__
+
':Tweets:fav+full+loc'
)
for
tw
in
tweets
:
if
tw
[
'data-item-id'
]
==
url
.
split
(
'?'
)[
0
]
.
split
(
'/'
)[
-
1
]:
await
checkData
(
tw
,
config
,
conn
)
elif
config
.
TwitterSearch
:
logme
.
debug
(
__name__
+
':Tweets:TwitterSearch'
)
logme
.
debug
(
__name__
+
':Tweets:TwitterSearch'
)
await
checkData
(
tweets
,
config
,
conn
)
else
:
logme
.
debug
(
__name__
+
':Tweets:else'
)
logme
.
debug
(
__name__
+
':Tweets:else'
)
if
int
(
tweets
[
"data-user-id"
])
==
config
.
User_id
or
config
.
Retweets
:
await
checkData
(
tweets
,
config
,
conn
)
async
def
Users
(
u
,
config
,
conn
):
logme
.
debug
(
__name__
+
':User'
)
logme
.
debug
(
__name__
+
':User'
)
global
users_list
user
=
User
(
u
)
output
=
format
.
User
(
config
.
Format
,
user
)
if
config
.
Database
:
logme
.
debug
(
__name__
+
':User:Database'
)
logme
.
debug
(
__name__
+
':User:Database'
)
db
.
user
(
conn
,
config
,
user
)
if
config
.
Elasticsearch
:
logme
.
debug
(
__name__
+
':User:Elasticsearch'
)
logme
.
debug
(
__name__
+
':User:Elasticsearch'
)
_save_date
=
user
.
join_date
_save_time
=
user
.
join_time
user
.
join_date
=
str
(
datetime
.
strptime
(
user
.
join_date
,
"
%
d
%
b
%
Y"
))
.
split
()[
0
]
...
...
@@ -190,49 +201,50 @@ async def Users(u, config, conn):
user
.
join_time
=
_save_time
if
config
.
Store_object
:
logme
.
debug
(
__name__
+
':User:Store_object'
)
logme
.
debug
(
__name__
+
':User:Store_object'
)
if
hasattr
(
config
.
Store_object_follow_list
,
'append'
):
config
.
Store_object_follow_list
.
append
(
user
)
elif
hasattr
(
config
.
Store_object_users_list
,
'append'
):
config
.
Store_object_users_list
.
append
(
user
)
else
:
users_list
.
append
(
user
)
# twint.user.user
users_list
.
append
(
user
)
# twint.user.user
if
config
.
Pandas
:
logme
.
debug
(
__name__
+
':User:Pandas+user'
)
logme
.
debug
(
__name__
+
':User:Pandas+user'
)
panda
.
update
(
user
,
config
)
_output
(
user
,
output
,
config
)
async
def
Username
(
username
,
config
,
conn
):
logme
.
debug
(
__name__
+
':Username'
)
logme
.
debug
(
__name__
+
':Username'
)
global
_follows_object
global
follows_list
follow_var
=
config
.
Following
*
"following"
+
config
.
Followers
*
"followers"
follow_var
=
config
.
Following
*
"following"
+
config
.
Followers
*
"followers"
if
config
.
Database
:
logme
.
debug
(
__name__
+
':Username:Database'
)
logme
.
debug
(
__name__
+
':Username:Database'
)
db
.
follow
(
conn
,
config
.
Username
,
config
.
Followers
,
username
)
if
config
.
Elasticsearch
:
logme
.
debug
(
__name__
+
':Username:Elasticsearch'
)
logme
.
debug
(
__name__
+
':Username:Elasticsearch'
)
elasticsearch
.
Follow
(
username
,
config
)
if
config
.
Store_object
:
if
hasattr
(
config
.
Store_object_follow_list
,
'append'
):
config
.
Store_object_follow_list
.
append
(
username
)
else
:
follows_list
.
append
(
username
)
# twint.user.user
follows_list
.
append
(
username
)
# twint.user.user
if
config
.
Pandas
:
logme
.
debug
(
__name__
+
':Username:object+pandas'
)
logme
.
debug
(
__name__
+
':Username:object+pandas'
)
try
:
_
=
_follows_object
[
config
.
Username
][
follow_var
]
except
KeyError
:
_follows_object
.
update
({
config
.
Username
:
{
follow_var
:
[]}})
_follows_object
[
config
.
Username
][
follow_var
]
.
append
(
username
)
if
config
.
Pandas_au
:
logme
.
debug
(
__name__
+
':Username:object+pandas+au'
)
logme
.
debug
(
__name__
+
':Username:object+pandas+au'
)
panda
.
update
(
_follows_object
[
config
.
Username
],
config
)
_output
(
username
,
username
,
config
)
twint/run.py
View file @
2d638de0
import
sys
,
os
,
time
,
datetime
import
sys
,
os
,
datetime
from
asyncio
import
get_event_loop
,
TimeoutError
,
ensure_future
,
new_event_loop
,
set_event_loop
from
.
import
datelock
,
feed
,
get
,
output
,
verbose
,
storage
from
.token
import
TokenExpiryException
from
.
import
token
from
.storage
import
db
from
.feed
import
NoMoreTweetsException
import
logging
as
logme
import
time
bearer
=
'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs'
\
'
%3
D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
class
Twint
:
def
__init__
(
self
,
config
):
logme
.
debug
(
__name__
+
':Twint:__init__'
)
logme
.
debug
(
__name__
+
':Twint:__init__'
)
if
config
.
Resume
is
not
None
and
(
config
.
TwitterSearch
or
config
.
Followers
or
config
.
Following
):
logme
.
debug
(
__name__
+
':Twint:__init__:Resume'
)
logme
.
debug
(
__name__
+
':Twint:__init__:Resume'
)
self
.
init
=
self
.
get_resume
(
config
.
Resume
)
else
:
self
.
init
=
'-1'
...
...
@@ -21,16 +28,21 @@ class Twint:
self
.
count
=
0
self
.
user_agent
=
""
self
.
config
=
config
self
.
config
.
Bearer_token
=
bearer
# TODO might have to make some adjustments for it to work with multi-treading
# USAGE : to get a new guest token simply do `self.token.refresh()`
self
.
token
=
token
.
Token
(
config
)
self
.
token
.
refresh
()
self
.
conn
=
db
.
Conn
(
config
.
Database
)
self
.
d
=
datelock
.
Set
(
self
.
config
.
Until
,
self
.
config
.
Since
)
verbose
.
Elastic
(
config
.
Elasticsearch
)
if
self
.
config
.
Store_object
:
logme
.
debug
(
__name__
+
':Twint:__init__:clean_follow_list'
)
logme
.
debug
(
__name__
+
':Twint:__init__:clean_follow_list'
)
output
.
_clean_follow_list
()
if
self
.
config
.
Pandas_clean
:
logme
.
debug
(
__name__
+
':Twint:__init__:pandas_clean'
)
logme
.
debug
(
__name__
+
':Twint:__init__:pandas_clean'
)
storage
.
panda
.
clean
()
def
get_resume
(
self
,
resumeFile
):
...
...
@@ -41,10 +53,17 @@ class Twint:
return
_init
async
def
Feed
(
self
):
logme
.
debug
(
__name__
+
':Twint:Feed'
)
logme
.
debug
(
__name__
+
':Twint:Feed'
)
consecutive_errors_count
=
0
while
True
:
response
=
await
get
.
RequestUrl
(
self
.
config
,
self
.
init
,
headers
=
[(
"User-Agent"
,
self
.
user_agent
)])
# this will receive a JSON string, parse it into a `dict` and do the required stuff
try
:
response
=
await
get
.
RequestUrl
(
self
.
config
,
self
.
init
,
headers
=
[(
"User-Agent"
,
self
.
user_agent
)])
except
TokenExpiryException
as
e
:
logme
.
debug
(
__name__
+
'Twint:Feed:'
+
str
(
e
))
self
.
token
.
refresh
()
response
=
await
get
.
RequestUrl
(
self
.
config
,
self
.
init
,
headers
=
[(
"User-Agent"
,
self
.
user_agent
)])
if
self
.
config
.
Debug
:
print
(
response
,
file
=
open
(
"twint-last-request.log"
,
"w"
,
encoding
=
"utf-8"
))
...
...
@@ -75,29 +94,36 @@ class Twint:
else
:
self
.
feed
,
self
.
init
=
feed
.
profile
(
response
)
elif
self
.
config
.
TwitterSearch
:
self
.
feed
,
self
.
init
=
feed
.
Json
(
response
)
try
:
self
.
feed
,
self
.
init
=
feed
.
search_v2
(
response
)
except
NoMoreTweetsException
as
e
:
logme
.
debug
(
__name__
+
':Twint:Feed:'
+
str
(
e
))
print
(
e
,
'is it though? because sometimes twitter lie.'
)
break
except
TimeoutError
as
e
:
if
self
.
config
.
Proxy_host
.
lower
()
==
"tor"
:
print
(
"[?] Timed out, changing Tor identity..."
)
if
self
.
config
.
Tor_control_password
is
None
:
logme
.
critical
(
__name__
+
':Twint:Feed:tor-password'
)
logme
.
critical
(
__name__
+
':Twint:Feed:tor-password'
)
sys
.
stderr
.
write
(
"Error: config.Tor_control_password must be set for proxy autorotation!
\r\n
"
)
sys
.
stderr
.
write
(
"Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors-controller-interface-directly
\r\n
"
)
sys
.
stderr
.
write
(
"Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors-controller-interface-directly
\r\n
"
)
break
else
:
get
.
ForceNewTorIdentity
(
self
.
config
)
continue
else
:
logme
.
critical
(
__name__
+
':Twint:Feed:'
+
str
(
e
))
logme
.
critical
(
__name__
+
':Twint:Feed:'
+
str
(
e
))
print
(
str
(
e
))
break
except
Exception
as
e
:
if
self
.
config
.
Profile
or
self
.
config
.
Favorites
:
print
(
"[!] Twitter does not return more data, scrape stops here."
)
break
logme
.
critical
(
__name__
+
':Twint:Feed:noData'
+
str
(
e
))
logme
.
critical
(
__name__
+
':Twint:Feed:noData'
+
str
(
e
))
# Sometimes Twitter says there is no data. But it's a lie.
# raise
consecutive_errors_count
+=
1
if
consecutive_errors_count
<
self
.
config
.
Retries_count
:
# skip to the next iteration if wait time does not satisfy limit constraints
...
...
@@ -111,9 +137,10 @@ class Twint:
time
.
sleep
(
delay
)
self
.
user_agent
=
await
get
.
RandomUserAgent
(
wa
=
True
)
continue
logme
.
critical
(
__name__
+
':Twint:Feed:Tweets_known_error:'
+
str
(
e
))
logme
.
critical
(
__name__
+
':Twint:Feed:Tweets_known_error:'
+
str
(
e
))
sys
.
stderr
.
write
(
str
(
e
)
+
" [x] run.Feed"
)
sys
.
stderr
.
write
(
"[!] if get this error but you know for sure that more tweets exist, please open an issue and we will investigate it!"
)
sys
.
stderr
.
write
(
"[!] if get this error but you know for sure that more tweets exist, please open an issue and we will investigate it!"
)
break
if
self
.
config
.
Resume
:
print
(
self
.
init
,
file
=
open
(
self
.
config
.
Resume
,
"a"
,
encoding
=
"utf-8"
))
...
...
@@ -121,17 +148,17 @@ class Twint:
async
def
follow
(
self
):
await
self
.
Feed
()
if
self
.
config
.
User_full
:
logme
.
debug
(
__name__
+
':Twint:follow:userFull'
)
logme
.
debug
(
__name__
+
':Twint:follow:userFull'
)
self
.
count
+=
await
get
.
Multi
(
self
.
feed
,
self
.
config
,
self
.
conn
)
else
:
logme
.
debug
(
__name__
+
':Twint:follow:notUserFull'
)
logme
.
debug
(
__name__
+
':Twint:follow:notUserFull'
)
for
user
in
self
.
feed
:
self
.
count
+=
1
username
=
user
.
find
(
"a"
)[
"name"
]
await
output
.
Username
(
username
,
self
.
config
,
self
.
conn
)
async
def
favorite
(
self
):
logme
.
debug
(
__name__
+
':Twint:favorite'
)
logme
.
debug
(
__name__
+
':Twint:favorite'
)
await
self
.
Feed
()
favorited_tweets_list
=
[]
for
tweet
in
self
.
feed
:
...
...
@@ -182,21 +209,22 @@ class Twint:
async
def
profile
(
self
):
await
self
.
Feed
()
if
self
.
config
.
Profile_full
:
logme
.
debug
(
__name__
+
':Twint:profileFull'
)
logme
.
debug
(
__name__
+
':Twint:profileFull'
)
self
.
count
+=
await
get
.
Multi
(
self
.
feed
,
self
.
config
,
self
.
conn
)
else
:
logme
.
debug
(
__name__
+
':Twint:notProfileFull'
)
logme
.
debug
(
__name__
+
':Twint:notProfileFull'
)
for
tweet
in
self
.
feed
:
self
.
count
+=
1
await
output
.
Tweets
(
tweet
,
self
.
config
,
self
.
conn
)
async
def
tweets
(
self
):
await
self
.
Feed
()
# TODO : need to take care of this later
if
self
.
config
.
Location
:
logme
.
debug
(
__name__
+
':Twint:tweets:location'
)
logme
.
debug
(
__name__
+
':Twint:tweets:location'
)
self
.
count
+=
await
get
.
Multi
(
self
.
feed
,
self
.
config
,
self
.
conn
)
else
:
logme
.
debug
(
__name__
+
':Twint:tweets:notLocation'
)
logme
.
debug
(
__name__
+
':Twint:tweets:notLocation'
)
for
tweet
in
self
.
feed
:
self
.
count
+=
1
await
output
.
Tweets
(
tweet
,
self
.
config
,
self
.
conn
)
...
...
@@ -217,75 +245,82 @@ class Twint:
self
.
user_agent
=
await
get
.
RandomUserAgent
()
if
self
.
config
.
User_id
is
not
None
and
self
.
config
.
Username
is
None
:
logme
.
debug
(
__name__
+
':Twint:main:user_id'
)
self
.
config
.
Username
=
await
get
.
Username
(
self
.
config
.
User_id
)
logme
.
debug
(
__name__
+
':Twint:main:user_id'
)
self
.
config
.
Username
=
await
get
.
Username
(
self
.
config
.
User_id
,
self
.
config
.
Bearer_token
,
self
.
config
.
Guest_token
)
if
self
.
config
.
Username
is
not
None
and
self
.
config
.
User_id
is
None
:
logme
.
debug
(
__name__
+
':Twint:main:username'
)
url
=
f
"https://twitter.com/{self.config.Username}?lang=en"
self
.
config
.
User_id
=
await
get
.
User
(
url
,
self
.
config
,
self
.
conn
,
True
)
logme
.
debug
(
__name__
+
':Twint:main:username'
)
self
.
config
.
User_id
=
await
get
.
User
(
self
.
config
.
Username
,
self
.
config
,
self
.
conn
,
self
.
config
.
Bearer_token
,
self
.
config
.
Guest_token
,
True
)
if
self
.
config
.
User_id
is
None
:
raise
ValueError
(
"Cannot find twitter account with name = "
+
self
.
config
.
Username
)
# TODO : will need to modify it to work with the new endpoints
if
self
.
config
.
TwitterSearch
and
self
.
config
.
Since
and
self
.
config
.
Until
:
logme
.
debug
(
__name__
+
':Twint:main:search+since+until'
)
logme
.
debug
(
__name__
+
':Twint:main:search+since+until'
)
while
self
.
d
.
_since
<
self
.
d
.
_until
:
self
.
config
.
Since
=
str
(
self
.
d
.
_since
)
self
.
config
.
Until
=
str
(
self
.
d
.
_until
)
if
len
(
self
.
feed
)
>
0
:
await
self
.
tweets
()
else
:
logme
.
debug
(
__name__
+
':Twint:main:gettingNewTweets'
)
logme
.
debug
(
__name__
+
':Twint:main:gettingNewTweets'
)
break
if
get
.
Limit
(
self
.
config
.
Limit
,
self
.
count
):
break
else
:
logme
.
debug
(
__name__
+
':Twint:main:not-search+since+until'
)
logme
.
debug
(
__name__
+
':Twint:main:not-search+since+until'
)
while
True
:
if
len
(
self
.
feed
)
>
0
:
if
self
.
config
.
Followers
or
self
.
config
.
Following
:
logme
.
debug
(
__name__
+
':Twint:main:follow'
)
logme
.
debug
(
__name__
+
':Twint:main:follow'
)
await
self
.
follow
()
elif
self
.
config
.
Favorites
:
logme
.
debug
(
__name__
+
':Twint:main:favorites'
)
logme
.
debug
(
__name__
+
':Twint:main:favorites'
)
await
self
.
favorite
()
elif
self
.
config
.
Profile
:
logme
.
debug
(
__name__
+
':Twint:main:profile'
)
logme
.
debug
(
__name__
+
':Twint:main:profile'
)
await
self
.
profile
()
elif
self
.
config
.
TwitterSearch
:
logme
.
debug
(
__name__
+
':Twint:main:twitter-search'
)
logme
.
debug
(
__name__
+
':Twint:main:twitter-search'
)
await
self
.
tweets
()
else
:
logme
.
debug
(
__name__
+
':Twint:main:no-more-tweets'
)
logme
.
debug
(
__name__
+
':Twint:main:no-more-tweets'
)
break
#logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
#
logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
if
get
.
Limit
(
self
.
config
.
Limit
,
self
.
count
):
logme
.
debug
(
__name__
+
':Twint:main:reachedLimit'
)
logme
.
debug
(
__name__
+
':Twint:main:reachedLimit'
)
break
if
self
.
config
.
Count
:
verbose
.
Count
(
self
.
count
,
self
.
config
)
def
run
(
config
,
callback
=
None
):
logme
.
debug
(
__name__
+
':run'
)
logme
.
debug
(
__name__
+
':run'
)
try
:
get_event_loop
()
except
RuntimeError
as
e
:
if
"no current event loop"
in
str
(
e
):
set_event_loop
(
new_event_loop
())
else
:
logme
.
exception
(
__name__
+
':Lookup
:Unexpected exception while handling an expected RuntimeError.'
)
logme
.
exception
(
__name__
+
':run
:Unexpected exception while handling an expected RuntimeError.'
)
raise
except
Exception
as
e
:
logme
.
exception
(
__name__
+
':Lookup:Unexpected exception occured while attempting to get or create a new event loop.'
)
logme
.
exception
(
__name__
+
':run:Unexpected exception occurred while attempting to get or create a new event loop.'
)
raise
get_event_loop
()
.
run_until_complete
(
Twint
(
config
)
.
main
(
callback
))
def
Favorites
(
config
):
logme
.
debug
(
__name__
+
':Favorites'
)
logme
.
debug
(
__name__
+
':Favorites'
)
config
.
Favorites
=
True
config
.
Following
=
False
config
.
Followers
=
False
...
...
@@ -296,8 +331,9 @@ def Favorites(config):
if
config
.
Pandas_au
:
storage
.
panda
.
_autoget
(
"tweet"
)
def
Followers
(
config
):
logme
.
debug
(
__name__
+
':Followers'
)
logme
.
debug
(
__name__
+
':Followers'
)
config
.
Followers
=
True
config
.
Following
=
False
config
.
Profile
=
False
...
...
@@ -310,11 +346,12 @@ def Followers(config):
if
config
.
User_full
:
storage
.
panda
.
_autoget
(
"user"
)
if
config
.
Pandas_clean
and
not
config
.
Store_object
:
#storage.panda.clean()
#
storage.panda.clean()
output
.
_clean_follow_list
()
def
Following
(
config
):
logme
.
debug
(
__name__
+
':Following'
)
logme
.
debug
(
__name__
+
':Following'
)
config
.
Following
=
True
config
.
Followers
=
False
config
.
Profile
=
False
...
...
@@ -327,11 +364,12 @@ def Following(config):
if
config
.
User_full
:
storage
.
panda
.
_autoget
(
"user"
)
if
config
.
Pandas_clean
and
not
config
.
Store_object
:
#storage.panda.clean()
#
storage.panda.clean()
output
.
_clean_follow_list
()
def
Lookup
(
config
):
logme
.
debug
(
__name__
+
':Lookup'
)
logme
.
debug
(
__name__
+
':Lookup'
)
try
:
get_event_loop
()
...
...
@@ -339,15 +377,16 @@ def Lookup(config):
if
"no current event loop"
in
str
(
e
):
set_event_loop
(
new_event_loop
())
else
:
logme
.
exception
(
__name__
+
':Lookup:Unexpected exception while handling an expected RuntimeError.'
)
logme
.
exception
(
__name__
+
':Lookup:Unexpected exception while handling an expected RuntimeError.'
)
raise
except
Exception
as
e
:
logme
.
exception
(
__name__
+
':Lookup:Unexpected exception occured while attempting to get or create a new event loop.'
)
logme
.
exception
(
__name__
+
':Lookup:Unexpected exception occured while attempting to get or create a new event loop.'
)
raise
try
:
if
config
.
User_id
is
not
None
:
logme
.
debug
(
__name__
+
':Twint:Lookup:user_id'
)
logme
.
debug
(
__name__
+
':Twint:Lookup:user_id'
)
config
.
Username
=
get_event_loop
()
.
run_until_complete
(
get
.
Username
(
config
.
User_id
))
url
=
f
"https://mobile.twitter.com/{config.Username}?prefetchTimestamp="
+
str
(
int
(
time
.
time
()
*
1000
))
...
...
@@ -357,15 +396,16 @@ def Lookup(config):
storage
.
panda
.
_autoget
(
"user"
)
except
RuntimeError
as
e
:
if
"no current event loop"
in
str
(
e
):
logme
.
exception
(
__name__
+
':Lookup:Previous attempt to to create an event loop failed.'
)
logme
.
exception
(
__name__
+
':Lookup:Previous attempt to to create an event loop failed.'
)
raise
except
Exception
as
e
:
logme
.
exception
(
__name__
+
':Lookup:Unexpected exception occured.'
)
logme
.
exception
(
__name__
+
':Lookup:Unexpected exception occured.'
)
raise
def
Profile
(
config
):
logme
.
debug
(
__name__
+
':Profile'
)
logme
.
debug
(
__name__
+
':Profile'
)
config
.
Profile
=
True
config
.
Favorites
=
False
config
.
Following
=
False
...
...
@@ -375,8 +415,9 @@ def Profile(config):
if
config
.
Pandas_au
:
storage
.
panda
.
_autoget
(
"tweet"
)
def
Search
(
config
,
callback
=
None
):
logme
.
debug
(
__name__
+
':Search'
)
logme
.
debug
(
__name__
+
':Search'
)
config
.
TwitterSearch
=
True
config
.
Favorites
=
False
config
.
Following
=
False
...
...
twint/token.py
0 → 100644
View file @
2d638de0
import
re
import
time
import
requests
import
logging
as
logme
class
TokenExpiryException
(
Exception
):
def
__init__
(
self
,
msg
):
super
()
.
__init__
(
msg
)
class
Token
:
def
__init__
(
self
,
config
):
self
.
_session
=
requests
.
Session
()
self
.
config
=
config
self
.
_retries
=
5
self
.
_timeout
=
10
self
.
url
=
'https://twitter.com'
def
_request
(
self
):
for
attempt
in
range
(
self
.
_retries
+
1
):
# The request is newly prepared on each retry because of potential cookie updates.
req
=
self
.
_session
.
prepare_request
(
requests
.
Request
(
'GET'
,
self
.
url
))
logme
.
debug
(
f
'Retrieving {req.url}'
)
try
:
r
=
self
.
_session
.
send
(
req
,
allow_redirects
=
True
,
timeout
=
self
.
_timeout
)
except
requests
.
exceptions
.
RequestException
as
exc
:
if
attempt
<
self
.
_retries
:
retrying
=
', retrying'
level
=
logme
.
WARNING
else
:
retrying
=
''
level
=
logme
.
ERROR
logme
.
log
(
level
,
f
'Error retrieving {req.url}: {exc!r}{retrying}'
)
else
:
success
,
msg
=
(
True
,
None
)
msg
=
f
': {msg}'
if
msg
else
''
if
success
:
logme
.
debug
(
f
'{req.url} retrieved successfully{msg}'
)
return
r
if
attempt
<
self
.
_retries
:
# TODO : might wanna tweak this back-off timer
sleep_time
=
2.0
*
2
**
attempt
logme
.
info
(
f
'Waiting {sleep_time:.0f} seconds'
)
time
.
sleep
(
sleep_time
)
else
:
msg
=
f
'{self._retries + 1} requests to {self.url} failed, giving up.'
logme
.
fatal
(
msg
)
self
.
config
.
Guest_token
=
None
raise
RefreshTokenException
(
msg
)
def
refresh
(
self
):
logme
.
debug
(
'Retrieving guest token'
)
res
=
self
.
_request
()
match
=
re
.
search
(
r'\("gt=(\d+);'
,
res
.
text
)
if
match
:
logme
.
debug
(
'Found guest token in HTML'
)
self
.
config
.
Guest_token
=
str
(
match
.
group
(
1
))
else
:
self
.
config
.
Guest_token
=
None
raise
RefreshTokenException
(
'Could not find the Guest token in HTML'
)
twint/tweet.py
View file @
2d638de0
from
time
import
strftime
,
localtime
from
datetime
import
datetime
from
datetime
import
datetime
,
timezone
import
json
import
logging
as
logme
...
...
@@ -9,6 +9,7 @@ from googletransx import Translator
# - https://github.com/x0rzkov/py-googletrans#basic-usage
translator
=
Translator
()
class
tweet
:
"""Define Tweet class
"""
...
...
@@ -17,52 +18,63 @@ class tweet:
def
__init__
(
self
):
pass
def
utc_to_local
(
utc_dt
):
return
utc_dt
.
replace
(
tzinfo
=
timezone
.
utc
)
.
astimezone
(
tz
=
None
)
def
getMentions
(
tw
):
"""Extract ment from tweet
"""Extract ment
ions
from tweet
"""
logme
.
debug
(
__name__
+
':getMentions'
)
logme
.
debug
(
__name__
+
':getMentions'
)
mentions
=
[]
try
:
mentions
=
tw
[
"data-mentions"
]
.
split
(
" "
)
except
:
for
mention
in
tw
[
'entities'
][
'user_mentions'
]:
mentions
.
append
(
mention
[
'screen_name'
])
except
KeyError
:
mentions
=
[]
return
mentions
def
getQuoteURL
(
tw
):
"""Extract quote from tweet
"""
logme
.
debug
(
__name__
+
':getQuoteURL'
)
logme
.
debug
(
__name__
+
':getQuoteURL'
)
base_twitter
=
"https://twitter.com"
quote_url
=
""
try
:
quote
=
tw
.
find
(
"div"
,
"QuoteTweet-innerContainer"
)
quote
=
tw
.
find
(
"div"
,
"QuoteTweet-innerContainer"
)
quote_url
=
base_twitter
+
quote
.
get
(
"href"
)
except
:
quote_url
=
""
return
quote_url
def
getText
(
tw
):
"""Replace some text
"""
logme
.
debug
(
__name__
+
':getText'
)
text
=
tw
.
find
(
"p"
,
"tweet-text"
)
.
text
text
=
text
.
replace
(
"http"
,
" http"
)
text
=
text
.
replace
(
"pic.twitter"
,
" pic.twitter"
)
return
text
# def getText(tw):
# """Replace some text
# """
# logme.debug(__name__ + ':getText')
# text = tw.find("p", "tweet-text").text
# text = text.replace("http", " http")
# text = text.replace("pic.twitter", " pic.twitter")
#
# return text
def
getStat
(
tw
,
_type
):
"""Get stats about Tweet
"""
logme
.
debug
(
__name__
+
':getStat'
)
logme
.
debug
(
__name__
+
':getStat'
)
st
=
f
"ProfileTweet-action--{_type} u-hiddenVisually"
return
tw
.
find
(
"span"
,
st
)
.
find
(
"span"
)[
"data-tweet-stat-count"
]
def
getRetweet
(
tw
,
_config
):
"""Get Retweet
"""
logme
.
debug
(
__name__
+
':getRetweet'
)
logme
.
debug
(
__name__
+
':getRetweet'
)
if
_config
.
Profile
:
if
int
(
tw
[
"data-user-id"
])
!=
_config
.
User_id
:
return
_config
.
User_id
,
_config
.
Username
...
...
@@ -71,63 +83,159 @@ def getRetweet(tw, _config):
if
_rt_object
:
_rt_id
=
_rt_object
.
find
(
'a'
)[
'data-user-id'
]
_rt_username
=
_rt_object
.
find
(
'a'
)[
'href'
][
1
:]
return
_rt_id
,
_rt_username
return
_rt_id
,
_rt_username
return
''
,
''
def
getThumbnail
(
tw
):
"""Get Thumbnail
"""
divs
=
tw
.
find_all
(
"div"
,
"PlayableMedia-player"
)
thumb
=
""
for
div
in
divs
:
thumb
=
div
.
attrs
[
"style"
]
.
split
(
"url('"
)[
-
1
]
thumb
=
thumb
.
replace
(
"')"
,
""
)
return
thumb
# def getThumbnail(tw):
# """Get Thumbnail
# """
# divs = tw.find_all("div", "PlayableMedia-player")
# thumb = ""
# for div in divs:
# thumb = div.attrs["style"].split("url('")[-1]
# thumb = thumb.replace("')", "")
# return thumb
# def Tweet(tw, config):
# """Create Tweet object
# """
# logme.debug(__name__+':Tweet')
# t = tweet()
# t.id = int(tw["data-item-id"])
# t.id_str = tw["data-item-id"]
# t.conversation_id = tw["data-conversation-id"]
# t.datetime = int(tw.find("span", "_timestamp")["data-time-ms"])
# t.datestamp = strftime("%Y-%m-%d", localtime(t.datetime/1000.0))
# t.timestamp = strftime("%H:%M:%S", localtime(t.datetime/1000.0))
# t.user_id = int(tw["data-user-id"])
# t.user_id_str = tw["data-user-id"]
# t.username = tw["data-screen-name"]
# t.name = tw["data-name"]
# t.place = tw.find("a","js-geo-pivot-link").text.strip() if tw.find("a","js-geo-pivot-link") else ""
# t.timezone = strftime("%z", localtime())
# for img in tw.findAll("img", "Emoji Emoji--forText"):
# img.replaceWith(img["alt"])
# t.mentions = getMentions(tw)
# t.urls = [link.attrs["data-expanded-url"] for link in tw.find_all('a',{'class':'twitter-timeline-link'}) if link.has_attr("data-expanded-url")]
# t.photos = [photo_node.attrs['data-image-url'] for photo_node in tw.find_all("div", "AdaptiveMedia-photoContainer")]
# t.video = 1 if tw.find_all("div", "AdaptiveMedia-video") != [] else 0
# t.thumbnail = getThumbnail(tw)
# t.tweet = getText(tw)
# t.lang = tw.find('p', 'tweet-text')['lang']
# t.hashtags = [hashtag.text for hashtag in tw.find_all("a","twitter-hashtag")]
# t.cashtags = [cashtag.text for cashtag in tw.find_all("a", "twitter-cashtag")]
# t.replies_count = getStat(tw, "reply")
# t.retweets_count = getStat(tw, "retweet")
# t.likes_count = getStat(tw, "favorite")
# t.link = f"https://twitter.com/{t.username}/status/{t.id}"
# t.user_rt_id, t.user_rt = getRetweet(tw, config)
# t.retweet = True if t.user_rt else False
# t.retweet_id = ''
# t.retweet_date = ''
# if not config.Profile:
# t.retweet_id = tw['data-retweet-id'] if t.user_rt else ''
# t.retweet_date = datetime.fromtimestamp(((int(t.retweet_id) >> 22) + 1288834974657)/1000.0).strftime("%Y-%m-%d %H:%M:%S") if t.user_rt else ''
# t.quote_url = getQuoteURL(tw)
# t.near = config.Near if config.Near else ""
# t.geo = config.Geo if config.Geo else ""
# t.source = config.Source if config.Source else ""
# t.reply_to = [{'user_id': t['id_str'], 'username': t['screen_name']} for t in json.loads(tw["data-reply-to-users-json"])]
# t.translate = ''
# t.trans_src = ''
# t.trans_dest = ''
# if config.Translate == True:
# try:
# ts = translator.translate(text=t.tweet, dest=config.TranslateDest)
# t.translate = ts.text
# t.trans_src = ts.src
# t.trans_dest = ts.dest
# # ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
# except ValueError as e:
# raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
# logme.debug(__name__+':Tweet:translator.translate:'+str(e))
# return t
def
Tweet
(
tw
,
config
):
"""Create Tweet object
"""
logme
.
debug
(
__name__
+
':Tweet'
)
logme
.
debug
(
__name__
+
':Tweet'
)
t
=
tweet
()
t
.
id
=
int
(
tw
[
"data-item-id"
])
t
.
id_str
=
tw
[
"data-item-id"
]
t
.
conversation_id
=
tw
[
"data-conversation-id"
]
t
.
datetime
=
int
(
tw
.
find
(
"span"
,
"_timestamp"
)[
"data-time-ms"
])
t
.
datestamp
=
strftime
(
"
%
Y-
%
m-
%
d"
,
localtime
(
t
.
datetime
/
1000.0
))
t
.
timestamp
=
strftime
(
"
%
H:
%
M:
%
S"
,
localtime
(
t
.
datetime
/
1000.0
))
t
.
user_id
=
int
(
tw
[
"data-user-id"
])
t
.
user_id_str
=
tw
[
"data-user-id"
]
t
.
username
=
tw
[
"data-screen-name"
]
t
.
name
=
tw
[
"data-name"
]
t
.
place
=
tw
.
find
(
"a"
,
"js-geo-pivot-link"
)
.
text
.
strip
()
if
tw
.
find
(
"a"
,
"js-geo-pivot-link"
)
else
""
t
.
id
=
int
(
tw
[
'id_str'
])
t
.
id_str
=
tw
[
"id_str"
]
t
.
conversation_id
=
tw
[
"conversation_id_str"
]
# parsing date to user-friendly format
_dt
=
tw
[
'created_at'
]
_dt
=
datetime
.
strptime
(
_dt
,
'
%
a
%
b
%
d
%
H:
%
M:
%
S
%
z
%
Y'
)
_dt
=
utc_to_local
(
_dt
)
t
.
datetime
=
str
(
_dt
.
strftime
(
'
%
d-
%
m-
%
Y
%
H:
%
M:
%
S
%
Z'
))
# date is of the format year,
t
.
datestamp
=
_dt
.
strftime
(
'
%
d-
%
m-
%
Y'
)
t
.
timestamp
=
_dt
.
strftime
(
'
%
H:
%
M:
%
S'
)
t
.
user_id
=
int
(
tw
[
"user_id_str"
])
t
.
user_id_str
=
tw
[
"user_id_str"
]
t
.
username
=
tw
[
"user_data"
][
'screen_name'
]
t
.
name
=
tw
[
"user_data"
][
'name'
]
t
.
place
=
tw
[
'geo'
]
if
tw
[
'geo'
]
else
""
t
.
timezone
=
strftime
(
"
%
z"
,
localtime
())
for
img
in
tw
.
findAll
(
"img"
,
"Emoji Emoji--forText"
):
img
.
replaceWith
(
img
[
"alt"
])
t
.
mentions
=
getMentions
(
tw
)
t
.
urls
=
[
link
.
attrs
[
"data-expanded-url"
]
for
link
in
tw
.
find_all
(
'a'
,{
'class'
:
'twitter-timeline-link'
})
if
link
.
has_attr
(
"data-expanded-url"
)]
t
.
photos
=
[
photo_node
.
attrs
[
'data-image-url'
]
for
photo_node
in
tw
.
find_all
(
"div"
,
"AdaptiveMedia-photoContainer"
)]
t
.
video
=
1
if
tw
.
find_all
(
"div"
,
"AdaptiveMedia-video"
)
!=
[]
else
0
t
.
thumbnail
=
getThumbnail
(
tw
)
t
.
tweet
=
getText
(
tw
)
t
.
lang
=
tw
.
find
(
'p'
,
'tweet-text'
)[
'lang'
]
t
.
hashtags
=
[
hashtag
.
text
for
hashtag
in
tw
.
find_all
(
"a"
,
"twitter-hashtag"
)]
t
.
cashtags
=
[
cashtag
.
text
for
cashtag
in
tw
.
find_all
(
"a"
,
"twitter-cashtag"
)]
t
.
replies_count
=
getStat
(
tw
,
"reply"
)
t
.
retweets_count
=
getStat
(
tw
,
"retweet"
)
t
.
likes_count
=
getStat
(
tw
,
"favorite"
)
# for img in tw.findAll("img", "Emoji Emoji--forText"):
# img.replaceWith(img["alt"])
try
:
t
.
mentions
=
[
_mention
[
'screen_name'
]
for
_mention
in
tw
[
'entities'
][
'user_mentions'
]]
except
KeyError
:
t
.
mentions
=
[]
try
:
t
.
urls
=
[
_url
[
'expanded_url'
]
for
_url
in
tw
[
'entities'
][
'urls'
]]
except
KeyError
:
t
.
urls
=
[]
try
:
t
.
photos
=
[
_img
[
'media_url_https'
]
for
_img
in
tw
[
'entities'
][
'media'
]
if
_img
[
'type'
]
==
'photo'
and
_img
[
'expanded_url'
]
.
find
(
'/photo/'
)
!=
-
1
]
except
KeyError
:
t
.
photos
=
[]
try
:
t
.
video
=
1
if
len
(
tw
[
'extended_entities'
][
'media'
])
else
0
except
KeyError
:
t
.
video
=
0
try
:
t
.
thumbnail
=
tw
[
'extended_entities'
][
'media'
][
0
][
'media_url_https'
]
except
KeyError
:
t
.
thumbnail
=
''
t
.
tweet
=
tw
[
'full_text'
]
t
.
lang
=
tw
[
'lang'
]
try
:
t
.
hashtags
=
[
hashtag
[
'text'
]
for
hashtag
in
tw
[
'entities'
][
'hashtags'
]]
except
KeyError
:
t
.
hashtags
=
[]
# don't know what this is
# t.cashtags = [cashtag.text for cashtag in tw.find_all("a", "twitter-cashtag")]
t
.
replies_count
=
tw
[
'reply_count'
]
t
.
retweets_count
=
tw
[
'retweet_count'
]
t
.
likes_count
=
tw
[
'favorite_count'
]
t
.
link
=
f
"https://twitter.com/{t.username}/status/{t.id}"
t
.
user_rt_id
,
t
.
user_rt
=
getRetweet
(
tw
,
config
)
t
.
retweet
=
True
if
t
.
user_rt
else
False
t
.
retweet_id
=
''
t
.
retweet_date
=
''
if
not
config
.
Profile
:
t
.
retweet_id
=
tw
[
'data-retweet-id'
]
if
t
.
user_rt
else
''
t
.
retweet_date
=
datetime
.
fromtimestamp
(((
int
(
t
.
retweet_id
)
>>
22
)
+
1288834974657
)
/
1000.0
)
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
if
t
.
user_rt
else
''
t
.
quote_url
=
getQuoteURL
(
tw
)
# TODO: someone who is familiar with this code, needs to take a look at what this is
# t.user_rt_id, t.user_rt = getRetweet(tw, config)
# t.retweet = True if t.user_rt else False
# t.retweet_id = ''
# t.retweet_date = ''
# if not config.Profile:
# t.retweet_id = tw['data-retweet-id'] if t.user_rt else ''
# t.retweet_date = datetime.fromtimestamp(((int(t.retweet_id) >> 22) + 1288834974657) / 1000.0).strftime(
# "%Y-%m-%d %H:%M:%S") if t.user_rt else ''
try
:
t
.
quote_url
=
tw
[
'quoted_status_permalink'
][
'expanded'
]
if
tw
[
'is_quote_status'
]
else
''
except
KeyError
:
# means that the quoted tweet have been deleted
t
.
quote_url
=
0
t
.
near
=
config
.
Near
if
config
.
Near
else
""
t
.
geo
=
config
.
Geo
if
config
.
Geo
else
""
t
.
source
=
config
.
Source
if
config
.
Source
else
""
t
.
reply_to
=
[{
'user_id'
:
t
[
'id_str'
],
'username'
:
t
[
'screen_name'
]}
for
t
in
json
.
loads
(
tw
[
"data-reply-to-users-json"
])]
# TODO: check this whether we need the list of all the users to whom this tweet is a reply or we only need
# the immediately above user id
t
.
reply_to
=
{
'user_id'
:
tw
[
'in_reply_to_user_id_str'
],
'username'
:
tw
[
'in_reply_to_screen_name'
]}
t
.
translate
=
''
t
.
trans_src
=
''
t
.
trans_dest
=
''
...
...
@@ -140,5 +248,5 @@ def Tweet(tw, config):
# ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
except
ValueError
as
e
:
raise
Exception
(
"Invalid destination language: {} / Tweet: {}"
.
format
(
config
.
TranslateDest
,
t
.
tweet
))
logme
.
debug
(
__name__
+
':Tweet:translator.translate:'
+
str
(
e
))
logme
.
debug
(
__name__
+
':Tweet:translator.translate:'
+
str
(
e
))
return
t
twint/url.py
View file @
2d638de0
import
datetime
from
sys
import
platform
import
logging
as
logme
from
urllib.parse
import
urlencode
from
urllib.parse
import
quote
mobile
=
"https://mobile.twitter.com"
base
=
"https://twitter.com/i"
# base = "https://twitter.com/i"
base
=
"https://api.twitter.com/2/search/adaptive.json"
def
_sanitizeQuery
(
base
,
params
):
def
_sanitizeQuery
(
_url
,
params
):
_serialQuery
=
""
for
p
in
params
:
_serialQuery
+=
p
[
0
]
+
"="
+
p
[
1
]
+
"&"
_serialQuery
=
base
+
"?"
+
_serialQuery
[:
-
1
]
.
replace
(
":"
,
"
%3
A"
)
.
replace
(
" "
,
"
%20
"
)
_serialQuery
=
urlencode
(
params
,
quote_via
=
quote
)
_serialQuery
=
_url
+
"?"
+
_serialQuery
return
_serialQuery
def
_formatDate
(
date
):
if
"win"
in
platform
:
return
f
'
\"
{date.split()[0]}
\"
'
...
...
@@ -20,8 +24,9 @@ def _formatDate(date):
except
ValueError
:
return
int
(
datetime
.
datetime
.
strptime
(
date
,
"
%
Y-
%
m-
%
d"
)
.
timestamp
())
async
def
Favorites
(
username
,
init
):
logme
.
debug
(
__name__
+
':Favorites'
)
logme
.
debug
(
__name__
+
':Favorites'
)
url
=
f
"{mobile}/{username}/favorites?lang=en"
if
init
!=
'-1'
:
...
...
@@ -29,8 +34,9 @@ async def Favorites(username, init):
return
url
async
def
Followers
(
username
,
init
):
logme
.
debug
(
__name__
+
':Followers'
)
logme
.
debug
(
__name__
+
':Followers'
)
url
=
f
"{mobile}/{username}/followers?lang=en"
if
init
!=
'-1'
:
...
...
@@ -38,8 +44,9 @@ async def Followers(username, init):
return
url
async
def
Following
(
username
,
init
):
logme
.
debug
(
__name__
+
':Following'
)
logme
.
debug
(
__name__
+
':Following'
)
url
=
f
"{mobile}/{username}/following?lang=en"
if
init
!=
'-1'
:
...
...
@@ -47,8 +54,9 @@ async def Following(username, init):
return
url
async
def
MobileProfile
(
username
,
init
):
logme
.
debug
(
__name__
+
':MobileProfile'
)
logme
.
debug
(
__name__
+
':MobileProfile'
)
url
=
f
"{mobile}/{username}?lang=en"
if
init
!=
'-1'
:
...
...
@@ -56,8 +64,9 @@ async def MobileProfile(username, init):
return
url
async
def
Profile
(
username
,
init
):
logme
.
debug
(
__name__
+
':Profile'
)
logme
.
debug
(
__name__
+
':Profile'
)
url
=
f
"{base}/profiles/show/{username}/timeline/tweets?include_"
url
+=
"available_features=1&lang=en&include_entities=1"
url
+=
"&include_new_items_bar=true"
...
...
@@ -67,17 +76,38 @@ async def Profile(username, init):
return
url
async
def
Search
(
config
,
init
):
logme
.
debug
(
__name__
+
':Search'
)
url
=
f
"{base}/search/timeline"
logme
.
debug
(
__name__
+
':Search'
)
url
=
base
tweet_count
=
100
q
=
""
params
=
[
(
'vertical'
,
'default'
),
(
'src'
,
'unkn'
),
(
'include_available_features'
,
'1'
),
(
'include_entities'
,
'1'
),
(
'max_position'
,
str
(
init
)),
(
'reset_error_state'
,
'false'
),
# ('include_blocking', '1'),
# ('include_blocked_by', '1'),
# ('include_followed_by', '1'),
# ('include_want_retweets', '1'),
# ('include_mute_edge', '1'),
# ('include_can_dm', '1'),
(
'include_can_media_tag'
,
'1'
),
# ('skip_status', '1'),
# ('include_cards', '1'),
(
'include_ext_alt_text'
,
'true'
),
(
'include_quote_count'
,
'true'
),
(
'include_reply_count'
,
'1'
),
(
'tweet_mode'
,
'extended'
),
(
'include_entities'
,
'true'
),
(
'include_user_entities'
,
'true'
),
(
'include_ext_media_availability'
,
'true'
),
(
'send_error_codes'
,
'true'
),
(
'simple_quoted_tweet'
,
'true'
),
(
'count'
,
tweet_count
),
# ('query_source', 'typed_query'),
# ('pc', '1'),
(
'cursor'
,
str
(
init
)),
(
'spelling_corrections'
,
'1'
),
(
'ext'
,
'mediaStats
%2
ChighlightedLabel'
),
(
'tweet_search_mode'
,
'live'
),
# this can be handled better, maybe take an argument and set it then
]
if
not
config
.
Popular_tweets
:
params
.
append
((
'f'
,
'tweets'
))
...
...
@@ -92,7 +122,8 @@ async def Search(config, init):
config
.
Geo
=
config
.
Geo
.
replace
(
" "
,
""
)
q
+=
f
" geocode:{config.Geo}"
if
config
.
Search
:
q
+=
f
" {config.Search}"
q
+=
f
"{config.Search}"
if
config
.
Year
:
q
+=
f
" until:{config.Year}-1-1"
if
config
.
Since
:
...
...
@@ -120,6 +151,7 @@ async def Search(config, init):
q
+=
" filter:media"
if
config
.
Replies
:
q
+=
" filter:replies"
# although this filter can still be used, but I found it broken in my preliminary testing, needs more testing
if
config
.
Native_retweets
:
q
+=
" filter:nativeretweets"
if
config
.
Min_likes
:
...
...
@@ -144,3 +176,43 @@ async def Search(config, init):
params
.
append
((
"q"
,
q
))
_serialQuery
=
_sanitizeQuery
(
url
,
params
)
return
url
,
params
,
_serialQuery
# maybe dont need this
async def SearchProfile(config, init=None):
    """Build the request pieces for a profile timeline page (api.twitter.com/2).

    :param config: twint Config object (unused here beyond the signature -- TODO confirm callers)
    :param init: pagination cursor from a previous response, or None for the first page
    :return: tuple of (url template, params list, sanitized serial query)
    """
    logme.debug(__name__ + ':SearchProfile')
    _url = 'https://api.twitter.com/2/timeline/profile/{}.json?'
    params = [
        ('include_profile_interstitial_type', '1'),
        ('include_blocking', '1'),
        ('include_blocked_by', '1'),
        ('include_followed_by', '1'),
        ('include_want_retweets', '1'),
        ('include_mute_edge', '1'),
        ('include_can_dm', '1'),
        ('include_can_media_tag', '1'),
        ('skip_status', '1'),
        ('cards_platform', 'Web - 12'),
        ('include_cards', '1'),
        ('include_ext_alt_text', 'true'),
        ('include_quote_count', 'true'),
        ('include_reply_count', '1'),
        ('tweet_mode', 'extended'),
        ('include_entities', 'true'),
        ('include_user_entities', 'true'),
        ('include_ext_media_color', 'true'),
        ('include_ext_media_availability', 'true'),
        ('send_error_codes', 'true'),
        ('simple_quoted_tweet', 'true'),
        ('include_tweet_replies', 'false'),
        ('count', '50'),
        # NOTE(review): hard-coded userId looks like a leftover sample value --
        # confirm it is overridden/formatted by the caller before the request is made.
        ('userId', '1934388686'),
        ('ext', 'mediaStats,ChighlightedLabel'),
    ]
    if init:
        params.append(('cursor', init))
    _serialQuery = _sanitizeQuery(_url, params)
    # removed: unused local `q = ""` and an unreachable trailing `pass`
    return _url, params, _serialQuery
twint/user.py
View file @
2d638de0
import
datetime
import
logging
as
logme
class User:
    """Plain record object for a Twitter user profile.

    Attributes (id, name, username, counters, flags, images) are assigned
    dynamically by the parser functions in this module.
    """

    # Marks this record type for downstream formatters/output.
    type = "user"

    def __init__(self):
        pass


# Backward-compat alias: the legacy HTML parser in this module instantiates
# the class under its old lowercase name `user`.
user = User
def inf(ur, _type):
    """Extract one profile field from a parsed profile page.

    :param ur: soup-like parsed HTML of a profile page -- assumed BeautifulSoup; TODO confirm
    :param _type: one of "id", "name", "username", "private"
    :return: the requested value, or "" when it cannot be determined
    """
    logme.debug(__name__ + ':inf')
    group = None
    try:
        # Profile container markup varies; try the known class combinations in order.
        group = ur.find("div", "profile")
        if group is None:
            group = ur.find("div", "user-actions btn-group not-following")
        if group is None:
            group = ur.find("div", "user-actions btn-group not-following protected")
    except Exception as e:
        print("Error: " + str(e))
    ret = ""  # default so an unknown _type no longer raises UnboundLocalError
    if _type == "id":
        screen_name = group.find("span", "screen-name").text
        ret = ur.find("a", {"data-screenname": screen_name})
        ret = ret.get('data-mentioned-user-id') if ret is not None else None
        ret = "" if ret is None else ret
    elif _type == "name":
        ret = group.find("div", "fullname").text.split('\n')[0]
    elif _type == "username":
        ret = group.find("span", "screen-name").text
    elif _type == "private":
        # Presence of the "protected" marker means a private account.
        ret = 1 if group.find("div", "protected") else 0
    return ret
def card(ur, _type):
    """Extract a profile-card field from a parsed profile page.

    :param ur: soup-like parsed HTML of a profile page -- assumed BeautifulSoup; TODO confirm
    :param _type: one of "bio", "location", "url"
    :return: the field text, or "" when absent or on lookup failure (best-effort)
    """
    logme.debug(__name__ + ':card')
    ret = ""  # default for unknown _type (was an UnboundLocalError before)
    if _type == "bio":
        try:
            ret = ur.find("div", "bio").text.replace("\n", " ").strip()
        except Exception:  # narrowed from bare except; keep best-effort behavior
            ret = ""
    elif _type == "location":
        try:
            ret = ur.find("div", "location").text
        except Exception:
            ret = ""
    elif _type == "url":
        try:
            ret = ur.find("link")["href"]
        except Exception:
            ret = ""
    return ret
def join(ur):
    """Return [join_time, join_date] parsed from the profile header tooltip.

    The tooltip title is split on " - " (presumably "<time> - <date>" --
    TODO confirm against live markup). On any failure a pair of empty
    strings is returned so callers can index [0]/[1] unconditionally.
    """
    try:
        logme.debug(__name__ + ':join')
        jd = ur.find("span", "ProfileHeaderCard-joinDateText js-tooltip u-dir")["title"]
        return jd.split(" - ")
    except Exception:  # narrowed from a bare except
        return ["", ""]
def convertToInt(x):
    """Convert a display count such as "1,234", "3.5k" or "2M" to an int.

    :param x: count string; plain digits (optionally comma-separated) or a
              number with a k/m/b suffix
    :return: integer value, or 0 when the input cannot be interpreted
    """
    logme.debug(__name__ + ':convertToInt')  # fixed log-tag typo (was ':contertToInt')
    multDict = {
        "k": 1000,
        "m": 1000000,
        "b": 1000000000,
    }
    try:
        # Plain number, possibly with thousands separators.
        if ',' in x:
            x = x.replace(',', '')
        return int(x)
    except (ValueError, TypeError):  # narrowed from a bare except
        pass
    try:
        # Abbreviated form: numeric prefix times a k/m/b multiplier suffix.
        y = float(str(x)[:-1]) * multDict[str(x)[-1:].lower()]
        return int(y)
    except (ValueError, TypeError, KeyError):  # narrowed from a bare except
        pass
    return 0
def stat(ur, _type):
    """Read one counter from the profile-stats table of a parsed profile page.

    :param ur: soup-like parsed HTML of a profile page -- assumed BeautifulSoup; TODO confirm
    :param _type: lowercase label, e.g. "tweets", "following", "followers"
    :return: the counter as an int, or 0 when the label is not present
    """
    logme.debug(__name__ + ':stat')
    stats_table = ur.find('table', 'profile-stats')
    stat_dict = {}
    # Each <td class="stat"> holds "<number> <label>"; normalize and index by label.
    # (loop variable renamed: it previously shadowed this function's name)
    for cell in stats_table.find_all('td', 'stat'):
        statnum, statlabel = cell.text.replace('\n', '').replace(',', '').split(' ')[:2]
        stat_dict[statlabel.lower()] = int(statnum.replace(',', ''))
    # BUG FIX: a missing label raises KeyError, not AttributeError, so the
    # original `except AttributeError` never fired; use a defaulted lookup.
    return stat_dict.get(_type, 0)
def media(ur):
    """Return the media count shown in the photo rail, or 0 when absent.

    Delegates string-to-int conversion to :func:`convertToInt`.
    """
    logme.debug(__name__ + ':media')
    try:
        media_count = ur.find("a", "PhotoRail-headingWithCount js-nav").text.strip().split(" ")[0]
        return convertToInt(media_count)
    except Exception:  # narrowed from a bare except; missing rail => 0
        return 0
def verified(ur):
    """Return 1 when the page carries the "Verified Account" badge, else 0."""
    logme.debug(__name__ + ':verified')
    try:
        is_verified = ur.find("img", {"alt": "Verified Account"})['alt']
        is_verified = 1 if "Verified Account" in is_verified else 0
    except Exception:  # narrowed from a bare except; missing badge => not verified
        is_verified = 0
    return is_verified
# ur object must be a json from the endpoint https://api.twitter.com/graphql
def User(ur):
    """Assemble a ``user`` record from a scraped HTML profile page.

    Legacy scraper path: delegates field extraction to the module helpers
    (inf/card/join/stat/verified).

    :param ur: soup-like parsed profile page -- assumed BeautifulSoup; TODO confirm
    :return: ``user`` instance with profile attributes filled in
    """
    logme.debug(__name__ + ':User')
    u = user()
    # Inline emoji <img> tags with their alt text so .text extraction keeps them.
    for img in ur.findAll("img", "Emoji Emoji--forText"):
        img.replaceWith(img["alt"])
    u.id = inf(ur, "id")
    u.name = inf(ur, "name")
    u.username = inf(ur, "username")
    u.bio = card(ur, "bio")
    u.location = card(ur, "location")
    u.url = card(ur, "url")
    # join(ur) returns a two-element list; [1] is used as the date, [0] as the time.
    u.join_date = join(ur)[1]
    u.join_time = join(ur)[0]
    u.tweets = stat(ur, "tweets")
    u.following = stat(ur, "following")
    u.followers = stat(ur, "followers")
    # Deliberately left blank (helpers kept for reference):
    u.likes = ""  # stat(ur, "favorites")
    u.media_count = ""  # media(ur)
    u.is_private = inf(ur, "private")
    u.is_verified = verified(ur)
    u.avatar = ur.find("img", {"alt": u.name})["src"]
    # u.background_image = ur.find('div',{'class':'ProfileCanopy-headerBg'}).find('img').get('src')
    return u
def User(ur):
    """Parse a GraphQL user JSON payload into a user record object.

    :param ur: dict decoded from the https://api.twitter.com/graphql user endpoint
    :return: record object with id, name, username, counters, flags and images
    :raises KeyError: when the payload does not contain the expected 'data'/'user' keys
    """
    logme.debug(__name__ + ':User')
    # BUG FIX: the guard used `and`, which skipped the check whenever 'data'
    # was present and crashed with a raw KeyError when it was not; `or` is
    # the intended validation.
    if 'data' not in ur or 'user' not in ur['data']:
        msg = 'malformed json! cannot be parsed to get user data'
        logme.fatal(msg)
        raise KeyError(msg)
    # NOTE(review): `User()` resolves at call time to the module-level name,
    # which this function itself shadows if the record class is also named
    # `User` -- confirm the record class name used by this module.
    _usr = User()
    _user = ur['data']['user']
    # 'rest_id' is the account's numeric id; the profile fields live under
    # 'legacy'. (The original indexed legacy *through* rest_id, but rest_id
    # is a string and cannot be subscripted with 'legacy'.)
    _usr.id = _user['rest_id']
    _legacy = _user['legacy']
    _usr.name = _legacy['name']
    _usr.username = _legacy['screen_name']
    _usr.bio = _legacy['description']
    _usr.location = _legacy['location']
    _usr.url = _legacy['url']
    # parsing date to user-friendly format
    # created_at arrives like "Sat May 10 08:30:52 +0000 2014"
    _dt = datetime.datetime.strptime(_legacy['created_at'], '%a %b %d %H:%M:%S %z %Y')
    _usr.join_date = _dt.strftime('%d-%m-%Y')
    _usr.join_time = _dt.strftime('%H:%M:%S %Z')
    # counters -- :type `int`
    _usr.tweets = int(_legacy['statuses_count'])
    _usr.following = int(_legacy['friends_count'])
    _usr.followers = int(_legacy['followers_count'])
    _usr.likes = int(_legacy['favourites_count'])
    _usr.media_count = int(_legacy['media_count'])
    _usr.is_private = _legacy.get('protected', False)
    _usr.is_verified = _legacy['verified']
    _usr.avatar = _legacy['profile_image_url_https']
    # banner is presumably absent for accounts without one -- .get avoids a
    # KeyError there; TODO confirm against live payloads
    _usr.background_image = _legacy.get('profile_banner_url')
    # TODO : future implementation
    # legacy_extended_profile is also available in some cases which can be used to get DOB of user
    return _usr
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment