Commit 8ef77c74 authored by Francesco Poldi's avatar Francesco Poldi

Merge branch 'master' of github.com:twintproject/twint

parents 1a4d4a67 681e5b99
...@@ -2,14 +2,17 @@ ...@@ -2,14 +2,17 @@
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
*$py.class *$py.class
tweets.db
# C extensions # C extensions
*.so *.so
config.ini
twint/storage/mysql.py
# Node Dependency directories # Node Dependency directories
node_modules/ node_modules/
jspm_packages/ jspm_packages/
tests/
# Distribution / packaging # Distribution / packaging
.Python .Python
env/ env/
......
...@@ -34,7 +34,8 @@ Twitter limits scrolls while browsing the user timeline. This means that with `. ...@@ -34,7 +34,8 @@ Twitter limits scrolls while browsing the user timeline. This means that with `.
- aiohttp_socks; - aiohttp_socks;
- schedule; - schedule;
- geopy; - geopy;
- fake-useragent. - fake-useragent;
- py-googletransx.
## Installing ## Installing
...@@ -172,9 +173,34 @@ Twitter can shadow-ban accounts, which means that their tweets will not be avail ...@@ -172,9 +173,34 @@ Twitter can shadow-ban accounts, which means that their tweets will not be avail
`twint --userlist inputlist --user-full` `twint --userlist inputlist --user-full`
#### tweet translation (experimental)
> To get 100 English tweets and translate them to Italian
`twint -u noneprivacy --csv --output none.csv --lang en --translate --translate-dest it --limit 100`
or
```python
import twint
c = twint.Config()
c.Username = "noneprivacy"
c.Limit = 100
c.Store_csv = True
c.Output = "none.csv"
c.Lang = "en"
c.Translate = True
c.TranslateDest = "it"
twint.run.Search(c)
```
Notes:
- [Google Translate has some quotas](https://cloud.google.com/translate/quotas)
## Featured Blog Posts: ## Featured Blog Posts:
- [How to use Twint as an OSINT tool](https://pielco11.ovh/posts/twint-osint/) - [How to use Twint as an OSINT tool](https://pielco11.ovh/posts/twint-osint/)
- [Hunting Threats On Twitter](https://www.trendmicro.com/vinfo/us/security/news/cybercrime-and-digital-threats/hunting-threats-on-twitter)
- [Basic tutorial made by Null Byte](https://null-byte.wonderhowto.com/how-to/mine-twitter-for-targeted-information-with-twint-0193853/) - [Basic tutorial made by Null Byte](https://null-byte.wonderhowto.com/how-to/mine-twitter-for-targeted-information-with-twint-0193853/)
- [Analyzing Tweets with NLP in minutes with Spark, Optimus and Twint](https://towardsdatascience.com/analyzing-tweets-with-nlp-in-minutes-with-spark-optimus-and-twint-a0c96084995f) - [Analyzing Tweets with NLP in minutes with Spark, Optimus and Twint](https://towardsdatascience.com/analyzing-tweets-with-nlp-in-minutes-with-spark-optimus-and-twint-a0c96084995f)
- [Loading tweets into Kafka and Neo4j](https://markhneedham.com/blog/2019/05/29/loading-tweets-twint-kafka-neo4j/) - [Loading tweets into Kafka and Neo4j](https://markhneedham.com/blog/2019/05/29/loading-tweets-twint-kafka-neo4j/)
......
...@@ -33,7 +33,10 @@ PUT twinttweets ...@@ -33,7 +33,10 @@ PUT twinttweets
"geo_near": {"type": "geo_point"}, "geo_near": {"type": "geo_point"},
"geo_tweet": {"type": "geo_point"}, "geo_tweet": {"type": "geo_point"},
"photos": {"type": "text"}, "photos": {"type": "text"},
"mentions": {"type": "text"} "mentions": {"type": "text"},
"translation": {"type": "text"},
"trans_src": {"type": "keyword"},
"trans_dev": {"type": "keyword"},
} }
} }
} }
......
...@@ -8,4 +8,5 @@ pandas>=0.23.0 ...@@ -8,4 +8,5 @@ pandas>=0.23.0
aiohttp_socks aiohttp_socks
schedule schedule
geopy geopy
fake-useragent fake-useragent
\ No newline at end of file googletransx
...@@ -17,7 +17,7 @@ VERSION = None ...@@ -17,7 +17,7 @@ VERSION = None
REQUIRED = [ REQUIRED = [
'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet', 'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet',
'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks', 'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks',
'schedule', 'geopy', 'fake-useragent' 'schedule', 'geopy', 'fake-useragent', 'googletransx'
] ]
here = os.path.abspath(os.path.dirname(__file__)) here = os.path.abspath(os.path.dirname(__file__))
...@@ -50,6 +50,9 @@ setup( ...@@ -50,6 +50,9 @@ setup(
], ],
}, },
install_requires=REQUIRED, install_requires=REQUIRED,
dependency_links=[
'git+https://github.com/x0rzkov/py-googletrans#egg=googletrans'
],
license='MIT', license='MIT',
classifiers=[ classifiers=[
'License :: OSI Approved :: MIT License', 'License :: OSI Approved :: MIT License',
......
VERSION = (2, 1, 7) VERSION = (2, 1, 8)
__version__ = '.'.join(map(str, VERSION)) __version__ = '.'.join(map(str, VERSION))
...@@ -123,6 +123,8 @@ def initialize(args): ...@@ -123,6 +123,8 @@ def initialize(args):
c.Source = args.source c.Source = args.source
c.Members_list = args.members_list c.Members_list = args.members_list
c.Filter_retweets = args.filter_retweets c.Filter_retweets = args.filter_retweets
c.Translate = args.translate
c.TranslateDest = args.translate_dest
return c return c
def options(): def options():
...@@ -181,11 +183,14 @@ def options(): ...@@ -181,11 +183,14 @@ def options():
ap.add_argument("--profile-full", ap.add_argument("--profile-full",
help="Slow, but effective method of collecting a user's Tweets and RT.", help="Slow, but effective method of collecting a user's Tweets and RT.",
action="store_true") action="store_true")
ap.add_argument("--translate",
help="Get tweets translated by Google Translate.",
action="store_true")
ap.add_argument("--translate-dest", help="Translate tweet to language (ISO2).",
default="en")
ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.") ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
ap.add_argument("--pandas-type", ap.add_argument("--pandas-type",
help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5") help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
ap.add_argument("--search_name",
help="Name for identify the search like -3dprinter stuff- only for mysql")
ap.add_argument("-it", "--index-tweets", ap.add_argument("-it", "--index-tweets",
help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets") help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets")
ap.add_argument("-if", "--index-follow", ap.add_argument("-if", "--index-follow",
......
...@@ -42,7 +42,6 @@ class Config: ...@@ -42,7 +42,6 @@ class Config:
Index_tweets = "twinttweets" Index_tweets = "twinttweets"
Index_follow = "twintgraph" Index_follow = "twintgraph"
Index_users = "twintuser" Index_users = "twintuser"
Debug = False
Retries_count = 10 Retries_count = 10
Resume = None Resume = None
Images = False Images = False
...@@ -71,4 +70,7 @@ class Config: ...@@ -71,4 +70,7 @@ class Config:
Links = None Links = None
Source = None Source = None
Members_list = None Members_list = None
Filter_retweets = False Filter_retweets = False
\ No newline at end of file Translate = False
TranslateSrc = "en"
TranslateDest = "en"
\ No newline at end of file
...@@ -28,6 +28,9 @@ def Tweet(config, t): ...@@ -28,6 +28,9 @@ def Tweet(config, t):
output = output.replace("{near}", t.near) output = output.replace("{near}", t.near)
output = output.replace("{geo}", t.geo) output = output.replace("{geo}", t.geo)
output = output.replace("{mentions}", ",".join(t.mentions)) output = output.replace("{mentions}", ",".join(t.mentions))
output = output.replace("{translate}", t.translate)
output = output.replace("{trans_src}", t.trans_src)
output = output.replace("{trans_dest}", t.trans_dest)
else: else:
logme.debug(__name__+':Tweet:notFormat') logme.debug(__name__+':Tweet:notFormat')
output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} " output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} "
...@@ -45,7 +48,8 @@ def Tweet(config, t): ...@@ -45,7 +48,8 @@ def Tweet(config, t):
output += f" {cashtags}" output += f" {cashtags}"
if config.Stats: if config.Stats:
output += f" | {t.replies_count} replies {t.retweets_count} retweets {t.likes_count} likes" output += f" | {t.replies_count} replies {t.retweets_count} retweets {t.likes_count} likes"
if config.Translate:
output += f" {t.translate} {t.trans_src} {t.trans_dest}"
return output return output
def User(_format, u): def User(_format, u):
......
...@@ -148,7 +148,7 @@ async def Request(url, connector=None, params=[], headers=[]): ...@@ -148,7 +148,7 @@ async def Request(url, connector=None, params=[], headers=[]):
async def Response(session, url, params=[]): async def Response(session, url, params=[]):
logme.debug(__name__+':Response') logme.debug(__name__+':Response')
with timeout(100): with timeout(120):
async with session.get(url, ssl=True, params=params, proxy=httpproxy) as response: async with session.get(url, ssl=True, params=params, proxy=httpproxy) as response:
return await response.text() return await response.text()
......
...@@ -81,6 +81,9 @@ def init(db): ...@@ -81,6 +81,9 @@ def init(db):
near text, near text,
source text, source text,
time_update integer not null, time_update integer not null,
`translate` text default '',
trans_src text default '',
trans_dest text default '',
PRIMARY KEY (id) PRIMARY KEY (id)
); );
""" """
...@@ -265,8 +268,11 @@ def tweets(conn, Tweet, config): ...@@ -265,8 +268,11 @@ def tweets(conn, Tweet, config):
Tweet.geo, Tweet.geo,
Tweet.near, Tweet.near,
Tweet.source, Tweet.source,
time_ms) time_ms,
cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) Tweet.translate,
Tweet.trans_src,
Tweet.trans_dest)
cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
if config.Favorites: if config.Favorites:
query = 'INSERT INTO favorites VALUES(?,?)' query = 'INSERT INTO favorites VALUES(?,?)'
......
...@@ -98,7 +98,10 @@ def createIndex(config, instance, **scope): ...@@ -98,7 +98,10 @@ def createIndex(config, instance, **scope):
} }
}, },
"retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, "retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
"urls": {"type": "keyword"} "urls": {"type": "keyword"},
"translate": {"type": "text"},
"trans_src": {"type": "keyword"},
"trans_dest": {"type": "keyword"},
} }
}, },
"settings": { "settings": {
...@@ -278,6 +281,11 @@ def Tweet(Tweet, config): ...@@ -278,6 +281,11 @@ def Tweet(Tweet, config):
j_data["_source"].update({"geo_tweet": getLocation(Tweet.place)}) j_data["_source"].update({"geo_tweet": getLocation(Tweet.place)})
if Tweet.source: if Tweet.source:
j_data["_source"].update({"source": Tweet.Source}) j_data["_source"].update({"source": Tweet.Source})
if config.Translate:
j_data["_source"].update({"translate": Tweet.translate})
j_data["_source"].update({"trans_src": Tweet.trans_src})
j_data["_source"].update({"trans_dest": Tweet.trans_dest})
actions.append(j_data) actions.append(j_data)
es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs) es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
......
...@@ -96,7 +96,10 @@ def update(object, config): ...@@ -96,7 +96,10 @@ def update(object, config):
"user_rt": Tweet.user_rt, "user_rt": Tweet.user_rt,
"retweet_id": Tweet.retweet_id, "retweet_id": Tweet.retweet_id,
"reply_to": Tweet.reply_to, "reply_to": Tweet.reply_to,
"retweet_date": Tweet.retweet_date "retweet_date": Tweet.retweet_date,
"translate": Tweet.translate,
"trans_src": Tweet.trans_src,
"trans_dest": Tweet.trans_dest
} }
_object_blocks[_type].append(_data) _object_blocks[_type].append(_data)
elif _type == "user": elif _type == "user":
......
...@@ -30,7 +30,10 @@ def tweetData(t): ...@@ -30,7 +30,10 @@ def tweetData(t):
"user_rt": t.user_rt, "user_rt": t.user_rt,
"retweet_id": t.retweet_id, "retweet_id": t.retweet_id,
"reply_to": t.reply_to, "reply_to": t.reply_to,
"retweet_date": t.retweet_date "retweet_date": t.retweet_date,
"translate": t.translate,
"trans_src": t.trans_src,
"trans_dest": t.trans_dest
} }
return data return data
...@@ -66,7 +69,10 @@ def tweetFieldnames(): ...@@ -66,7 +69,10 @@ def tweetFieldnames():
"user_rt", "user_rt",
"retweet_id", "retweet_id",
"reply_to", "reply_to",
"retweet_date" "retweet_date",
"translate",
"trans_src",
"trans_dest"
] ]
return fieldnames return fieldnames
......
...@@ -3,6 +3,11 @@ from datetime import datetime ...@@ -3,6 +3,11 @@ from datetime import datetime
import json import json
import logging as logme import logging as logme
from googletransx import Translator
# ref.
# - https://github.com/x0rzkov/py-googletrans#basic-usage
translator = Translator()
class tweet: class tweet:
"""Define Tweet class """Define Tweet class
...@@ -104,4 +109,17 @@ def Tweet(tw, config): ...@@ -104,4 +109,17 @@ def Tweet(tw, config):
t.geo = config.Geo if config.Geo else "" t.geo = config.Geo if config.Geo else ""
t.source = config.Source if config.Source else "" t.source = config.Source if config.Source else ""
t.reply_to = [{'user_id': t['id_str'], 'username': t['screen_name']} for t in json.loads(tw["data-reply-to-users-json"])] t.reply_to = [{'user_id': t['id_str'], 'username': t['screen_name']} for t in json.loads(tw["data-reply-to-users-json"])]
t.translate = ''
t.trans_src = ''
t.trans_dest = ''
if config.Translate == True:
try:
ts = translator.translate(text=t.tweet, dest=config.TranslateDest)
t.translate = ts.text
t.trans_src = ts.src
t.trans_dest = ts.dest
# ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
except ValueError as e:
raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
logme.debug(__name__+':Tweet:translator.translate:'+str(e))
return t return t
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment