Commit ee71398d authored by AFPMedialab's avatar AFPMedialab Committed by Francesco Poldi

Timestamp support & ES mapping add-ons (#537)

* Added time support for --until and --since

* Add urls & hashtags normalizer

Add urls field in the store model
Add a hashtag normalizer that changes all hashtags to lowercase

* Remove WARNING:fake_useragent

Remove fake_useragent warning
WARNING:fake_useragent:Error occurred during loading data. Trying to use cache server https://fake-useragent.herokuapp.com/browsers/0.1.11

* Added timedelta function

* Update version

* Update setup.py

* Update README
parent 7168724d
...@@ -67,7 +67,8 @@ A few simple examples to help you understand the basics: ...@@ -67,7 +67,8 @@ A few simple examples to help you understand the basics:
- `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_. - `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_.
- `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets. - `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets.
- `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014. - `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014.
- `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20. - `twint -u username --since "2015-12-20 20:30:15"` - Collect Tweets that were tweeted since 2015-12-20 20:30:15.
- `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20 00:00:00.
- `twint -u username -o file.txt` - Scrape Tweets and save to file.txt. - `twint -u username -o file.txt` - Scrape Tweets and save to file.txt.
- `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file. - `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file.
- `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses. - `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses.
...@@ -179,4 +180,4 @@ Twitter can shadow-ban accounts, which means that their tweets will not be avail ...@@ -179,4 +180,4 @@ Twitter can shadow-ban accounts, which means that their tweets will not be avail
## Contact ## Contact
If you have any questions, want to discuss, or need extra help, you are welcome to join our Twint focused group at [OSINT Team](https://osint.team) (there is a specific Twint channel) If you have any question, want to join in discussions, or need extra help, you are welcome to join our Twint focused channel at [OSINT team](https://osint.team)
#!/usr/bin/python3
from setuptools import setup
import io
import os
import sys
# Package meta-data
NAME = 'twint'
DESCRIPTION = 'An advanced Twitter scraping & OSINT tool.'
URL = 'https://github.com/twintproject/twint'
EMAIL = 'codyzacharias@pm.me'
AUTHOR = 'Cody Zacharias'
REQUIRES_PYTHON = '>=3.6.0'
VERSION = None
# Packages required
REQUIRED = [
'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet',
'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks',
'schedule', 'geopy', 'fake-useragent'
]
here = os.path.abspath(os.path.dirname(__file__))
with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = '\n' + f.read()
# Load the package's __version__.py
about = {}
if not VERSION:
with open(os.path.join(here, NAME, '__version__.py')) as f:
exec(f.read(), about)
else:
about['__version__'] = VERSION
setup( setup(
name=NAME, name=NAME,
version=about['__version__'], version=about['__version__'],
......
...@@ -65,7 +65,7 @@ def main(): ...@@ -65,7 +65,7 @@ def main():
for run in runs[2:]: for run in runs[2:]:
if run == twint.run.Search: if run == twint.run.Search:
c.Since = "2012-1-1" c.Since = "2012-1-1 20:30:22"
c.Until = "2017-1-1" c.Until = "2017-1-1"
else: else:
c.Since = "" c.Since = ""
......
VERSION = (2, 1, 2) VERSION = (2, 1, 3)
__version__ = '.'.join(map(str, VERSION)) __version__ = '.'.join(map(str, VERSION))
...@@ -11,6 +11,7 @@ Copyright (c) 2018 The Twint Project ...@@ -11,6 +11,7 @@ Copyright (c) 2018 The Twint Project
import sys import sys
import os import os
import argparse import argparse
from datetime import timedelta
from . import run from . import run
from . import config from . import config
...@@ -61,6 +62,13 @@ def loadUserList(ul, _type): ...@@ -61,6 +62,13 @@ def loadUserList(ul, _type):
return un[15:] return un[15:]
return userlist return userlist
def getTimeDelta(arg):
    """Convert the --timedelta CLI value (a day count string) to a timedelta.

    Returns None when no value was supplied, so the caller can detect
    "not set" and choose its own default interval later.
    """
    if not arg:
        return None
    return timedelta(days=int(arg))
def initialize(args): def initialize(args):
""" Set default values for config from args """ Set default values for config from args
""" """
...@@ -74,7 +82,7 @@ def initialize(args): ...@@ -74,7 +82,7 @@ def initialize(args):
c.Lang = args.lang c.Lang = args.lang
c.Output = args.output c.Output = args.output
c.Elasticsearch = args.elasticsearch c.Elasticsearch = args.elasticsearch
c.Timedelta = args.timedelta c.Timedelta = getTimeDelta(args.timedelta)
c.Year = args.year c.Year = args.year
c.Since = args.since c.Since = args.since
c.Until = args.until c.Until = args.until
...@@ -141,9 +149,9 @@ def options(): ...@@ -141,9 +149,9 @@ def options():
ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch.") ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch.")
ap.add_argument("-t", "--timedelta", help="Time interval for every request.") ap.add_argument("-t", "--timedelta", help="Time interval for every request.")
ap.add_argument("--year", help="Filter Tweets before specified year.") ap.add_argument("--year", help="Filter Tweets before specified year.")
ap.add_argument("--since", help="Filter Tweets sent since date (Example: 2017-12-27).", ap.add_argument("--since", help="Filter Tweets sent since date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
metavar="DATE") metavar="DATE")
ap.add_argument("--until", help="Filter Tweets sent until date (Example: 2017-12-27).", ap.add_argument("--until", help="Filter Tweets sent until date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
metavar="DATE") metavar="DATE")
ap.add_argument("--email", help="Filter Tweets that might have email addresses", action="store_true") ap.add_argument("--email", help="Filter Tweets that might have email addresses", action="store_true")
ap.add_argument("--phone", help="Filter Tweets that might have phone numbers", action="store_true") ap.add_argument("--phone", help="Filter Tweets that might have phone numbers", action="store_true")
......
...@@ -2,25 +2,38 @@ import datetime ...@@ -2,25 +2,38 @@ import datetime
import logging as logme import logging as logme
class Datelock: class Datelock:
_until = None _until = None
_since = None _since = None
_since_def_user = None _since_def_user = None
def convertToDateTime(string):
    """Normalize a user-supplied date to the '%Y-%m-%d %H:%M:%S' shape.

    A bare date ("2017-12-27") is padded with midnight; a full datetime
    ("2017-12-27 20:30:15") passes through untouched.  Anything that is
    not one or two whitespace-separated tokens yields "" (which the
    downstream strptime call will then reject).
    """
    token_count = len(string.split())
    if token_count == 1:
        return string + " 00:00:00"
    return string if token_count == 2 else ""
def Set(Until, Since):
    """Build a Datelock holding the parsed --until/--since bounds.

    A missing --until defaults to "now"; a missing --since defaults to
    Twitter's launch date (2006-03-21).  _since_def_user records whether
    the user supplied --since explicitly.
    """
    logme.debug(__name__+':Set')
    lock = Datelock()
    fmt = "%Y-%m-%d %H:%M:%S"
    if Until:
        lock._until = datetime.datetime.strptime(convertToDateTime(Until), fmt)
    else:
        lock._until = datetime.datetime.today()
    if Since:
        lock._since = datetime.datetime.strptime(convertToDateTime(Since), fmt)
        lock._since_def_user = True
    else:
        lock._since = datetime.datetime.strptime("2006-03-21 00:00:00", fmt)
        lock._since_def_user = False
    return lock
...@@ -165,7 +165,7 @@ async def Response(session, url, params=[]): ...@@ -165,7 +165,7 @@ async def Response(session, url, params=[]):
async def RandomUserAgent():
    """Return a random User-Agent string.

    Prefers fake_useragent's database; verify_ssl/use_cache_server are
    disabled to silence its cache-server warning (see commit message).
    Falls back to the module's static user_agent_list on any failure.
    """
    logme.debug(__name__+':RandomUserAgent')
    try:
        ua = UserAgent(verify_ssl=False, use_cache_server=False)
        return ua.random
    # Was a bare `except:` — that also swallows KeyboardInterrupt and
    # SystemExit.  Exception still covers every failure mode of UserAgent.
    except Exception:
        return random.choice(user_agent_list)
......
...@@ -21,14 +21,14 @@ def clean_follow_list(): ...@@ -21,14 +21,14 @@ def clean_follow_list():
global _follows_object global _follows_object
_follows_object = {} _follows_object = {}
def datecheck(datetimestamp, config):
    """Return False when the tweet's datetime falls before config.Since.

    datetimestamp is expected in '%Y-%m-%d %H:%M:%S' form (the caller
    joins tweet.datestamp and tweet.timestamp).  The bound is only
    enforced when both config.Since and config.Until are set.

    Fix: the old code compared epoch seconds via strftime('%s'), a
    platform-specific glibc extension that is not a documented Python
    format code and breaks on Windows.  Comparing the parsed datetime
    objects directly is equivalent and portable.
    """
    logme.debug(__name__+':datecheck')
    if config.Since and config.Until:
        logme.debug(__name__+':datecheck:dateRangeTrue')
        tweet_dt = datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S")
        since_dt = datetime.strptime(config.Since, "%Y-%m-%d %H:%M:%S")
        if tweet_dt < since_dt:
            return False
    logme.debug(__name__+':datecheck:dateRangeFalse')
    return True
...@@ -104,7 +104,7 @@ async def checkData(tweet, config, conn): ...@@ -104,7 +104,7 @@ async def checkData(tweet, config, conn):
print("[x] Hidden tweet found, account suspended due to violation of TOS") print("[x] Hidden tweet found, account suspended due to violation of TOS")
return return
if datecheck(tweet.datestamp, config): if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
output = format.Tweet(config, tweet) output = format.Tweet(config, tweet)
if config.Database: if config.Database:
......
...@@ -36,12 +36,12 @@ class Twint: ...@@ -36,12 +36,12 @@ class Twint:
storage.panda.clean() storage.panda.clean()
if not self.config.Timedelta: if not self.config.Timedelta:
if (self.d._until - self.d._since).days > 30: if (self.d._until - self.d._since) > timedelta(days=30):
self.config.Timedelta = 30 self.config.Timedelta = timedelta(days=30)
logme.debug(__name__+':Twint:__init__:timedelta_fixed') logme.debug(__name__+':Twint:__init__:timedelta_fixed')
else: else:
logme.debug(__name__+':Twint:__init__:timedelta_unfixed') logme.debug(__name__+':Twint:__init__:timedelta_unfixed')
self.config.Timedelta = (self.d._until - self.d._since).days self.config.Timedelta = (self.d._until - self.d._since)
def get_resume(self, resumeFile): def get_resume(self, resumeFile):
if not os.path.exists(resumeFile): if not os.path.exists(resumeFile):
...@@ -166,7 +166,7 @@ class Twint: ...@@ -166,7 +166,7 @@ class Twint:
if self.config.TwitterSearch and self.config.Since and self.config.Until: if self.config.TwitterSearch and self.config.Since and self.config.Until:
logme.debug(__name__+':Twint:main:search+since+until') logme.debug(__name__+':Twint:main:search+since+until')
_days = timedelta(days=int(self.config.Timedelta)) _days = self.config.Timedelta
while self.d._since < self.d._until: while self.d._since < self.d._until:
self.config.Since = str(self.d._until - _days) self.config.Since = str(self.d._until - _days)
self.config.Until = str(self.d._until) self.config.Until = str(self.d._until)
......
...@@ -64,7 +64,7 @@ def createIndex(config, instance, **scope): ...@@ -64,7 +64,7 @@ def createIndex(config, instance, **scope):
"place": {"type": "keyword"}, "place": {"type": "keyword"},
"location": {"type": "keyword"}, "location": {"type": "keyword"},
"tweet": {"type": "text"}, "tweet": {"type": "text"},
"hashtags": {"type": "keyword"}, "hashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
"cashtags": {"type": "keyword"}, "cashtags": {"type": "keyword"},
"user_id_str": {"type": "keyword"}, "user_id_str": {"type": "keyword"},
"username": {"type": "keyword"}, "username": {"type": "keyword"},
...@@ -98,10 +98,20 @@ def createIndex(config, instance, **scope): ...@@ -98,10 +98,20 @@ def createIndex(config, instance, **scope):
} }
}, },
"retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, "retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
"urls": {"type": "keyword"}
} }
}, },
"settings": { "settings": {
"number_of_shards": 1 "number_of_shards": 1,
"analysis": {
"normalizer": {
"hashtag_normalizer": {
"type": "custom",
"char_filter": [],
"filter": ["lowercase", "asciifolding"]
}
}
}
} }
} }
with nostdout(): with nostdout():
...@@ -246,6 +256,11 @@ def Tweet(Tweet, config): ...@@ -246,6 +256,11 @@ def Tweet(Tweet, config):
for mention in Tweet.mentions: for mention in Tweet.mentions:
_mentions.append(mention) _mentions.append(mention)
j_data["_source"].update({"mentions": _mentions}) j_data["_source"].update({"mentions": _mentions})
if Tweet.urls:
_urls = []
for url in Tweet.urls:
_urls.append(url)
j_data["_source"].update({"urls": _urls})
if config.Near or config.Geo: if config.Near or config.Geo:
if not _is_near_def: if not _is_near_def:
__geo = "" __geo = ""
......
import logging as logme import logging as logme
import datetime
mobile = "http://mobile.twitter.com" mobile = "http://mobile.twitter.com"
base = "http://twitter.com/i" base = "http://twitter.com/i"
...@@ -85,9 +86,9 @@ async def Search(config, init): ...@@ -85,9 +86,9 @@ async def Search(config, init):
if config.Year: if config.Year:
q += f" until:{config.Year}-1-1" q += f" until:{config.Year}-1-1"
if config.Since: if config.Since:
q += f" since:{config.Since}" q += " since:" + datetime.datetime.strptime(config.Since, "%Y-%m-%d %H:%M:%S").strftime('%s')
if config.Until: if config.Until:
q += f" until:{config.Until}" q += " until:" + datetime.datetime.strptime(config.Until, "%Y-%m-%d %H:%M:%S").strftime('%s')
if config.Email: if config.Email:
q += ' "mail" OR "email" OR' q += ' "mail" OR "email" OR'
q += ' "gmail" OR "e-mail"' q += ' "gmail" OR "e-mail"'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment