Commit ee71398d authored by AFPMedialab, committed by Francesco Poldi

Timestamp support & ES mapping add-ons (#537)

* Added time support for --until and --since

* Add urls & hashtags normalizer

Add url in the store model
Add hashtag analyzer that changes all hashtags to lowercase

* Remove WARNING:fake_useragent

Remove fake_useragent warning
WARNING:fake_useragent:Error occurred during loading data. Trying to use cache server https://fake-useragent.herokuapp.com/browsers/0.1.11

* Added timedelta function

* Update version

* Update setup.py

* Update README
parent 7168724d
......@@ -67,7 +67,8 @@ A few simple examples to help you understand the basics:
- `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_.
- `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets.
- `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014.
- `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20.
- `twint -u username --since "2015-12-20 20:30:15"` - Collect Tweets that were tweeted since 2015-12-20 20:30:15.
- `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20 00:00:00.
- `twint -u username -o file.txt` - Scrape Tweets and save to file.txt.
- `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file.
- `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses.
......@@ -179,4 +180,4 @@ Twitter can shadow-ban accounts, which means that their tweets will not be avail
## Contact
If you have any questions, want to discuss, or need extra help, you are welcome to join our Twint focused group at [OSINT Team](https://osint.team) (there is a specific Twint channel)
If you have any questions, want to join in discussions, or need extra help, you are welcome to join our Twint focused channel at [OSINT team](https://osint.team)
#!/usr/bin/python3
from setuptools import setup
import io
import os
import sys
# Package meta-data
NAME = 'twint'
DESCRIPTION = 'An advanced Twitter scraping & OSINT tool.'
URL = 'https://github.com/twintproject/twint'
EMAIL = 'codyzacharias@pm.me'
AUTHOR = 'Cody Zacharias'
REQUIRES_PYTHON = '>=3.6.0'
VERSION = None
# Packages required
REQUIRED = [
'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet',
'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks',
'schedule', 'geopy', 'fake-useragent'
]
# Directory containing this script; the paths below are built from it.
here = os.path.abspath(os.path.dirname(__file__))

# The README becomes the long description, prefixed with a newline.
with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = '\n' + f.read()

# Load the package's __version__.py
about = {}
if VERSION:
    # An explicit VERSION set above takes precedence over the package file.
    about['__version__'] = VERSION
else:
    # Execute <NAME>/__version__.py so the names it defines land in `about`.
    with open(os.path.join(here, NAME, '__version__.py')) as f:
        exec(f.read(), about)
setup(
name=NAME,
version=about['__version__'],
......
......@@ -65,7 +65,7 @@ def main():
for run in runs[2:]:
if run == twint.run.Search:
c.Since = "2012-1-1"
c.Since = "2012-1-1 20:30:22"
c.Until = "2017-1-1"
else:
c.Since = ""
......
# Version as a tuple; __version__ is the same value as a dotted string.
VERSION = (2, 1, 3)
__version__ = '.'.join(map(str, VERSION))
......@@ -11,6 +11,7 @@ Copyright (c) 2018 The Twint Project
import sys
import os
import argparse
from datetime import timedelta
from . import run
from . import config
......@@ -61,6 +62,13 @@ def loadUserList(ul, _type):
return un[15:]
return userlist
def getTimeDelta(arg):
    """Convert a day count into a ``timedelta``.

    Args:
        arg: Number of days, in any form ``int()`` accepts (for example the
            raw ``--timedelta`` command-line string). A falsy value means
            that no interval was given.

    Returns:
        ``timedelta(days=int(arg))``, or ``None`` when ``arg`` is falsy.

    Raises:
        ValueError: If ``arg`` is truthy but cannot be converted by ``int()``.
    """
    return timedelta(days=int(arg)) if arg else None
def initialize(args):
""" Set default values for config from args
"""
......@@ -74,7 +82,7 @@ def initialize(args):
c.Lang = args.lang
c.Output = args.output
c.Elasticsearch = args.elasticsearch
c.Timedelta = args.timedelta
c.Timedelta = getTimeDelta(args.timedelta)
c.Year = args.year
c.Since = args.since
c.Until = args.until
......@@ -141,9 +149,9 @@ def options():
ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch.")
ap.add_argument("-t", "--timedelta", help="Time interval for every request.")
ap.add_argument("--year", help="Filter Tweets before specified year.")
ap.add_argument("--since", help="Filter Tweets sent since date (Example: 2017-12-27).",
ap.add_argument("--since", help="Filter Tweets sent since date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
metavar="DATE")
ap.add_argument("--until", help="Filter Tweets sent until date (Example: 2017-12-27).",
ap.add_argument("--until", help="Filter Tweets sent until date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
metavar="DATE")
ap.add_argument("--email", help="Filter Tweets that might have email addresses", action="store_true")
ap.add_argument("--phone", help="Filter Tweets that might have phone numbers", action="store_true")
......
......@@ -2,25 +2,38 @@ import datetime
import logging as logme
class Datelock:
    """Holder for a since/until window.

    All three attributes default to ``None`` at class level; ``Set`` creates
    an instance and assigns the actual values.
    """

    # _until / _since: window bounds.
    # _since_def_user: set by Set() to True when a Since value was supplied,
    # False when the built-in default start date was used.
    _until = _since = _since_def_user = None
def convertToDateTime(string):
    """Normalise a date or date-time string to ``<date> <time>`` layout.

    The input is split on whitespace and classified by token count only; the
    individual fields are not validated here.

    Args:
        string: Either ``"YYYY-MM-DD"`` or ``"YYYY-MM-DD HH:MM:SS"``.
            Surrounding or repeated whitespace is tolerated.

    Returns:
        * two tokens  -> ``"<date> <time>"`` joined by a single space,
        * one token   -> ``"<date> 00:00:00"`` (midnight is assumed),
        * anything else (empty, or more than two tokens) -> ``""``.
    """
    parts = string.split()
    if len(parts) == 2:
        # Rebuild from the tokens instead of returning the raw input, so stray
        # leading/trailing whitespace cannot break a later
        # strptime(..., "%Y-%m-%d %H:%M:%S").
        return " ".join(parts)
    if len(parts) == 1:
        return parts[0] + " 00:00:00"
    return ""
def Set(Until, Since):
    """Build a ``Datelock`` describing the requested time window.

    Args:
        Until: Upper bound as ``"YYYY-MM-DD"`` or ``"YYYY-MM-DD HH:MM:SS"``.
            A falsy value means "now" (``datetime.datetime.today()``).
        Since: Lower bound in the same formats. A falsy value selects the
            built-in default of ``2006-03-21 00:00:00``.

    Returns:
        A ``Datelock`` whose ``_until`` and ``_since`` are ``datetime``
        objects and whose ``_since_def_user`` tells whether ``Since`` was
        supplied by the caller.

    Raises:
        ValueError: If a supplied bound does not parse as
            ``%Y-%m-%d %H:%M:%S`` after ``convertToDateTime``.
    """
    logme.debug(__name__+':Set')
    d = Datelock()

    # Only the date-time format is parsed. The earlier date-only parsing
    # ("%Y-%m-%d") raised ValueError for inputs carrying a time of day and
    # was overwritten immediately anyway.
    if Until:
        d._until = datetime.datetime.strptime(convertToDateTime(Until), "%Y-%m-%d %H:%M:%S")
    else:
        d._until = datetime.datetime.today()

    if Since:
        d._since = datetime.datetime.strptime(convertToDateTime(Since), "%Y-%m-%d %H:%M:%S")
        d._since_def_user = True
    else:
        d._since = datetime.datetime.strptime("2006-03-21 00:00:00", "%Y-%m-%d %H:%M:%S")
        d._since_def_user = False

    return d
......@@ -165,7 +165,7 @@ async def Response(session, url, params=[]):
async def RandomUserAgent():
    """Return a random User-Agent string.

    Asks ``UserAgent`` for a random value first; if that fails, falls back
    to a random entry of ``user_agent_list``.
    """
    logme.debug(__name__+':RandomUserAgent')
    try:
        # Construct the generator once, with SSL verification and the remote
        # cache server disabled. Building it a second time with default
        # settings would repeat the work for a value that is thrown away.
        ua = UserAgent(verify_ssl=False, use_cache_server=False)
        return ua.random
    except Exception:
        # Best-effort fallback. `Exception` rather than a bare `except`, so
        # KeyboardInterrupt/SystemExit are not swallowed.
        return random.choice(user_agent_list)
......
......@@ -21,14 +21,14 @@ def clean_follow_list():
global _follows_object
_follows_object = {}
def datecheck(datetimestamp, config):
    """Tell whether a tweet's timestamp passes the configured lower bound.

    Args:
        datetimestamp: Tweet time as ``"YYYY-MM-DD HH:MM:SS"``.
        config: Object exposing ``Since`` and ``Until``; ``Since`` must use
            the same ``"YYYY-MM-DD HH:MM:SS"`` format when set.

    Returns:
        ``False`` only when both ``config.Since`` and ``config.Until`` are
        set and ``datetimestamp`` is earlier than ``config.Since``;
        ``True`` otherwise.

    Raises:
        ValueError: If a compared value does not match the expected format.
    """
    logme.debug(__name__+':datecheck')
    if config.Since and config.Until:
        logme.debug(__name__+':datecheck:dateRangeTrue')
        # Compare the parsed datetimes directly. Going through
        # strftime('%s') relies on a platform-specific directive, and the
        # older "strip the dashes and int()" form cannot parse timestamps.
        d = datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S")
        s = datetime.strptime(config.Since, "%Y-%m-%d %H:%M:%S")
        if d < s:
            return False
    logme.debug(__name__+':datecheck:dateRangeFalse')
    return True
......@@ -104,7 +104,7 @@ async def checkData(tweet, config, conn):
print("[x] Hidden tweet found, account suspended due to violation of TOS")
return
if datecheck(tweet.datestamp, config):
if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
output = format.Tweet(config, tweet)
if config.Database:
......
......@@ -36,12 +36,12 @@ class Twint:
storage.panda.clean()
if not self.config.Timedelta:
if (self.d._until - self.d._since).days > 30:
self.config.Timedelta = 30
if (self.d._until - self.d._since) > timedelta(days=30):
self.config.Timedelta = timedelta(days=30)
logme.debug(__name__+':Twint:__init__:timedelta_fixed')
else:
logme.debug(__name__+':Twint:__init__:timedelta_unfixed')
self.config.Timedelta = (self.d._until - self.d._since).days
self.config.Timedelta = (self.d._until - self.d._since)
def get_resume(self, resumeFile):
if not os.path.exists(resumeFile):
......@@ -166,7 +166,7 @@ class Twint:
if self.config.TwitterSearch and self.config.Since and self.config.Until:
logme.debug(__name__+':Twint:main:search+since+until')
_days = timedelta(days=int(self.config.Timedelta))
_days = self.config.Timedelta
while self.d._since < self.d._until:
self.config.Since = str(self.d._until - _days)
self.config.Until = str(self.d._until)
......
......@@ -64,7 +64,7 @@ def createIndex(config, instance, **scope):
"place": {"type": "keyword"},
"location": {"type": "keyword"},
"tweet": {"type": "text"},
"hashtags": {"type": "keyword"},
"hashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
"cashtags": {"type": "keyword"},
"user_id_str": {"type": "keyword"},
"username": {"type": "keyword"},
......@@ -98,10 +98,20 @@ def createIndex(config, instance, **scope):
}
},
"retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
"urls": {"type": "keyword"}
}
},
"settings": {
"number_of_shards": 1
"number_of_shards": 1,
"analysis": {
"normalizer": {
"hashtag_normalizer": {
"type": "custom",
"char_filter": [],
"filter": ["lowercase", "asciifolding"]
}
}
}
}
}
with nostdout():
......@@ -246,6 +256,11 @@ def Tweet(Tweet, config):
for mention in Tweet.mentions:
_mentions.append(mention)
j_data["_source"].update({"mentions": _mentions})
if Tweet.urls:
_urls = []
for url in Tweet.urls:
_urls.append(url)
j_data["_source"].update({"urls": _urls})
if config.Near or config.Geo:
if not _is_near_def:
__geo = ""
......
import logging as logme
import datetime
mobile = "http://mobile.twitter.com"
base = "http://twitter.com/i"
......@@ -85,9 +86,9 @@ async def Search(config, init):
if config.Year:
q += f" until:{config.Year}-1-1"
if config.Since:
q += f" since:{config.Since}"
q += " since:" + datetime.datetime.strptime(config.Since, "%Y-%m-%d %H:%M:%S").strftime('%s')
if config.Until:
q += f" until:{config.Until}"
q += " until:" + datetime.datetime.strptime(config.Until, "%Y-%m-%d %H:%M:%S").strftime('%s')
if config.Email:
q += ' "mail" OR "email" OR'
q += ' "gmail" OR "e-mail"'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment