Commit 65239911 authored by Francesco Poldi

New features and new index

parent 1fe95cca
......@@ -15,6 +15,7 @@ Copyright (c) 2018 Cody Zacharias
import argparse
import twint
import sys
import os
def error(error, message):
print("[-] {}: {}".format(error, message))
......@@ -45,7 +46,7 @@ def check(args):
elif args.proxy_type.lower() == "http":
_type = socks.HTTP
else:
error("Error", "Proxy type allower are: socks5, socks4 and http.")
error("Error", "Proxy type allowed are: socks5, socks4 and http.")
import socks, socket
socks.set_default_proxy(_type, args.proxy_host, int(args.proxy_port))
socket.socket = socks.socksocket
......@@ -55,7 +56,15 @@ def check(args):
if args.proxy_port or args.proxy_type:
error("Error", "Please specify --proxy-host, --proxy-port and --proxy-type")
def loadUserList(ul):
    """Build a multi-user search fragment from a list of usernames.

    Args:
        ul: Either a comma-separated string of usernames, or the path of a
            text file holding one username per line. A string naming an
            existing file is read as a file; any other string is split on
            commas. Non-string values are handed to ``open()`` as a path.

    Returns:
        The usernames joined by ``%20OR%20from%3A`` (the URL-encoded form of
        `` OR from:``), with no leading separator. Returns ``""`` when no
        usernames are found.

    Raises:
        OSError: if ``ul`` is treated as a path and the file cannot be opened.
    """
    if isinstance(ul, str) and not os.path.isfile(ul):
        candidates = ul.split(",")
    else:
        # Read every line and close the handle; readline() would yield a
        # single string that the join below would walk character by character.
        with open(ul, "r") as handle:
            candidates = handle.read().splitlines()

    # Blank entries (trailing comma, empty line) would otherwise produce a
    # dangling "from:" clause in the result.
    users = [name.strip() for name in candidates if name.strip()]
    return "%20OR%20from%3A".join(users)
def initialize(args):
......@@ -90,6 +99,8 @@ def initialize(args):
c.Proxy_type = args.proxy_type
c.Proxy_host = args.proxy_host
c.Proxy_port = args.proxy_port
c.Essid = args.essid
c.Userlist = args.userlist
return c
def options():
......@@ -125,14 +136,21 @@ def options():
ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true")
ap.add_argument("--debug", help="Debug mode", action="store_true")
ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
ap.add_argument("--proxy-host", help="Proxy hostname or IP")
ap.add_argument("--proxy-port", help="The port of the proxy server")
ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
ap.add_argument("--proxy-port", help="The port of the proxy server.")
ap.add_argument("--essid", help="Elasticsearch Session ID, use this to differentiate scraping sessions.")
ap.add_argument("--userlist", help="Userlist from list or file.")
args = ap.parse_args()
return args
def main():
args = options()
check(args)
if args.userlist:
args.username = loadUserList(args.userlist)
print(args.username)
c = initialize(args)
if args.favorites:
......
PUT twint
PUT twint2
{
"mappings" : {
"items": {
......@@ -16,7 +16,8 @@ PUT twint
"username": {"type": "keyword"},
"day": {"type": "keyword"},
"hour": {"type": "keyword"},
"link": {"type": "text"}
"link": {"type": "text"},
"essid": {"type": "keyword"}
}
}
}
......
......@@ -31,3 +31,5 @@ class Config:
Proxy_type = None
Proxy_host = None
Proxy_port = None
Essid = None
Userlist = None
......@@ -2,9 +2,11 @@ from elasticsearch import Elasticsearch, helpers
import contextlib
import datetime
import time
import sys
class RecycleObject(object):
    """Stream-like object whose ``write`` and ``flush`` do nothing.

    NOTE(review): presumably swapped in for ``sys.stdout`` by ``nostdout`` to
    silence output; confirm against that helper.
    """

    def write(self, junk):
        """Accept ``junk`` and discard it."""
        return None

    def flush(self):
        """No-op, provided so callers can flush this object like a stream."""
        return None
@contextlib.contextmanager
def nostdout():
......@@ -26,7 +28,7 @@ def weekday(day):
return weekdays[day]
def Elastic(Tweet, config):
def Tweet(Tweet, es, session):
# Todo play around with this some more
day = weekday(Tweet.date.strftime("%A"))
......@@ -40,7 +42,7 @@ def Elastic(Tweet, config):
j_data = {
"_index": "twint",
"_type": "items",
"_id": Tweet.id + "_raw",
"_id": Tweet.id + "_raw_" + session,
"_source": {
"id": Tweet.id,
"date": dt,
......@@ -52,7 +54,8 @@ def Elastic(Tweet, config):
"username": Tweet.username,
"day": day,
"hour": Tweet.time.strftime("%H"),
"link": Tweet.link
"link": Tweet.link,
"essid": session
}
}
......@@ -62,7 +65,7 @@ def Elastic(Tweet, config):
j_data = {
"_index": "twint",
"_type": "items",
"_id": Tweet.id + "_likes_" + str(nLikes),
"_id": Tweet.id + "_likes_" + str(nLikes) + "_" + session,
"_source": {
"id": Tweet.id,
"date": dt,
......@@ -75,7 +78,8 @@ def Elastic(Tweet, config):
"username": Tweet.username,
"day": day,
"hour": Tweet.time.strftime("%H"),
"link": Tweet.link
"link": Tweet.link,
"essid": session
}
}
......@@ -86,7 +90,7 @@ def Elastic(Tweet, config):
j_data = {
"_index": "twint",
"_type": "items",
"_id": Tweet.id + "_replies_" + str(nReplies),
"_id": Tweet.id + "_replies_" + str(nReplies) + "_" + session,
"_source": {
"id": Tweet.id,
"date": dt,
......@@ -99,7 +103,8 @@ def Elastic(Tweet, config):
"username": Tweet.username,
"day": day,
"hour": Tweet.time.strftime("%H"),
"link": Tweet.link
"link": Tweet.link,
"essid": session
}
}
......@@ -110,7 +115,7 @@ def Elastic(Tweet, config):
j_data = {
"_index": "twint",
"_type": "items",
"_id": Tweet.id + "_retweets_" + str(nRetweets),
"_id": Tweet.id + "_retweets_" + str(nRetweets) + "_" + session,
"_source": {
"id": Tweet.id,
"date": dt,
......@@ -123,14 +128,36 @@ def Elastic(Tweet, config):
"username": Tweet.username,
"day": day,
"hour": Tweet.time.strftime("%H"),
"link": Tweet.link
"link": Tweet.link,
"essid": session
}
}
actions.append(j_data)
nRetweets += 1
es = Elasticsearch(config.Elasticsearch)
es = Elasticsearch(es)
with nostdout():
helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
actions = []
def Follow(es, user, follow, session):
    """Index a single "user follows follow" relation into Elasticsearch.

    Args:
        es: Connection argument forwarded unchanged to ``Elasticsearch(...)``.
        user: Name stored in the document's ``user`` field.
        follow: Name stored in the document's ``follow`` field.
        session: Session id stored in the ``essid`` field and appended to the
            document id. May be ``None`` (the config default when no session
            id is supplied).

    Returns:
        None. The document is sent to the ``twintgraph2`` index through
        ``helpers.bulk``.
    """
    # str() keeps the id buildable when session is None; plain concatenation
    # raised TypeError in that case. Ids for string sessions are unchanged.
    doc_id = "_".join((user, follow, str(session)))
    action = {
        "_index": "twintgraph2",
        "_type": "items",
        "_id": doc_id,
        "_source": {
            "user": user,
            "follow": follow,
            "essid": session
        }
    }

    # NOTE(review): a new client is built on every call; reusing one would
    # need a signature change, so it is left as is.
    client = Elasticsearch(es)
    # nostdout() is this module's context manager; presumably it silences
    # stdout while the bulk request runs (confirm against its definition).
    with nostdout():
        helpers.bulk(client, [action], chunk_size=2000, request_timeout=200)
\ No newline at end of file
from . import feed, get, db, output
from . import feed, get, db, output, elasticsearch
from bs4 import BeautifulSoup
import aiohttp
import asyncio
......@@ -45,6 +45,9 @@ class Followers:
if self.config.Output != None:
output.write(User.name, self.config.Output)
if self.config.Elasticsearch:
elasticsearch.Follow(self.config.Elasticsearch, User.name, self.config.Username, self.config.Essid)
self.count += 1
print(User.name)
......
from . import feed, get, db, output
from . import feed, get, db, output, elasticsearch
from bs4 import BeautifulSoup
import aiohttp
import asyncio
......@@ -45,6 +45,9 @@ class Following:
if self.config.Output != None:
output.write(User.name, self.config.Output)
if self.config.Elasticsearch:
elasticsearch.Follow(self.config.Elasticsearch, self.config.Username, User.name, self.config.Essid)
self.count += 1
print(User.name)
......
......@@ -165,7 +165,7 @@ async def Tweets(tw, location, config, conn):
if config.Database:
db.tweets(conn, Tweet)
if config.Elasticsearch:
elasticsearch.Elastic(Tweet, config)
elasticsearch.Tweet(Tweet, config.Elasticsearch, config.Essid)
if config.Users_only:
output = Tweet.username
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment