Commit aaa37fb8 authored by Cody Zacharias's avatar Cody Zacharias Committed by GitHub

Make CSV Better

parent 79445867
...@@ -7,167 +7,187 @@ import asyncio ...@@ -7,167 +7,187 @@ import asyncio
import csv
import datetime
import json
import os
import re
import sys
import time
def write(entry, f):
    """Append ``entry`` as one line of text to the file at path ``f``.

    The file is opened in append mode with UTF-8 encoding and is closed
    before returning, so the handle is not leaked and the line is flushed.
    ``entry`` is rendered with ``print``, i.e. ``str(entry)`` plus a newline.
    """
    with open(f, "a", encoding="utf-8") as out_file:
        print(entry, file=out_file)
def writeCSV(Tweet, file):
    """Append one tweet as a row of the CSV file at path ``file``.

    A header row with the column names is written first whenever the file is
    new or empty. ``Tweet`` is any object exposing the attributes read below;
    ``Tweet.hashtags`` must be an iterable of strings and is stored as a
    single comma-joined cell, matching what ``writeJSON`` stores.
    """
    fieldnames = [
        "id",
        "date",
        "time",
        "timezone",
        "user_id",
        "username",
        "tweet",
        "replies",
        "retweets",
        "likes",
        "location",
        "hashtags",
        "link"]
    row = {
        "id": Tweet.id,
        "date": Tweet.datestamp,
        "time": Tweet.timestamp,
        "timezone": Tweet.timezone,
        "user_id": Tweet.user_id,
        "username": Tweet.username,
        "tweet": Tweet.tweet,
        "replies": Tweet.replies,
        "retweets": Tweet.retweets,
        "likes": Tweet.likes,
        "location": Tweet.location,
        "hashtags": ",".join(Tweet.hashtags),
        "link": Tweet.link
    }
    with open(file, "a", newline='', encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        # Append mode positions the stream at the end of the file, so a
        # position of 0 means the file was just created or is empty.
        if csv_file.tell() == 0:
            writer.writeheader()
        writer.writerow(row)
def writeJSON(Tweet, file):
    """Append one tweet to the file at path ``file`` as a single JSON line.

    Each call adds one JSON object followed by a newline, so the file holds
    one object per line. ``Tweet.hashtags`` must be an iterable of strings
    and is stored as one comma-joined string.
    """
    data = {
        "id": Tweet.id,
        "date": Tweet.datestamp,
        "time": Tweet.timestamp,
        "timezone": Tweet.timezone,
        "user_id": Tweet.user_id,
        "username": Tweet.username,
        "tweet": Tweet.tweet,
        "replies": Tweet.replies,
        "retweets": Tweet.retweets,
        "likes": Tweet.likes,
        "location": Tweet.location,
        "hashtags": ",".join(Tweet.hashtags),
        "link": Tweet.link}
    with open(file, "a", newline='', encoding="utf-8") as json_file:
        json.dump(data, json_file)
        json_file.write("\n")
def getDate(tweet):
    """Return the tweet's date parsed from its timestamp link.

    Reads the ``title`` of ``tweet.find("a", "tweet-timestamp")``, keeps the
    text after the last " - " and parses it with the format "%d %b %Y".
    ``tweet`` is presumably a BeautifulSoup element; confirm against callers.

    Returns a ``datetime.datetime``; ``strptime`` raises ``ValueError`` if
    the text does not match the format.
    """
    title = tweet.find("a", "tweet-timestamp")["title"]
    date_text = title.rpartition(" - ")[-1]
    return datetime.datetime.strptime(date_text, "%d %b %Y")
def getTime(tweet):
    """Return the tweet's time of day as a ``datetime.datetime``.

    Reads the integer ``data-time`` attribute of
    ``tweet.find("span", "_timestamp")`` as a number of seconds and keeps
    only the part within one day. The result carries the date 1900-01-01,
    the same placeholder date ``strptime`` yields for a time-only format.

    NOTE(review): ``data-time`` is presumably a Unix timestamp, which would
    make the returned time of day UTC; confirm.
    """
    total_seconds = int(tweet.find("span", "_timestamp")["data-time"])
    # Whole days are dropped arithmetically rather than by formatting a
    # timedelta as text and parsing the "H:MM:SS" tail back.
    seconds_into_day = total_seconds % 86400
    return datetime.datetime(1900, 1, 1) + datetime.timedelta(seconds=seconds_into_day)
def getText(tweet):
    """Return the tweet's text on one line with links set apart by a space.

    Takes the ``.text`` of ``tweet.find("p", "tweet-text")``, removes
    newlines, then inserts a space before every "http" and every
    "pic.twitter" so links do not run into the preceding word.
    """
    text = tweet.find("p", "tweet-text").text
    text = text.replace("\n", "")
    for marker in ("http", "pic.twitter"):
        text = text.replace(marker, " " + marker)
    return text
def getHashtags(text):
    """Return every hashtag in ``text`` as a list of strings.

    A hashtag is a "#" followed by one or more word characters. Matches are
    returned in order of appearance with the leading "#" kept; the list is
    empty when there are none.
    """
    # For str patterns \w is already Unicode-aware and the pattern has no
    # letters, so no regex flags are needed.
    return re.findall(r'#\w+', text)
def getStat(tweet, stat):
    """Return the ``data-tweet-stat-count`` value for one tweet statistic.

    ``stat`` is inserted into "ProfileTweet-action--{} u-hiddenVisually" to
    select the outer ``span``; the value is read from the first ``span``
    inside it. Callers in this file pass "reply", "retweet" and "favorite".
    The attribute value is returned as-is, without conversion.
    """
    selector = "ProfileTweet-action--{} u-hiddenVisually".format(stat)
    counter = tweet.find("span", selector).find("span")
    return counter["data-tweet-stat-count"]
def getMentions(tweet, text):
    """Return ``text`` with any missing "@mention" prepended.

    Mentioned names come from the space-separated ``data-mentions``
    attribute of ``tweet.find("div", "js-original-tweet")``. For each name
    whose "@name" does not already occur in ``text``, "@name " is put in
    front. If the element or the attribute is missing, ``text`` is returned
    unchanged.
    """
    try:
        mentions = tweet.find("div", "js-original-tweet")["data-mentions"].split(" ")
    except (AttributeError, KeyError, TypeError):
        # No such element (find returned None) or no data-mentions on it.
        return text
    for name in mentions:
        mention = "@{}".format(name)
        if mention not in text:
            text = "{} {}".format(mention, text)
    return text
# Sort HTML
def getTweet(tw, location, config):
    """Build a ``Tweet`` object from one scraped tweet element.

    ``tw`` is the element the fields are read from, ``location`` is stored
    unchanged as ``t.location``, and ``config`` supplies ``Since`` and
    ``Until``. Emoji images inside ``tw`` are replaced in place by
    "<aria-label>" text before the tweet text is read, so ``tw`` is mutated.

    When both ``config.Since`` and ``config.Until`` are set and the tweet is
    dated exactly one day before ``config.Since`` ("%Y-%m-%d"), the whole
    process exits via ``sys.exit(0)``.

    NOTE(review): ``Tweet``, ``strftime`` and ``gmtime`` are not defined in
    the visible part of the file; presumably declared/imported above.
    """
    t = Tweet()
    t.id = tw.find("div")["data-item-id"]
    t.date = getDate(tw)
    if config.Since and config.Until:
        since_date = datetime.datetime.strptime(config.Since, "%Y-%m-%d").date()
        if (t.date.date() - since_date).days == -1:
            # mitigation here, maybe find something better
            sys.exit(0)
    t.datestamp = t.date.strftime("%Y-%m-%d")
    t.time = getTime(tw)
    t.timestamp = t.time.strftime("%H:%M:%S")
    t.user_id = tw.find("a", "account-group js-account-group js-action-profile js-user-profile-link js-nav")["data-user-id"]
    t.username = tw.find("span", "username").text.replace("@", "")
    t.timezone = strftime("%Z", gmtime())
    # Turn emoji images into text so they show up in the tweet body.
    for img in tw.findAll("img", "Emoji Emoji--forText"):
        img.replaceWith("<{}>".format(img['aria-label']))
    t.tweet = getMentions(tw, getText(tw))
    t.location = location
    t.hashtags = getHashtags(t.tweet)
    t.replies = getStat(tw, "reply")
    t.retweets = getStat(tw, "retweet")
    t.likes = getStat(tw, "favorite")
    t.link = "https://twitter.com/{0.username}/status/{0.id}/".format(t)
    return t
async def getUser(user):
    """Return a ``User`` whose ``name`` is taken from the first ``a`` element.

    Reads the ``name`` attribute of ``user.find("a")``.

    NOTE(review): ``User`` is not defined in the visible part of the file;
    presumably declared above.
    """
    u = User()
    u.name = user.find("a")["name"]
    return u
def _render_output(tweet, config):
    """Return the one-line text form of ``tweet`` selected by ``config``."""
    if config.Users_only:
        return tweet.username
    if config.Tweets_only:
        return tweet.tweet
    if config.Format:
        # Placeholders are substituted one after another in this fixed
        # order, so a value that itself contains a later placeholder is
        # substituted again, exactly as with a chain of replace() calls.
        substitutions = (
            ("{id}", tweet.id),
            ("{date}", tweet.datestamp),
            ("{time}", tweet.timestamp),
            ("{user_id}", tweet.user_id),
            ("{username}", tweet.username),
            ("{timezone}", tweet.timezone),
            ("{tweet}", tweet.tweet),
            ("{location}", tweet.location),
            ("{hashtags}", str(tweet.hashtags)),
            ("{replies}", tweet.replies),
            ("{retweets}", tweet.retweets),
            ("{likes}", tweet.likes),
            ("{link}", tweet.link),
        )
        output = config.Format
        for placeholder, value in substitutions:
            output = output.replace(placeholder, value)
        return output
    output = "{} {} {} {} <{}> {}".format(tweet.id, tweet.datestamp, tweet.timestamp, tweet.timezone, tweet.username, tweet.tweet)
    if config.Show_hashtags:
        output += " {}".format(",".join(tweet.hashtags))
    if config.Stats:
        output += " | {} replies {} retweets {} likes".format(tweet.replies, tweet.retweets, tweet.likes)
    if config.Location:
        output += " | Location {}".format(tweet.location)
    return output


async def Tweets(tw, location, config, conn):
    """Parse one tweet element, store it where configured, and print it.

    Elements containing a ``div`` with class "StreamItemContent--withheld"
    are skipped entirely. Otherwise the tweet is built with ``getTweet`` and
    passed to the database (``config.Database``) and Elasticsearch
    (``config.Elasticsearch``) sinks. If ``config.Output`` is set it is also
    written to that file as CSV (``config.Store_csv``), JSON
    (``config.Store_json``) or plain text. The text form is always printed.

    NOTE(review): the nesting of the optional hashtags/stats/location
    suffixes (applied only to the default format) is reconstructed from a
    flattened listing; confirm against the original file.
    """
    withheld = tw.find("div", "StreamItemContent--withheld")
    if withheld is not None:
        return

    tweet = getTweet(tw, location, config)

    if config.Database:
        db.tweets(conn, tweet)
    if config.Elasticsearch:
        elasticsearch.Elastic(tweet, config)

    output = _render_output(tweet, config)

    if config.Output is not None:
        if config.Store_csv:
            writeCSV(tweet, config.Output)
        elif config.Store_json:
            writeJSON(tweet, config.Output)
        else:
            write(output, config.Output)

    # Print output
    if config.Elasticsearch:
        print(output, end=".", flush=True)
    else:
        print(output)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment