Commit ddb19782 authored by lmeyerov, committed by GitHub

Storage fixes for new format (#954)

* fix for cashtags

* typo

* fix(datetime): _formatDateTime tries %d-%m-%y

* fix(pandas): use new str-format Tweet.datetime data rep

* fix(pandas datetime): use ms

* fix(cashtags unwind): undo PRs field removals

* Revert "fix(cashtags unwind): undo PRs field removals"

This reverts commit dfa57c20186a969aa2bf010fbe198f5e0bbbbd01.

* fix(pandas): remove broken fields

* fix(cash): use provided field as suggested by pr review

* fix(datetime): Y-m-d and factored out
Co-authored-by: SiegfriedWagner <mateus.chojnowski@gmail.com>
parent b3556add
import datetime, pandas as pd, warnings
from time import strftime, localtime from time import strftime, localtime
import pandas as pd
import warnings
from .elasticsearch import hour from .elasticsearch import hour
from twint.tweet import Tweet_formats
Tweets_df = None Tweets_df = None
Follow_df = None Follow_df = None
...@@ -66,12 +66,13 @@ def update(object, config): ...@@ -66,12 +66,13 @@ def update(object, config):
if _type == "tweet": if _type == "tweet":
Tweet = object Tweet = object
day = weekdays[strftime("%A", localtime(Tweet.datetime/1000))] datetime_ms = datetime.datetime.strptime(Tweet.datetime, Tweet_formats['datetime']).timestamp() * 1000
day = weekdays[strftime("%A", localtime(datetime_ms/1000))]
dt = f"{object.datestamp} {object.timestamp}" dt = f"{object.datestamp} {object.timestamp}"
_data = { _data = {
"id": str(Tweet.id), "id": str(Tweet.id),
"conversation_id": Tweet.conversation_id, "conversation_id": Tweet.conversation_id,
"created_at": Tweet.datetime, "created_at": datetime_ms,
"date": dt, "date": dt,
"timezone": Tweet.timezone, "timezone": Tweet.timezone,
"place": Tweet.place, "place": Tweet.place,
...@@ -84,13 +85,13 @@ def update(object, config): ...@@ -84,13 +85,13 @@ def update(object, config):
"username": Tweet.username, "username": Tweet.username,
"name": Tweet.name, "name": Tweet.name,
"day": day, "day": day,
"hour": hour(Tweet.datetime/1000), "hour": hour(datetime_ms/1000),
"link": Tweet.link, "link": Tweet.link,
"urls": Tweet.urls, "urls": Tweet.urls,
"photos": Tweet.photos, "photos": Tweet.photos,
"video": Tweet.video, "video": Tweet.video,
"thumbnail": Tweet.thumbnail, "thumbnail": Tweet.thumbnail,
"retweet": Tweet.retweet, #"retweet": Tweet.retweet,
"nlikes": int(Tweet.likes_count), "nlikes": int(Tweet.likes_count),
"nreplies": int(Tweet.replies_count), "nreplies": int(Tweet.replies_count),
"nretweets": int(Tweet.retweets_count), "nretweets": int(Tweet.retweets_count),
...@@ -99,11 +100,11 @@ def update(object, config): ...@@ -99,11 +100,11 @@ def update(object, config):
"near": Tweet.near, "near": Tweet.near,
"geo": Tweet.geo, "geo": Tweet.geo,
"source": Tweet.source, "source": Tweet.source,
"user_rt_id": Tweet.user_rt_id, #"user_rt_id": Tweet.user_rt_id,
"user_rt": Tweet.user_rt, #"user_rt": Tweet.user_rt,
"retweet_id": Tweet.retweet_id, #"retweet_id": Tweet.retweet_id,
"reply_to": Tweet.reply_to, "reply_to": Tweet.reply_to,
"retweet_date": Tweet.retweet_date, #"retweet_date": Tweet.retweet_date,
"translate": Tweet.translate, "translate": Tweet.translate,
"trans_src": Tweet.trans_src, "trans_src": Tweet.trans_src,
"trans_dest": Tweet.trans_dest "trans_dest": Tweet.trans_dest
......
...@@ -21,18 +21,18 @@ def tweetData(t): ...@@ -21,18 +21,18 @@ def tweetData(t):
"hashtags": t.hashtags, "hashtags": t.hashtags,
"cashtags": t.cashtags, "cashtags": t.cashtags,
"link": t.link, "link": t.link,
"retweet": t.retweet, # "retweet": t.retweet,
"quote_url": t.quote_url, "quote_url": t.quote_url,
"video": t.video, "video": t.video,
"thumbnail": t.thumbnail, "thumbnail": t.thumbnail,
"near": t.near, "near": t.near,
"geo": t.geo, "geo": t.geo,
"source": t.source, "source": t.source,
"user_rt_id": t.user_rt_id, # "user_rt_id": t.user_rt_id,
"user_rt": t.user_rt, # "user_rt": t.user_rt,
"retweet_id": t.retweet_id, # "retweet_id": t.retweet_id,
"reply_to": t.reply_to, "reply_to": t.reply_to,
"retweet_date": t.retweet_date, # "retweet_date": t.retweet_date,
"translate": t.translate, "translate": t.translate,
"trans_src": t.trans_src, "trans_src": t.trans_src,
"trans_dest": t.trans_dest "trans_dest": t.trans_dest
......
...@@ -4,7 +4,6 @@ import json ...@@ -4,7 +4,6 @@ import json
import logging as logme import logging as logme
from googletransx import Translator from googletransx import Translator
# ref. # ref.
# - https://github.com/x0rzkov/py-googletrans#basic-usage # - https://github.com/x0rzkov/py-googletrans#basic-usage
translator = Translator() translator = Translator()
...@@ -158,6 +157,12 @@ def getRetweet(tw, _config): ...@@ -158,6 +157,12 @@ def getRetweet(tw, _config):
# return t # return t
# Format strings for the textual date/time fields of a Tweet.
# They are passed to strftime when the Tweet object is built and to
# strptime when Tweet.datetime has to be parsed back into a datetime.
Tweet_formats = {
    # full date, time and timezone name
    'datetime': '%Y-%m-%d %H:%M:%S %Z',
    # date part only
    'datestamp': '%Y-%m-%d',
    # time part only
    'timestamp': '%H:%M:%S',
}
def Tweet(tw, config): def Tweet(tw, config):
"""Create Tweet object """Create Tweet object
""" """
...@@ -171,10 +176,10 @@ def Tweet(tw, config): ...@@ -171,10 +176,10 @@ def Tweet(tw, config):
_dt = tw['created_at'] _dt = tw['created_at']
_dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y') _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
_dt = utc_to_local(_dt) _dt = utc_to_local(_dt)
t.datetime = str(_dt.strftime('%d-%m-%Y %H:%M:%S %Z')) t.datetime = str(_dt.strftime(Tweet_formats['datetime']))
# date is of the format year, # date is of the format year,
t.datestamp = _dt.strftime('%d-%m-%Y') t.datestamp = _dt.strftime(Tweet_formats['datestamp'])
t.timestamp = _dt.strftime('%H:%M:%S') t.timestamp = _dt.strftime(Tweet_formats['timestamp'])
t.user_id = int(tw["user_id_str"]) t.user_id = int(tw["user_id_str"])
t.user_id_str = tw["user_id_str"] t.user_id_str = tw["user_id_str"]
t.username = tw["user_data"]['screen_name'] t.username = tw["user_data"]['screen_name']
...@@ -211,7 +216,7 @@ def Tweet(tw, config): ...@@ -211,7 +216,7 @@ def Tweet(tw, config):
except KeyError: except KeyError:
t.hashtags = [] t.hashtags = []
# don't know what this is # don't know what this is
# t.cashtags = [cashtag.text for cashtag in tw.find_all("a", "twitter-cashtag")] t.cashtags = [cashtag['text'] for cashtag in tw['entities']['symbols']]
t.replies_count = tw['reply_count'] t.replies_count = tw['reply_count']
t.retweets_count = tw['retweet_count'] t.retweets_count = tw['retweet_count']
t.likes_count = tw['favorite_count'] t.likes_count = tw['favorite_count']
......
...@@ -8,6 +8,11 @@ class User: ...@@ -8,6 +8,11 @@ class User:
def __init__(self): def __init__(self):
pass pass
# Format strings passed to strftime to fill the join_date and join_time
# fields of a user object.
User_formats = {
    # date part of the account creation moment
    'join_date': '%Y-%m-%d',
    # time part, followed by the timezone name
    'join_time': '%H:%M:%S %Z',
}
# ur object must be a json from the endpoint https://api.twitter.com/graphql # ur object must be a json from the endpoint https://api.twitter.com/graphql
def User(ur): def User(ur):
...@@ -27,8 +32,8 @@ def User(ur): ...@@ -27,8 +32,8 @@ def User(ur):
_dt = ur['data']['user']['rest_id']['legacy']['created_at'] _dt = ur['data']['user']['rest_id']['legacy']['created_at']
_dt = datetime.datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y') _dt = datetime.datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
# date is of the format year, # date is of the format year,
_usr.join_date = _dt.strftime('%d-%m-%Y') _usr.join_date = _dt.strftime(User_formats['join_date'])
_usr.join_time = _dt.strftime('%H:%M:%S %Z') _usr.join_time = _dt.strftime(User_formats['join_time'])
# :type `int` # :type `int`
_usr.tweets = int(ur['data']['user']['rest_id']['legacy']['statuses_count']) _usr.tweets = int(ur['data']['user']['rest_id']['legacy']['statuses_count'])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment