Commit ddb19782 authored by lmeyerov, committed by GitHub

Storage fixes for new format (#954)

* fix for cashtags

* typo

* fix(datetime): _formatDateTime tries %d-%m-%y

* fix(pandas): use new str-format Tweet.datetime data rep

* fix(pandas datetime): use ms

* fix(cashtags unwind): undo PRs field removals

* Revert "fix(cashtags unwind): undo PRs field removals"

This reverts commit dfa57c20186a969aa2bf010fbe198f5e0bbbbd01.

* fix(pandas): remove broken fields

* fix(cash): use provided field as suggested by pr review

* fix(datetime): Y-m-d and factored out
Co-authored-by: SiegfriedWagner <mateus.chojnowski@gmail.com>
parent b3556add
import datetime, pandas as pd, warnings
from time import strftime, localtime
import pandas as pd
import warnings
from .elasticsearch import hour
from twint.tweet import Tweet_formats
Tweets_df = None
Follow_df = None
......@@ -66,12 +66,13 @@ def update(object, config):
if _type == "tweet":
Tweet = object
day = weekdays[strftime("%A", localtime(Tweet.datetime/1000))]
datetime_ms = datetime.datetime.strptime(Tweet.datetime, Tweet_formats['datetime']).timestamp() * 1000
day = weekdays[strftime("%A", localtime(datetime_ms/1000))]
dt = f"{object.datestamp} {object.timestamp}"
_data = {
"id": str(Tweet.id),
"conversation_id": Tweet.conversation_id,
"created_at": Tweet.datetime,
"created_at": datetime_ms,
"date": dt,
"timezone": Tweet.timezone,
"place": Tweet.place,
......@@ -84,13 +85,13 @@ def update(object, config):
"username": Tweet.username,
"name": Tweet.name,
"day": day,
"hour": hour(Tweet.datetime/1000),
"hour": hour(datetime_ms/1000),
"link": Tweet.link,
"urls": Tweet.urls,
"photos": Tweet.photos,
"video": Tweet.video,
"thumbnail": Tweet.thumbnail,
"retweet": Tweet.retweet,
#"retweet": Tweet.retweet,
"nlikes": int(Tweet.likes_count),
"nreplies": int(Tweet.replies_count),
"nretweets": int(Tweet.retweets_count),
......@@ -99,11 +100,11 @@ def update(object, config):
"near": Tweet.near,
"geo": Tweet.geo,
"source": Tweet.source,
"user_rt_id": Tweet.user_rt_id,
"user_rt": Tweet.user_rt,
"retweet_id": Tweet.retweet_id,
#"user_rt_id": Tweet.user_rt_id,
#"user_rt": Tweet.user_rt,
#"retweet_id": Tweet.retweet_id,
"reply_to": Tweet.reply_to,
"retweet_date": Tweet.retweet_date,
#"retweet_date": Tweet.retweet_date,
"translate": Tweet.translate,
"trans_src": Tweet.trans_src,
"trans_dest": Tweet.trans_dest
......
......@@ -21,18 +21,18 @@ def tweetData(t):
"hashtags": t.hashtags,
"cashtags": t.cashtags,
"link": t.link,
"retweet": t.retweet,
# "retweet": t.retweet,
"quote_url": t.quote_url,
"video": t.video,
"thumbnail": t.thumbnail,
"near": t.near,
"geo": t.geo,
"source": t.source,
"user_rt_id": t.user_rt_id,
"user_rt": t.user_rt,
"retweet_id": t.retweet_id,
# "user_rt_id": t.user_rt_id,
# "user_rt": t.user_rt,
# "retweet_id": t.retweet_id,
"reply_to": t.reply_to,
"retweet_date": t.retweet_date,
# "retweet_date": t.retweet_date,
"translate": t.translate,
"trans_src": t.trans_src,
"trans_dest": t.trans_dest
......
......@@ -4,7 +4,6 @@ import json
import logging as logme
from googletransx import Translator
# ref.
# - https://github.com/x0rzkov/py-googletrans#basic-usage
translator = Translator()
......@@ -158,6 +157,12 @@ def getRetweet(tw, _config):
# return t
# strftime/strptime patterns for the string date fields set on a Tweet object.
# Defined once so that code which parses these strings back into datetimes
# can use the exact pattern that produced them.
# NOTE(review): parsing with '%Z' via strptime only recognises a limited set
# of timezone names (UTC, GMT, and the local zone names) — confirm that the
# strings written with this pattern always round-trip on the target machines.
Tweet_formats = {
# Full timestamp: year-month-day, 24-hour time, then the timezone name.
'datetime': '%Y-%m-%d %H:%M:%S %Z',
# Date part only, year first.
'datestamp': '%Y-%m-%d',
# Time-of-day part only, 24-hour clock, no timezone.
'timestamp': '%H:%M:%S'
}
def Tweet(tw, config):
"""Create Tweet object
"""
......@@ -171,10 +176,10 @@ def Tweet(tw, config):
_dt = tw['created_at']
_dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
_dt = utc_to_local(_dt)
t.datetime = str(_dt.strftime('%d-%m-%Y %H:%M:%S %Z'))
t.datetime = str(_dt.strftime(Tweet_formats['datetime']))
# date is of the format year,
t.datestamp = _dt.strftime('%d-%m-%Y')
t.timestamp = _dt.strftime('%H:%M:%S')
t.datestamp = _dt.strftime(Tweet_formats['datestamp'])
t.timestamp = _dt.strftime(Tweet_formats['timestamp'])
t.user_id = int(tw["user_id_str"])
t.user_id_str = tw["user_id_str"]
t.username = tw["user_data"]['screen_name']
......@@ -211,7 +216,7 @@ def Tweet(tw, config):
except KeyError:
t.hashtags = []
# don't know what this is
# t.cashtags = [cashtag.text for cashtag in tw.find_all("a", "twitter-cashtag")]
t.cashtags = [cashtag['text'] for cashtag in tw['entities']['symbols']]
t.replies_count = tw['reply_count']
t.retweets_count = tw['retweet_count']
t.likes_count = tw['favorite_count']
......
......@@ -8,6 +8,11 @@ class User:
def __init__(self):
pass
# strftime patterns for the string join-date fields set on a User object.
User_formats = {
# Account creation date, year first.
'join_date': '%Y-%m-%d',
# Account creation time of day (24-hour clock) followed by the timezone name.
'join_time': '%H:%M:%S %Z'
}
# ur object must be a json from the endpoint https://api.twitter.com/graphql
def User(ur):
......@@ -27,8 +32,8 @@ def User(ur):
_dt = ur['data']['user']['rest_id']['legacy']['created_at']
_dt = datetime.datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
# date is of the format year,
_usr.join_date = _dt.strftime('%d-%m-%Y')
_usr.join_time = _dt.strftime('%H:%M:%S %Z')
_usr.join_date = _dt.strftime(User_formats['join_date'])
_usr.join_time = _dt.strftime(User_formats['join_time'])
# :type `int`
_usr.tweets = int(ur['data']['user']['rest_id']['legacy']['statuses_count'])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment