Skip to content

Instantly share code, notes, and snippets.

@duydo
Created November 26, 2014 08:28
Show Gist options
  • Save duydo/c0fa54425cc497303217 to your computer and use it in GitHub Desktop.
Save duydo/c0fa54425cc497303217 to your computer and use it in GitHub Desktop.
Merge tweet
__author__ = 'duydo'
import re
from dateutil.parser import parse
# Pattern for a manual ("old-style") retweet/attribution marker inside tweet text:
#   group 1 - the marker keyword: RT, MT, retweet, from or via
#   group 2 - one or more @mentions, each optionally preceded by non-word
#             characters (spaces, punctuation)
#   group 3 - zero or more trailing colons
RT_PATTERN = r'(RT|MT|retweet|from|via)((?:\b\W*@\w+)+)(:*)'
# Compiled once at import time; matching is case-insensitive and Unicode-aware.
RT_REGEX = re.compile(RT_PATTERN, re.UNICODE | re.IGNORECASE)
def _parse_date(value):
    """Parse a date string with dateutil; a missing/empty value yields None."""
    return parse(value) if value else None


def _entity_list(rt, key):
    """Return the entity list ``key`` of a tweet, or ``[]`` when it is absent.

    The top-level key is tried first. When it is missing, the standard
    Twitter ``entities`` container is consulted as a fallback.
    """
    entities = rt.get(key)
    if entities is None:
        entities = (rt.get('entities') or {}).get(key)
    return entities or []


def extract_from_raw_tweet(rt, extract_user=False):
    """Extract a flat summary dict from a raw tweet document.

    Args:
        rt: Mapping holding the raw tweet (anything with a dict-style ``get``).
        extract_user: When true, also add a ``user`` sub-dict with the
            author's id, account creation date and screen name.

    Returns:
        A dict with the keys ``id``, ``published_at``, ``urls``,
        ``user_mentions``, ``favorite_count``, the three ``in_reply_to_*``
        fields and the flags ``retweeted``, ``retweeted_rt``, ``orignal`` and
        ``reply``.  ``user`` is present only when ``extract_user`` is true and
        ``retweeted_status`` only when the tweet embeds the retweeted tweet.
        Missing dates are reported as ``None`` and missing entity lists as
        empty lists.
    """
    reply_user_id = rt.get('in_reply_to_user_id_str')
    extracted = dict(
        id=rt.get('id_str'),
        published_at=_parse_date(rt.get('created_at')),
        urls=[
            dict(url=e.get('url'), expanded_url=e.get('expanded_url'))
            for e in _entity_list(rt, 'urls')
        ],
        user_mentions=[
            dict(id=e.get('id_str'), screen_name=e.get('screen_name'))
            for e in _entity_list(rt, 'user_mentions')
        ],
        # NOTE(review): the value is read from 'count', not 'favorite_count';
        # confirm against the schema of the stored raw documents.
        favorite_count=rt.get('count'),
        in_reply_to_screen_name=rt.get('in_reply_to_screen_name'),
        in_reply_to_status_id=rt.get('in_reply_to_status_id_str'),
        in_reply_to_user_id=reply_user_id,
        retweeted=False,
        retweeted_rt=False,
        # The key spelling 'orignal' is kept as-is: it is part of the output
        # contract that downstream consumers may already rely on.
        orignal=True,
        reply=bool(reply_user_id),
    )

    if extract_user:
        user = rt.get('user') or {}
        extracted['user'] = dict(
            id=user.get('id_str'),
            created_at=_parse_date(user.get('created_at')),
            # Previously this was filled from 'id_str' (copy-paste slip).
            screen_name=user.get('screen_name'),
        )

    retweeted_status = rt.get('retweeted_status')
    if retweeted_status:
        # Native retweet: recurse into the embedded tweet, including its author.
        extracted['retweeted_status'] = extract_from_raw_tweet(retweeted_status, True)
        extracted['retweeted'] = True
        extracted['orignal'] = False
    elif RT_REGEX.match(rt.get('text') or ''):
        # Manual retweet: the text *starts* with a marker such as "RT @user".
        # NOTE(review): match() only anchors at the start of the text, so a
        # trailing "via @user" is not detected - confirm this is intended.
        extracted['retweeted_rt'] = True
        extracted['orignal'] = False

    return extracted
def extract_from_relevant_tweet(t):
    """Extract the fields of interest from a "relevant" tweet document.

    Args:
        t: Mapping holding the relevant document (anything with a dict-style
            ``get``).

    Returns:
        A dict with the keys ``db_id``, ``dup_id``, ``channel``,
        ``publisher_id``, ``publisher_twitter_user_id``, ``sempattern``,
        ``tag``, ``content``, ``timestamp`` and ``uri``.  Fields missing from
        ``t`` are reported as ``None``.
    """
    # 'publisher' may be absent or explicitly null; treat both as "no publisher"
    # instead of failing on None.get(...).
    publisher = t.get('publisher') or {}
    return dict(
        # Fields from relevant doc
        db_id=t.get('_id'),
        dup_id=t.get('dup_id'),
        # created_at is currently not extracted here; the disabled mapping was
        # t.get('date') or t.get('created_at').
        channel=t.get('channel'),
        publisher_id=publisher.get('_id'),
        # NOTE(review): reads the same '_id' as publisher_id - confirm whether
        # a dedicated Twitter user id field was intended.
        publisher_twitter_user_id=publisher.get('_id'),
        sempattern=t.get('sempattern'),
        tag=t.get('tag'),
        content=t.get('text'),
        timestamp=t.get('timestamp'),
        uri=t.get('uri'),
    )
def merge_tweet(t, r):
    """Combine a relevant document ``t`` with its raw tweet ``r`` into one dict.

    The result starts from the fields extracted from the raw tweet; every
    field extracted from the relevant document is then written on top, so the
    relevant document wins on any shared key.
    """
    merged = extract_from_raw_tweet(r)
    for field, value in extract_from_relevant_tweet(t).items():
        merged[field] = value
    return merged
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment