Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save gachet/2b2b43cc43e0852f2d60d82be21e5ad5 to your computer and use it in GitHub Desktop.
Save gachet/2b2b43cc43e0852f2d60d82be21e5ad5 to your computer and use it in GitHub Desktop.
Clean a tweets JSON file (one tweet object per line) into plain text for TensorFlow- or gensim-based word2vec training
# -*- coding: utf-8 -*-
"""Clean a file of JSON-encoded tweets (one JSON object per line) into a single
plain-text string suitable for TensorFlow- or gensim-based word2vec training.

Pipeline per tweet: discard tweets containing URLs, strip @-mentions and
non-alphanumeric characters, lowercase, drop stopwords, discard tweets with
fewer than three remaining words, and keep only words of length >= 4.
"""
import json
import re
import os

# Compiled once instead of on every tweet.
# URL detector (any http/https URL anywhere in the tweet).
URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Strips @-mentions, any character that is not alphanumeric/space/tab, and URLs.
STRIP_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def clean_tweet(raw_text, stopwords, min_word_len=4, min_words=2):
    """Return the cleaned tweet text, or '' if the tweet is discarded.

    Args:
        raw_text: the tweet's text field (may be None for non-tweet lines).
        stopwords: a set of lowercase stopwords to remove.
        min_word_len: keep only words at least this long (default 4).
        min_words: discard tweets with this many words or fewer after
            stopword removal (default 2, i.e. keep tweets of 3+ words).

    Returns:
        The cleaned, space-joined text, or '' when the tweet has no text,
        contains a URL, or ends up with too few words.
    """
    if raw_text is None:
        return ''
    # Discard tweets containing URLs.
    if URL_RE.search(raw_text):
        return ''
    # Strip user mentions, URLs and non-alphanumeric characters, then
    # re-join to normalize whitespace.
    text = ' '.join(STRIP_RE.sub(' ', raw_text).split())
    # Lowercase each word and drop stopwords.  (Bug fix: the original called
    # .lower() on the *list* returned by .split(); lowercase the string first.)
    words = [w for w in text.lower().split() if w not in stopwords]
    # Discard tweets with too few remaining words.
    if len(words) <= min_words:
        return ''
    # Keep only sufficiently long words.
    return ' '.join(w for w in words if len(w) >= min_word_len)


def main(input_path, output_path):
    """Read tweets from *input_path*, clean them, append result to *output_path*."""
    # Lazy import so the module can be imported without nltk installed;
    # nltk is only needed for the stopword list.  (Bug fix: the original
    # referenced an undefined `stopwords` name.)
    import nltk
    stopwords = set(nltk.corpus.stopwords.words('english'))

    pieces = []
    # Bug fix: the original called .close() on the list from .readlines()
    # and leaked the file handle; `with` closes it reliably.
    with open(input_path, 'r') as f:
        for line in f:
            # Narrow exception: skip only malformed JSON lines instead of
            # the original bare `except:` that hid real bugs.
            try:
                tweet = json.loads(line)
            except ValueError:
                continue
            cleaned = clean_tweet(tweet.get('text'), stopwords)
            if cleaned:
                pieces.append(cleaned)

    # Bug fix: the original accumulated into an undefined `cleaned_tweets`
    # variable with quadratic string concatenation; join once instead.
    with open(output_path, 'a') as out:
        out.write(' '.join(pieces))


if __name__ == '__main__':
    main('<JSON FILE To INPUT>', 'cleaned_data/<OUTPUTTXTFILE>.txt')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment