# gachet/word2vec_tweets_preprocessor.py

## word2vec_tweets_preprocessor.py
# -*- coding: utf-8 -*-
import json
import re
import os
import nltk

# Placeholder paths: replace with the real input/output locations before running.
INPUT_PATH = '<JSON FILE To INPUT>'
OUTPUT_PATH = "cleaned_data/<OUTPUTTXTFILE>.txt"

# A tweet containing any http(s) URL is discarded entirely.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# Matches @mentions, every character that is not an ASCII letter, digit,
# space or tab, and any remaining scheme://... link.
_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

# A tweet is kept only if more than two words survive stop-word removal.
_MIN_TWEET_WORDS = 3
# Words shorter than this are dropped from kept tweets.
_MIN_WORD_LENGTH = 4


def clean_tweet_text(raw_tweet_text, stop_words):
    """Normalise one tweet's text for word2vec training.

    Steps, in order: reject tweets that contain a URL, replace user
    mentions and non-alphanumeric characters with spaces, lowercase,
    remove stop words, reject tweets with fewer than three remaining
    words, and finally keep only words of at least four characters.

    Args:
        raw_tweet_text: The tweet's ``text`` field. Anything that is not a
            non-empty string is rejected.
        stop_words: Container of lowercase words to remove.

    Returns:
        The cleaned, space-separated, lowercase text, or ``None`` when the
        tweet is rejected or no word survives the filters.
    """
    if not isinstance(raw_tweet_text, str) or not raw_tweet_text:
        return None
    if _URL_RE.search(raw_tweet_text):
        return None

    stripped = _NOISE_RE.sub(" ", raw_tweet_text)
    # Lowercase before the stop-word test so "The" and "the" are treated alike.
    words = [word for word in stripped.lower().split() if word not in stop_words]
    if len(words) < _MIN_TWEET_WORDS:
        return None

    long_words = [word for word in words if len(word) >= _MIN_WORD_LENGTH]
    return " ".join(long_words) or None


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH, stop_words=None):
    """Clean every tweet in a JSON-lines file and append the result to a text file.

    Each input line is expected to hold one JSON object with a ``text`` key.
    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped. Cleaned tweets are appended to ``output_path`` as
    one space-separated stream, each preceded by a single space so that
    repeated runs in append mode do not glue words together.

    Args:
        input_path: Path of the JSON-lines tweet file.
        output_path: Path of the text file to append to; its parent
            directory is created when missing.
        stop_words: Container of lowercase stop words. When ``None`` the
            NLTK English stop-word list is loaded.
            NOTE(review): the original script referenced an undefined
            ``stopwords`` name next to an unused ``import nltk``; the English
            NLTK list is assumed to be what was intended -- confirm.

    Returns:
        The number of tweets written.
    """
    if stop_words is None:
        # Requires the corpus to be installed once: nltk.download('stopwords').
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words("english"))

    cleaned_tweets = []
    with open(input_path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            if not line.strip():
                continue
            try:
                tweet = json.loads(line)
            except ValueError:
                # Malformed line: skip it, but only for this specific failure.
                continue
            if not isinstance(tweet, dict):
                continue
            cleaned = clean_tweet_text(tweet.get('text'), stop_words)
            if cleaned is not None:
                cleaned_tweets.append(cleaned)

    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    with open(output_path, "a", encoding="utf-8") as text_file:
        text_file.write("".join(" " + cleaned for cleaned in cleaned_tweets))
    return len(cleaned_tweets)


if __name__ == "__main__":
    main()