@monajalal
Created October 31, 2016 06:19
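"""Build an edge list over pairs of Imgur comment files.

For every pair of .txt files under tiny_comments/, clean and lemmatize the
comment text (letters only, lowercased, stop words, swear words, and very short
tokens removed), then look up each file's number in tiny_stat.csv and write the
pair of numbers as one row of tiny_graph.csv.
"""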
import gensim
import nltk
from gensim.models import word2vec
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import logging
import re
import itertools
import glob
from collections import defaultdict
import csv
from nltk.stem.wordnet import WordNetLemmatizer
import os
import os.path
stopwords = set(nltk.corpus.stopwords.words('english'))  # set for fast membership checks
path = "/home/mona/computer_vision/imgur/tiny_comments/*.txt"
files = glob.glob(path)
csv_file_complete = open("tiny_graph.csv", "wb")
stat_csv_file = open("tiny_stat.csv", "r")
csv_reader = csv.reader(stat_csv_file)
# tiny_stat.csv rows are "file_number,file_name"; build the lookup once up front
tail_to_numbers = {ftail: fnum for fnum, ftail in csv_reader}
stat_csv_file.close()
lemmatizer = WordNetLemmatizer()
list_of_rows = []
# load the swear-word list used to filter comment tokens
with open('swear_words_uniq.txt') as swear_words_file:
    swear_words = swear_words_file.read()
swear_words = set(re.sub("[^a-zA-Z]", ' ', swear_words).lower().split())
for file1, file2 in itertools.combinations(files, 2):
    # read and clean the first comment file: drop URL lines, keep letters only,
    # lowercase, remove stop words, lemmatize as verbs, filter swear/short words
    with open(file1) as f1:
        f1_text = f1.read()
    f1_text = re.sub(r'^https?://.*[\r\n]*', '', f1_text, flags=re.MULTILINE)
    f1_words = re.sub("[^a-zA-Z]", ' ', f1_text).lower().split()
    lemmatized_f1_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f1_words if w not in stopwords]
    cleaned_f1_words = [w for w in lemmatized_f1_words if w not in swear_words and len(w) > 2]

    # same cleaning for the second comment file
    with open(file2) as f2:
        f2_text = f2.read()
    f2_words = re.sub("[^a-zA-Z]", ' ', f2_text).lower().split()
    lemmatized_f2_words = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in f2_words if w not in stopwords]
    cleaned_f2_words = [w for w in lemmatized_f2_words if w not in swear_words and len(w) > 2]

    # look up each file's number by its base name and record the pair as an edge
    f1_head, f1_tail = os.path.split(file1)
    f2_head, f2_tail = os.path.split(file2)
    try:
        file1_file_number = tail_to_numbers[f1_tail]
        file2_file_number = tail_to_numbers[f2_tail]
    except KeyError as e:
        print(e)
        continue
    else:
        row_complete = [file1_file_number.strip(), file2_file_number.strip()]
        list_of_rows.append(row_complete)
print(len(list_of_rows))
# write all file-number pairs to tiny_graph.csv
a_complete = csv.writer(csv_file_complete, delimiter=',')
for row in list_of_rows:
    print(row)
    a_complete.writerow(row)
csv_file_complete.close()
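
# Illustration only (never invoked by the script above): a minimal sketch of the
# same cleaning pipeline applied to a single comment string. Assumes the NLTK
# 'stopwords' and 'wordnet' corpora are downloaded; _clean_comment is just an
# illustrative helper name, not part of the original gist.
def _clean_comment(text):
    # drop lines that start with a URL, keep letters only, lowercase
    text = re.sub(r'^https?://.*[\r\n]*', '', text, flags=re.MULTILINE)
    words = re.sub("[^a-zA-Z]", ' ', text).lower().split()
    # remove stop words, lemmatize as verbs, then drop swear words and short tokens
    lemmas = [str(lemmatizer.lemmatize(w, wordnet.VERB)) for w in words if w not in stopwords]
    return [w for w in lemmas if w not in swear_words and len(w) > 2]

# e.g. _clean_comment("http://imgur.com/abc\nThese cats are running everywhere!")
# should give something like ['cat', 'run', 'everywhere'].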