Created
October 31, 2016 06:19
-
-
Save monajalal/fc2f86736833d4525f38abe5832c38ac to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import glob
import itertools
import logging
import os
import os.path
import re
import sys
from collections import defaultdict

import gensim
import nltk
from gensim.models import word2vec
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
# Build tiny_graph.csv: one row per unordered pair of comment files found by
# the glob below, holding the two file numbers that tiny_stat.csv associates
# with the files' base names. Pairs with an unknown base name are reported
# on stdout and skipped.

# Patterns are compiled once because they are applied to every input file.
_NON_LETTERS = re.compile("[^a-zA-Z]")
_URL_LINE = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)


def _tokenize(text):
    """Return the lower-cased runs of ASCII letters found in *text*."""
    return _NON_LETTERS.sub(' ', text).lower().split()


def _clean_words(file_path):
    """Return the cleaned, lemmatized word list of one comment file.

    Lines that start with an http(s) URL are removed, the remaining text is
    tokenized, stop words are dropped, each surviving token is lemmatized as
    a verb, and finally swear words and lemmas of two characters or fewer
    are discarded.

    Relies on the module-level ``lemmatizer``, ``stop_words`` and
    ``swear_words`` being defined before the first call.
    """
    with open(file_path) as comment_file:
        text = _URL_LINE.sub('', comment_file.read())
    lemmas = [
        str(lemmatizer.lemmatize(word, wordnet.VERB))
        for word in _tokenize(text)
        if word not in stop_words
    ]
    return [w for w in lemmas if w not in swear_words and len(w) > 2]


# Kept under its own name so the imported ``stopwords`` corpus module is not
# shadowed; a set makes each membership test O(1).
stop_words = set(stopwords.words('english'))
path = "/home/mona/computer_vision/imgur/tiny_comments/*.txt"
files = glob.glob(path)
lemmatizer = WordNetLemmatizer()

with open('swear_words_uniq.txt') as swear_words_file:
    swear_words = set(_tokenize(swear_words_file.read()))

# Each tiny_stat.csv row is unpacked as (file number, file base name); a row
# with any other number of columns raises ValueError. The mapping does not
# change between pairs, so it is read once and the file is closed right away.
with open("tiny_stat.csv", "r") as stat_csv_file:
    tail_to_numbers = {
        ftail: fnum for fnum, ftail in csv.reader(stat_csv_file)
    }

# Every file is cleaned exactly once (with the same steps for all files)
# instead of once per pair it takes part in.
# NOTE(review): the rows written below do not depend on these word lists;
# confirm whether an edge weight was meant to be derived from them.
cleaned_words = {
    comment_path: _clean_words(comment_path) for comment_path in files
}

list_of_rows = []
for file1, file2 in itertools.combinations(files, 2):
    try:
        file1_file_number = tail_to_numbers[os.path.basename(file1)]
        file2_file_number = tail_to_numbers[os.path.basename(file2)]
    except KeyError as e:
        # Base name missing from tiny_stat.csv: report it and skip the pair.
        print(e)
        continue
    list_of_rows.append(
        [file1_file_number.strip(), file2_file_number.strip()]
    )
print(len(list_of_rows))

# The csv module wants a text-mode file opened with newline="" on Python 3
# and a binary-mode file on Python 2. The output is opened only once all rows
# are known, so a failure above no longer leaves a truncated tiny_graph.csv.
if sys.version_info[0] >= 3:
    csv_file_complete = open("tiny_graph.csv", "w", newline="")
else:
    csv_file_complete = open("tiny_graph.csv", "wb")
with csv_file_complete:
    a_complete = csv.writer(csv_file_complete, delimiter=',')
    for row in list_of_rows:
        print(row)
        a_complete.writerow(row)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment