# FreeCodeCamp/DataScience room: chat message similarity analysis
import os, sys
import re, math, random
from collections import Counter
from datetime import datetime, date, timedelta
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()
import pickle
import pandas
import nltk
from nltk.corpus import treebank_raw as treebank
import statsmodels.api as sm
import statsmodels.formula.api as smf

#http://stackoverflow.com/questions/15173225/how-to-calculate-cosine-similarity-given-2-sentence-strings-python
def code_sub(text1):
    #replace fenced code blocks (```...```) with 'Startcode'/'Endcode' markers,
    #dropping the code content itself
    texttemp = ''
    codefrag = text1
    if not re.search('```', codefrag):
        return text1
    place = 'Startcode'
    endsearch = 1
    while endsearch != -1:
        #check STARTCODE (note: str.partition always returns a 3-tuple)
        before, sep, codefrag = codefrag.partition('```')
        texttemp = texttemp + before + place
        place = 'Endcode '
        #check ENDCODE
        if re.search('```', codefrag):
            #there is a real ENDCODE: drop the fenced content, keep only the marker
            _, sep, codefrag = codefrag.partition('```')
            texttemp = texttemp + place
            place = 'Startcode'
        else:
            #just one unmatched marker... we don't know what follows, so close and finish
            texttemp = texttemp + place + '. '
            endsearch = -1
        #no more START- or ENDCODE: paste the remainder and terminate
        if not re.search('```', codefrag):
            texttemp = texttemp + codefrag
            endsearch = -1
    return texttemp
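# Illustrative check of code_sub (hypothetical string, not from the data):
# code_sub('see ```x = 1``` here') -> 'see StartcodeEndcode  here'
# i.e. the fenced content is removed and only the Startcode/Endcode markers remain.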
def tokens_and_boundaries(dataset):
    tokens = []
    boundaries = set()
    offset = 0
    for sent in dataset:
        #sent = re.sub(r'^w*?,','',sent)
        tokens.extend(sent)
        offset += len(sent)
        #record the index of the last token of each sentence as a boundary
        boundaries.add(offset-1)
    return tokens, boundaries
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prev-word': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}
def featuredsets(dataset):
    tokens, boundaries = tokens_and_boundaries(dataset)
    boundaries = sorted(list(boundaries))
    return [(punct_features(tokens, i), i in boundaries)
            for i in range(1, len(tokens)-1) if tokens[i] in '.?!']
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word == '...':
            word = '.'
        if word in '.?!' and classifier_segsen.classify(punct_features(words, i)) == True: #1) if a punctuation mark AND classified as a sentence boundary
            sents.append(words[start:i+1]) #2) add to sents all words from the start of the sentence up to the boundary
            start = i+1 #3) move to the next sentence
    if start < len(words): #4) if words remain after the last detected boundary...
        sents.append(words[start:]) #5) ...add the remainder as a final sentence
    return sents
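# Illustrative usage (classifier_segsen is trained further below, so the exact
# splits depend on that model; the 'END.' sentinel keeps punct_features from
# indexing past the end of the token list, mirroring its use later in this script):
# segment_sentences(nltk.word_tokenize("It works. Try it!") + ['END.'])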
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)
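# Quick sanity check (hypothetical strings, not from the corpus):
# get_cosine(text_to_vector('a b c'), text_to_vector('a b d'))
# -> 2/3 ~= 0.667 (two of the three unit-count words are shared)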
# similarity analyser currently only used to analyse similarity within the baq corpus...
def tokenizer_baqcformats(line):
    line = re.sub(r"^\w+\b,", r"", line) # clean first word (username) of a baq corpus line
    mline_tok = nltk.word_tokenize(line)
    cc = 0
    for i, w in enumerate(mline_tok):
        if i == 0:
            mline_tok[i] = mline_tok[i]+'_'+str(cc)
        elif re.search(punctuation1, mline_tok[i-1]):
            #previous token was punctuation/connector: restart the position counter
            cc = 0
            if not re.search(punctuation1, mline_tok[i]):
                mline_tok[i] = mline_tok[i]+'_'+str(cc)
            else:
                mline_tok[i] = mline_tok[i]
        else:
            if re.search(punctuation2, mline_tok[i]):
                #punctuation does not advance the counter
                cc -= 1
                mline_tok[i] = mline_tok[i]
            else:
                mline_tok[i] = mline_tok[i]+'_'+str(cc)
        cc += 1
    vecline = text_to_vector(line)
    vecmline_tok = text_to_vector(' '.join(mline_tok))
    return line, mline_tok, vecline, vecmline_tok, get_cosine(vecline, vecmline_tok)
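# Illustrative result (hypothetical line; the leading "user," mimics the username
# prefix of the baq corpus lines, which the tokenizer strips):
# tokenizer_baqcformats('user, how do I do this?')[1]
# -> ['how_0', 'do_1', 'I_2', 'do_3', 'this_4', '?']
# i.e. each non-punctuation token is tagged with its position in the clause,
# and the counter restarts after a punctuation1 match.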
directory = "/DATA/DIRECTORY/"
raw = pickle.load(open(directory+"help.pkl", "rb"))

#re COMPILERS
WORD = re.compile(r'\w+')
punctuation1 = re.compile(r'^(,|;|:|!|\?|\.+?|but|and|or|-)$', re.IGNORECASE)
punctuation2 = re.compile(r"^(,|;|:|!|\?|\.+?|but|and|or|\d|\|+?|\s|[-#$%&)(*+/>=<@\\_}{~]+?)$", re.IGNORECASE)
#http://stackoverflow.com/questions/520031/whats-the-cleanest-way-to-extract-urls-from-a-string-using-python
link = re.compile(r'\s(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/[^\s]*)?\s', re.IGNORECASE)
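# e.g. (hypothetical): re.sub(link, ' Alinktoapage ', 'see http://example.com/x here')
# -> 'see Alinktoapage here' (the pattern requires whitespace on both sides of the URL)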
#not working: code = re.compile(r'```.*```', re.IGNORECASE)

#TRAINING SET FOR SENTENCE SEGMENTATION
train_sentsegset = treebank.sents()
train_featsentsegset = featuredsets(train_sentsegset)
classifier_segsen = nltk.NaiveBayesClassifier.train(train_featsentsegset)
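# Optional sanity check (not part of the original run): hold out a slice of the
# featured sets and report accuracy, as in the NLTK book's segmentation example:
# size = int(len(train_featsentsegset) * 0.1)
# train_set, test_set = train_featsentsegset[size:], train_featsentsegset[:size]
# classifier_heldout = nltk.NaiveBayesClassifier.train(train_set)
# print(nltk.classify.accuracy(classifier_heldout, test_set))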
#CALLING AND PREPARING baq CORPUS
test = 0
baqlines = []
mbaqtoks = []
vectors_baqlines = []
vectors_mbaqtoks = []
cosines = []
with open(directory+'baqCorpus/baqCorpus.txt', 'r') as baq:
    line = baq.readline()
    while line:
        line, mline_tok, vecline, vecmline_tok, simcosine = tokenizer_baqcformats(line)
        baqlines.append(line)
        mbaqtoks.append(mline_tok)
        vectors_baqlines.append(vecline)
        vectors_mbaqtoks.append(vecmline_tok)
        cosines.append(simcosine)
        test += 1
        line = baq.readline()
users = set()
for u in raw:
    users.add(u['fromUser']['username'])
print("USER DATA STARTED...") | |
datausers = {} | |
usertimes = {} | |
#http://stackoverflow.com/questions/2788871/python-date-difference-in-minutes | |
fmt = '%Y-%m-%dT%H:%M' | |
users = {'AdventureBear'} | |
count_du = 0 | |
for username in users:
    #CAPTURING TEXTS AND SCORES PER USER
    # initialization of username and data collector (messages list)
    if username not in list(datausers.keys()):
        datausers[username] = []
    messages = []
    for i, elem in enumerate(raw):
        # loop over all messages; if one is from username, capture its text and time
        if elem['fromUser']['username'] == username:
            text1 = elem['text']
            text1 = re.sub(link, ' Alinktoapage ', text1)
            #finding code...
            text1 = code_sub(text1)
            if WORD.search(text1) == None:
                text1 = 'Blankorcharacters '
            texttime = datetime.strptime(elem['sent'][:16], fmt)
            # prepare text for calculations and initialise comparisons
            vector1 = text_to_vector(text1)
            count = 0
            cosine = 0
            starttime = texttime
            for el in raw[i:]:
                # look at this and the following messages sent by username until there is
                # a gap of more than 5 minutes between the current message and this one
                # (i.e. check whether the person has been "on" for 5 minutes since this message)
                if el['fromUser']['username'] == username:
                    followingtime = datetime.strptime(el['sent'][:16], fmt)
                    if followingtime - starttime > timedelta(minutes=5):
                        break
                    text2 = el['text']
                    if WORD.search(text2) == None:
                        text2 = 'Blankorcharacters '
                    #similarity scoring: cosine
                    vector2 = text_to_vector(text2)
                    cosine += get_cosine(vector1, vector2)
                    count += 1
            # average all scores
            if count > 0: cosine = cosine/count
            text1_id = elem['id']
            # append to messages: the current text and id, the last text in the 5-minute
            # window, the time at which the current text was sent, and the score
            msg = (text1, text1_id, text2, elem['sent'][:16], cosine)
            messages.append(msg)
            #CAPTURING ON-OFF CHAT PERIODS
            if username not in list(usertimes.keys()):
                usertimes[username] = []
            if len(usertimes[username]) == 0:
                #first message sent: "on" message
                usertimes[username].append([elem['sent'][:16]])
                find_end = cosine
            # the person is STILL in the chat room
            elif find_end <= .99:
                if cosine > .99:
                    #the person went "off" the chat room: attach the time to the current
                    #"on" record and mark the activity period as closed (find_end = 1)
                    usertimes[username][-1].append(elem['sent'][:16])
                    find_end = 1
            # otherwise it is a request to start chatting
            else:
                # a new "on" period: attach the time as the first entry of a new record
                usertimes[username].append([elem['sent'][:16]])
                # if a long "on" activity follows, mark the period as open (find_end = 0)
                if cosine <= .99:
                    find_end = 0
    #once all messages of THAT person have been visited, attach to datausers
    datausers[username] = messages
print("USER DATA COMPLETED...")
similarity_analysis = []
count_sa = 0
#originally sample_prop = .10
sample_prop = .10
print("SIM ANALYSIS DATA STARTED...")
for records in datausers['AdventureBear']:
    if random.random() > sample_prop: continue
    txt = records[0]
    if len(txt) == 1: continue
    txtid = records[1]
    tar_sentence_tok = nltk.word_tokenize(txt)
    #append a sentinel so punct_features can always look one token ahead
    tar_sentence_tok = tar_sentence_tok + ['END.']
    for sen in segment_sentences(tar_sentence_tok):
        #take the 'END.' sentinel out HERE!!
        if sen[-1] == 'END.': sen = sen[:-1]
        if sen == []: continue
        senstring = ' '.join(sen)
        vecsenstring = text_to_vector(senstring)
        baq_sim = []
        mod_baq_sim = []
        for i, vbaq in enumerate(vectors_baqlines):
            baq_sim.append(get_cosine(vecsenstring, vbaq))
            #mod_baq_sim.append(get_cosine(tokenizer_baqcformats(vectxt)[3], vectors_mbaqtoks[i]))
            mod_baq_sim.append(get_cosine(tokenizer_baqcformats(senstring)[3], vectors_mbaqtoks[i]))
        baq_cosine = sum(baq_sim)/len(baqlines)
        mod_baq_cosine = sum(mod_baq_sim)/len(baqlines)
        similarity_analysis.append([txt, txtid, senstring, baq_cosine, mod_baq_cosine, records[4]])
    #count_sa += 1
    #if count_sa > 100: break
print("SIM ANALYSIS DATA COMPLETED...")
pndfile = pandas.DataFrame(similarity_analysis, columns=['txt', 'txtid', 'sen', 'baqc', 'mbaqc', 'posc'])
g = sb.pairplot(pndfile, kind="reg")
plt.show()
g2 = sb.lmplot(x='baqc', y='mbaqc', data=pndfile)
result1 = smf.ols('mbaqc ~ baqc', data=pndfile).fit()
result2 = smf.ols('baqc ~ mbaqc', data=pndfile).fit()
#http://stackoverflow.com/questions/11869910/pandas-filter-rows-of-dataframe-with-operator-chaining
pndfile.query("baqc <= "+str(result2.params.Intercept) + "+" + str(result2.params.mbaqc) + "*mbaqc and mbaqc <= " + str(result1.params.Intercept) + "+" + str(result1.params.baqc) + "*baqc")
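# The two fitted regression lines split the (baqc, mbaqc) plane into four regions;
# each point is labelled below by which side of each line it falls on:
# 0 = below both lines, 1 = below mbaqc~baqc but above baqc~mbaqc,
# 2 = above both lines, 3 = above mbaqc~baqc but below baqc~mbaqc.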
pndfile['regions'] = -1
pndfile.loc[(pndfile.mbaqc <= result1.params.Intercept + result1.params.baqc*pndfile.baqc) & (pndfile.baqc <= result2.params.Intercept + result2.params.mbaqc*pndfile.mbaqc), ['regions']] = 0
pndfile.loc[(pndfile.mbaqc <= result1.params.Intercept + result1.params.baqc*pndfile.baqc) & (pndfile.baqc > result2.params.Intercept + result2.params.mbaqc*pndfile.mbaqc), ['regions']] = 1
pndfile.loc[(pndfile.mbaqc > result1.params.Intercept + result1.params.baqc*pndfile.baqc) & (pndfile.baqc > result2.params.Intercept + result2.params.mbaqc*pndfile.mbaqc), ['regions']] = 2
pndfile.loc[(pndfile.mbaqc > result1.params.Intercept + result1.params.baqc*pndfile.baqc) & (pndfile.baqc <= result2.params.Intercept + result2.params.mbaqc*pndfile.mbaqc), ['regions']] = 3
plt.scatter(pndfile.baqc, pndfile.mbaqc, c=pndfile.regions)
plt.show()
pickle.dump(similarity_analysis, open(directory+'tests_chatan/sens_an.pkl', 'wb'))
pickle.dump(pndfile, open(directory+'tests_chatan/pandas_sens_an.pkl', 'wb'))
pndfile.to_csv(directory+'tests_chatan/pandas_sens_an.csv', sep='\t')