@evaristoc
Created October 18, 2015 13:21
FreeCodeCamp/DataScience room
import os, sys
import re, math, random
from collections import Counter
from datetime import datetime, date, timedelta
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()
import pickle
import pandas
import nltk
from nltk.corpus import treebank_raw as treebank
import statsmodels.api as sm
import statsmodels.formula.api as smf
#http://stackoverflow.com/questions/15173225/how-to-calculate-cosine-similarity-given-2-sentence-strings-python
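# code_sub: replaces markdown code fences (```) in a message with the placeholders
# 'Startcode'/'Endcode', so that pasted code does not pollute the word vectors used
# for the similarity scores below.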
def code_sub(text1):
    import re
    texttemp = ''
    codefrag = text1
    #endsearch = -1
    if re.search('```', codefrag):
        endsearch = 1
        place = 'Startcode'
    else:
        return text1
    while endsearch != -1:
        #check STARTCODE
        #note: str.partition always returns a 3-tuple, so only the last branch below is reached
        check = codefrag.partition('```')
        #the STARTCODE is the only sign left: terminate
        if len(check) == 1:
            texttemp = texttemp + place + '. '
            endsearch = -1
        elif len(check) == 2:
            #is STARTCODE at the beginning or at the end?
            #the beginning: paste STARTCODE and continue
            if check[0] == '```':
                texttemp = texttemp + place
                codefrag = check[1]
                place = 'Endcode '
            #the end: the whole fragment is NOT code: paste STARTCODE and finish
            else:
                texttemp = texttemp + check[0] + place + '. '
                endsearch = -1
        #STARTCODE is in the middle of three fragments
        # the first fragment is NOT code, the rest should be checked
        else:
            texttemp = texttemp + check[0] + place
            codefrag = check[2]
            place = 'Endcode '
        #check ENDCODE
        if place == 'Endcode ':
            #there is a real ENDCODE...
            if re.search('```', codefrag):
                check = codefrag.partition('```')
                #the only remaining sign is the ENDCODE
                if len(check) == 1:
                    texttemp = texttemp + place
                    endsearch = -1
                elif len(check) == 2:
                    #is ENDCODE at the beginning or at the end?
                    #the beginning: paste ENDCODE and continue
                    if check[0] == '```':
                        texttemp = texttemp + place
                        codefrag = check[1]
                        place = 'Startcode'
                    #the end: the whole fragment is code: paste ENDCODE and finish
                    else:
                        texttemp = texttemp + place
                        endsearch = -1
                else:
                    #ENDCODE is in the middle of three fragments
                    # the first fragment is code, the rest should be checked
                    texttemp = texttemp + place
                    codefrag = check[2]
                    place = 'Startcode'
            else:
                #we just have one wrong markdown... we don't know what follows, so paste and finish...
                texttemp = texttemp + place + '. ' + codefrag
                endsearch = -1
        #no more START- or ENDCODE
        if not re.search('```', codefrag):
            texttemp = texttemp + codefrag
            endsearch = -1
    #print("code_sub terminated")
    return texttemp
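# Sentence-segmentation helpers: build punctuation features from a sentence-split corpus
# and train a classifier that decides whether a '.', '?' or '!' token ends a sentence
# (this mirrors the sentence-segmentation example in the NLTK book).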
def tokens_and_boundaries(dataset):
    tokens = []
    boundaries = set()
    offset = 0
    for sent in dataset:
        #print(sent)
        #sent = re.sub(r'^w*?,','',sent)
        tokens.extend(sent)
        offset += len(sent)
        boundaries.add(offset-1)
    return tokens, boundaries

def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prev-word': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

def featuredsets(dataset):
    tokens, boundaries = tokens_and_boundaries(dataset)
    boundaries = sorted(list(boundaries))
    return [(punct_features(tokens, i), i in boundaries) for i in range(1, len(tokens)-1) if tokens[i] in '.?!']
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        #print(i, word)
        if word == '...':
            word = '.'
        if word in '.?!' and classifier_segsen.classify(punct_features(words, i)) == True:  #1) if a punctuation mark AND the position is classified as a sentence boundary
            sents.append(words[start:i+1])  #2) add to sents all words between the start of the sentence and this boundary
            start = i+1  #3) move to the next sentence
    if start < len(words):  #4) if start is still less than the length of the whole paragraph...
        sents.append(words[start:])  #5) add the remaining words as a last sentence
    return sents
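# get_cosine/text_to_vector: bag-of-words cosine similarity between two texts,
# i.e. dot(vec1, vec2) / (|vec1| * |vec2|) over word-count vectors
# (see the stackoverflow reference near the top).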
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)
# similarity analyser currently only used to analyse similarity within the baq corpus...
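# tokenizer_baqcformats: tokenizes a baq corpus line and appends a position index to each
# word ('word_0', 'word_1', ...), restarting the count after punctuation or joining words
# (but/and/or); returns the raw and position-tagged word vectors plus their cosine similarity.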
def tokenizer_baqcformats(line):
    line = re.sub(r"^\w+\b,", r"", line)  # clean first word baq corpus
    mline_tok = nltk.word_tokenize(line)
    cc = 0
    for i, w in enumerate(mline_tok):
        if i == 0:
            mline_tok[i] = mline_tok[i]+'_'+str(cc)
        elif re.search(punctuation1, mline_tok[i-1]):
            cc = 0
            if not re.search(punctuation1, mline_tok[i]):
                mline_tok[i] = mline_tok[i]+'_'+str(cc)
            else:
                mline_tok[i] = mline_tok[i]
        else:
            if re.search(punctuation2, mline_tok[i]):
                cc -= 1
                mline_tok[i] = mline_tok[i]
            else:
                mline_tok[i] = mline_tok[i]+'_'+str(cc)
        cc += 1
    return line, mline_tok, text_to_vector(line), text_to_vector(' '.join(mline_tok)), get_cosine(text_to_vector(line), text_to_vector(' '.join(mline_tok)))
directory = "/DATA/DIRECTORY/"
raw = pickle.load(open(directory+"help.pkl", "rb"))
#re COMPILERS
WORD = re.compile(r'\w+')
punctuation1 = re.compile('^(,|;|:|!|\?|\.+?|but|and|or|-)$', re.IGNORECASE)
punctuation2 = re.compile("^(,|;|:|!|\?|\.+?|but|and|or|\d|\|+?|\s|[-#$%&)(*+/>=<@\\_}{~]+?)$", re.IGNORECASE)
#http://stackoverflow.com/questions/520031/whats-the-cleanest-way-to-extract-urls-from-a-string-using-python
link = re.compile(r'\s(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]+-?)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/[^\s]*)?\s', re.IGNORECASE)
#not working: code = re.compile(r'```.*```', re.IGNORECASE)
#TRAINING SET FOR SENTENCE SEGMENTATION
train_sentsegset = treebank.sents()
train_featsentsegset = featuredsets(train_sentsegset)
classifier_segsen = nltk.NaiveBayesClassifier.train(train_featsentsegset)
#CALLING AND PREPARING baq CORPUS
test = 0
baqlines = []
mbaqtoks = []
vectors_baqlines = []
vectors_mbaqtoks = []
cosines = []
with open(directory+'baqCorpus/baqCorpus.txt', 'r') as baq:
    line = baq.readline()
    while line:
        line, mline_tok, vecline, vecmline_tok, simcosine = tokenizer_baqcformats(line)
        baqlines.append(line)
        mbaqtoks.append(mline_tok)
        vectors_baqlines.append(vecline)
        vectors_mbaqtoks.append(vecmline_tok)
        cosines.append(simcosine)
        test += 1
        line = baq.readline()
raw = pickle.load(open(directory+"help.pkl", "rb"))
users = set()
for u in raw:
    if u['fromUser']['username'] not in users:
        users.add(u['fromUser']['username'])
print("USER DATA STARTED...")
datausers = {}
usertimes = {}
#http://stackoverflow.com/questions/2788871/python-date-difference-in-minutes
fmt = '%Y-%m-%dT%H:%M'
users = {'AdventureBear'}
count_du = 0
for username in users:
    #CAPTURING TEXTS AND SCORES PER USER
    # initialization of username and data collector (messages list)
    if username not in list(datausers.keys()):
        datausers[username] = []
    messages = []
    for i, elem in enumerate(raw):
        # loop over all messages and, if it is from username, capture that text and time
        if elem['fromUser']['username'] == username:
            text1 = elem['text']
            text1 = re.sub(link, ' Alinktoapage ', text1)
            #finding code...
            text1 = code_sub(text1)
            #print(text1)
            if WORD.search(text1) == None:
                text1 = 'Blankorcharacters '
            texttime = datetime.strptime(elem['sent'][:16], fmt)
            # prepare text for calculations and initialise comparisons
            vector1 = text_to_vector(text1)
            count = 0
            cosine = 0
            starttime = texttime
            for el in raw[i:]:
                # look at this and the following messages sent by username until there is a gap of more than 5 minutes between the current message and the last one
                # (check whether the person has been "on" for 5 minutes since this message)
                if el['fromUser']['username'] == username:
                    followingtime = datetime.strptime(el['sent'][:16], fmt)
                    if followingtime - starttime > timedelta(minutes=5):
                        break
                    text2 = el['text']
                    if WORD.search(text2) == None:
                        text2 = 'Blankorcharacters '
                    #similarity scoring: cosine
                    vector2 = text_to_vector(text2)
                    cosine += get_cosine(vector1, vector2)
                    count += 1
            # average all scores
            if count > 0: cosine = cosine/count
            text1_id = elem['id']
            # append to the messages list: the current text and id, the last text in the 5-minute window, the time at which this text was sent, and the score
            msg = (text1, text1_id, text2, elem['sent'][:16], cosine)
            messages.append(msg)
            #CAPTURING ON-OFF CHAT PERIODS
            if username not in list(usertimes.keys()):
                usertimes[username] = []
            if len(usertimes[username]) == 0:
                #first message sent: "on" message
                usertimes[username].append([elem['sent'][:16]])
                find_end = cosine
            # the person is STILL in the chat room
            elif find_end <= .99:
                if cosine > .99:
                    #if the person went "off" the chat room, attach to the "on" message and set the long activity to stop (find_end = 1)
                    usertimes[username][-1].append(elem['sent'][:16])
                    find_end = 1
            # it is a request to start chatting
            else:
                # a new first "on": attach to the collection as first
                usertimes[username].append([elem['sent'][:16]])
                # if a long "on" activity, set to long activity: find_end = 0
                if cosine <= .99:
                    find_end = 0
    #once all messages of THAT person have been visited, attach to datausers
    datausers[username] = messages
print("USER DATA COMPLETED...")
similarity_analysis = []
count_sa = 0
#originally sample_prop = .10
sample_prop = .10
print("SIM ANALYSIS DATA STARTED...")
for records in datausers['AdventureBear']:
    if random.random() > sample_prop: continue
    txt = records[0]
    if len(txt) == 1: continue
    txtid = records[1]
    tar_sentence_tok = nltk.word_tokenize(txt)
    tar_sentence_tok = tar_sentence_tok + ['END.']
    for sen in segment_sentences(tar_sentence_tok):
        #take the 'END.' out HERE!!
        if sen[-1] == 'END.': sen = sen[:-1]
        if sen == []: continue
        senstring = ' '.join(sen)
        vecsenstring = text_to_vector(senstring)
        baq_sim = []
        mod_baq_sim = []
        for i, vbaq in enumerate(vectors_baqlines):
            baq_sim.append(get_cosine(vecsenstring, vbaq))
            #mod_baq_sim.append(get_cosine(tokenizer_baqcformats(vectxt)[3], vectors_mbaqtoks[i]))
            mod_baq_sim.append(get_cosine(tokenizer_baqcformats(senstring)[3], vectors_mbaqtoks[i]))
        baq_cosine = sum(baq_sim)/len(baqlines)
        mod_baq_cosine = sum(mod_baq_sim)/len(baqlines)
        similarity_analysis.append([txt, txtid, senstring, baq_cosine, mod_baq_cosine, records[4]])
    #count_sa += 1
    #if count_sa > 100: break
print("SIM ANALYSIS DATA COMPLETED...")
pndfile = pandas.DataFrame(similarity_analysis, columns=['txt', 'txtid', 'sen', 'baqc', 'mbaqc', 'posc'])
g = sb.pairplot(pndfile, kind="reg")
plt.show()
g2 = sb.lmplot('baqc', 'mbaqc', pndfile)
result1 = smf.ols('mbaqc ~ baqc', data=pndfile).fit()
result2 = smf.ols('baqc ~ mbaqc', data=pndfile).fit()
#http://stackoverflow.com/questions/11869910/pandas-filter-rows-of-dataframe-with-operator-chaining
pndfile.query("baqc <= "+str(result2.params.Intercept) + "+" + str(result2.params.mbaqc) + "*mbaqc and mbaqc <= " + str(result1.params.Intercept) + "+" + str(result1.params.baqc) + "*baqc")
pndfile['regions'] = -1
pndfile.loc[(pndfile.mbaqc <= result1.params.Intercept + result1.params.baqc*pndfile.baqc) & (pndfile.baqc <= result2.params.Intercept + result2.params.mbaqc*pndfile.mbaqc), ['regions']] = 0
pndfile.loc[(pndfile.mbaqc <= result1.params.Intercept + result1.params.baqc*pndfile.baqc) & (pndfile.baqc > result2.params.Intercept + result2.params.mbaqc*pndfile.mbaqc), ['regions']] = 1
pndfile.loc[(pndfile.mbaqc > result1.params.Intercept + result1.params.baqc*pndfile.baqc) & (pndfile.baqc > result2.params.Intercept + result2.params.mbaqc*pndfile.mbaqc), ['regions']] = 2
pndfile.loc[(pndfile.mbaqc > result1.params.Intercept + result1.params.baqc*pndfile.baqc) & (pndfile.baqc <= result2.params.Intercept + result2.params.mbaqc*pndfile.mbaqc), ['regions']] = 3
plt.scatter(pndfile.baqc, pndfile.mbaqc, c=pndfile.regions)
plt.show()
pickle.dump(similarity_analysis, open(directory+'tests_chatan/sens_an.pkl', 'wb'))
pickle.dump(pndfile, open(directory+'tests_chatan/pandas_sens_an.pkl', 'wb'))
pndfile.to_csv(directory+'tests_chatan/pandas_sens_an.csv', sep='\t')
@evaristoc (Author) commented:

The main functionalities are:

  • Data cleaning, focusing on fragments like code, punctuation, stopwords, etc.
  • Tokenization (nltk word_tokenize module; nltk tutorial)
  • Sentence Segmentation (a Naive Bayes classifier trained on the nltk treebank corpus)
  • Word order analysis (attaching a number to each word according to its position in a sentence, after accounting for punctuation marks or joining words (but, and, etc.))
  • Cosine Similarity scoring (see the sketch after this list)
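As a rough illustration of the similarity scoring, here is a minimal, self-contained version of the `text_to_vector`/`get_cosine` pair used in the script above (the two example messages are invented for illustration only):

```python
import re, math
from collections import Counter

WORD = re.compile(r'\w+')

def text_to_vector(text):
    # bag-of-words count vector
    return Counter(WORD.findall(text))

def get_cosine(vec1, vec2):
    # dot product over the product of the Euclidean norms of the two count vectors
    common = set(vec1) & set(vec2)
    numerator = sum(vec1[x] * vec2[x] for x in common)
    denominator = math.sqrt(sum(v**2 for v in vec1.values())) * math.sqrt(sum(v**2 for v in vec2.values()))
    return float(numerator) / denominator if denominator else 0.0

# two invented chat messages, for illustration only
msg_a = "how do I center a div with css"
msg_b = "you can center the div with margin auto"
print(get_cosine(text_to_vector(msg_a), text_to_vector(msg_b)))
```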

To prepare the data for this project, the following was done:

  • All the messages by the selected camper (AdventureBear, with authorization) were extracted from the whole pool of messages during Jan-Jul 2015.
  • The code includes tagging of links and code sections, and also a positional analysis (not used for this project).
  • The pre-processing also included tokenization, sentence segmentation, and removal of punctuation marks and stopwords (a simplified sketch of these steps follows this list).
  • Then a SIM ANALYSIS (for similarity analysis) was carried out.
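Below is a minimal sketch of that per-message pre-processing, assuming a raw Gitter message string. The link and code regexes are simplified stand-ins for the full URL regex and the `code_sub` routine in the gist, and the stopword list comes from `nltk.corpus.stopwords` (not used in the script itself):

```python
import re
import nltk
from nltk.corpus import stopwords  # requires nltk.download('punkt') and nltk.download('stopwords')

link = re.compile(r'https?://\S+')          # simplified stand-in for the full URL regex in the gist
code = re.compile(r'```.*?```', re.DOTALL)  # simplified stand-in for code_sub

def preprocess(message):
    message = link.sub(' Alinktoapage ', message)         # tag links
    message = code.sub(' Startcode. Endcode ', message)   # tag code fragments
    tokens = nltk.word_tokenize(message)                  # tokenization
    stop = set(stopwords.words('english'))
    # drop punctuation marks and stopwords
    return [t for t in tokens if t.isalnum() and t.lower() not in stop]

print(preprocess("check ```<div>hi</div>``` and https://www.freecodecamp.com please"))
```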
