Skip to content

Instantly share code, notes, and snippets.

@shreya-singh-tech
Last active August 15, 2021 03:57
Show Gist options
  • Save shreya-singh-tech/ac9fe2e5445dff0ec81f818f051d840f to your computer and use it in GitHub Desktop.
Save shreya-singh-tech/ac9fe2e5445dff0ec81f818f051d840f to your computer and use it in GitHub Desktop.
Program to find cosine similarity
import spacy
import textstat
from textstat.textstat import textstatistics
import pandas as pd
import pysentiment2 as ps
import math
import re
from collections import Counter
import numpy as np
# Load the input table and derive, for every row, the previous 'value' seen
# for the same (series, class, tag, form, cik) combination.
df1 = pd.read_csv("/Final_Data_Tsv_form.tsv", sep="\t")

# Columns whose stripped string forms make up the per-row identifier.
KEY_COLUMNS = ['series', 'class', 'tag', 'form', 'cik']
key_parts = [df1[col].map(lambda cell: str(cell).strip()) for col in KEY_COLUMNS]

# Same identifier text as before: the stripped fields joined with no separator.
df1['unique_identifier_form'] = ["".join(parts) for parts in zip(*key_parts)]

# Group on the separate key fields rather than on the concatenated identifier:
# with no delimiter between fields, different field combinations can produce
# the same concatenated string and would wrongly share one lag sequence.
df1['value_lag'] = df1['value'].groupby(key_parts).shift()

# The first row of each group has no predecessor; store an empty string there.
df1['value_lag'] = df1['value_lag'].fillna('')
print("done")

# Tokenizer pattern used by text_to_vector: runs of word characters.
WORD = re.compile(r"\w+")
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        vec1: Mapping from term to numeric weight (e.g. a ``Counter``).
        vec2: Mapping from term to numeric weight.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms, as a float. Returns 0.0 when that product is zero
        (for example, when either vector is empty).
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    denominator = norm1 * norm2

    # Guard against division by zero for empty or all-zero vectors.
    if not denominator:
        return 0.0
    return numerator / denominator
def text_to_vector(text):
    """Tokenize *text* with the module-level WORD pattern and count the tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A ``Counter`` mapping each matched token to its number of occurrences.
    """
    return Counter(WORD.findall(text))
def _cell_to_vector(cell):
    """Turn one table cell into a token-count vector, treating missing cells as empty text."""
    # str(NaN) is "nan", which would otherwise be counted as a real word.
    return text_to_vector('' if pd.isna(cell) else str(cell))


# Token-count vectors for the current text and for the lagged text of each row.
df1['vector1'] = df1['value'].map(_cell_to_vector)
df1['vector2'] = df1['value_lag'].map(_cell_to_vector)

# Walk the two vector columns in lockstep instead of DataFrame.apply(axis=1),
# which builds a full row object for every row just to read two cells.
df1['cos_Sim_score'] = [
    get_cosine(current, previous)
    for current, previous in zip(df1['vector1'], df1['vector2'])
]

# Persist the intermediate table (including the vector columns) for the next step.
df1.to_csv("/home/singh.shreya1/Task-Wang/cos_sample_form.tsv", sep="\t")
# Second pass: blank out the similarity score on rows that have no lagged value
# to compare against, then export the identifying columns together with the score.

# Read back the intermediate file from the location it is written to above.
# The original read "/cos_sample_form.tsv", which is not the path that the
# scores were saved under.
df1 = pd.read_csv("/home/singh.shreya1/Task-Wang/cos_sample_form.tsv", sep="\t")

# Empty-string lags are parsed back as NaN by read_csv's default NA handling,
# so a null value_lag marks a row with no previous value.
lag_missing = df1["value_lag"].isnull()
# Column name kept for compatibility; True means the lag is missing.
df1["is_equal"] = lag_missing

# Cast to object first so the " " placeholder can sit alongside float scores.
df1['cos_sim_score'] = df1['cos_Sim_score'].astype(object).mask(lag_missing, " ")

df2 = df1[['id', 'adsh', 'series', 'class', 'tag', 'cik', 'filed', 'form', 'cos_sim_score']].copy()
df2.to_csv("/Final_Data_Tsv_with_cos_sim.tsv", sep="\t")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment