Skip to content

Instantly share code, notes, and snippets.

@shreya-singh-tech
Created August 14, 2021 23:35
Show Gist options
  • Save shreya-singh-tech/c1de6c68fa875f3c92ba263c7db89240 to your computer and use it in GitHub Desktop.
Save shreya-singh-tech/c1de6c68fa875f3c92ba263c7db89240 to your computer and use it in GitHub Desktop.
# Load the 2010 TSV extract, keep one row per
# (adsh, series, class, tag, cik) combination, retain only rows whose
# stripped 'series' text is exactly 10 characters long, and write the
# result to a new TSV file.
import pandas as pd


def _stripped_text(value):
    """Return ``str(value)`` with surrounding whitespace removed."""
    return str(value).strip()


def _stripped_length(value):
    """Return the length of ``str(value)`` after stripping whitespace."""
    return len(_stripped_text(value))


# read csv data
df1 = pd.read_csv('/Data_2010_Tsv_all.tsv', sep='\t')

# Expose the original row position as an explicit 'id' column.
df1.reset_index(inplace=True)
df1 = df1.rename(columns={'index': 'id'})

# Length of each value's text form; used below as the final sort key so the
# row with the longest value comes first among otherwise-equal sort keys.
df1['value_length'] = df1['value'].map(_stripped_length)

# filter unique value
# The identifier is the plain concatenation (no separator) of the stripped
# key columns, in this exact order.
unique_identifier = df1['adsh'].map(_stripped_text)
for key_column in ('series', 'class', 'tag', 'cik'):
    unique_identifier = unique_identifier + df1[key_column].map(_stripped_text)
df1['unique_identifier'] = unique_identifier

df1 = df1.sort_values(
    ['series', 'class', 'tag', 'form', 'filed', 'cik', 'adsh', 'value_length'],
    ascending=[True, True, True, True, True, True, True, False],
)

# drop_duplicates keeps the first occurrence, i.e. the first row in the
# sorted order above for each identifier. Copy so that adding a column
# below writes to an independent frame instead of a possible view of df1.
df2 = df1.drop_duplicates(subset='unique_identifier').copy()

# filter series col
df2['series_length'] = df2['series'].map(_stripped_length)
# NOTE(review): 10 is presumably the expected width of a valid series id;
# confirm against the data dictionary.
df_fil = df2[df2['series_length'] == 10]

df_fil.to_csv("/Final_Data_Tsv_form.tsv", sep="\t")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment