Skip to content

Instantly share code, notes, and snippets.

@amankharwal
Created September 27, 2020 15:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amankharwal/5f90fed3c60a06515f60a7330df24777 to your computer and use it in GitHub Desktop.
Save amankharwal/5f90fed3c60a06515f60a7330df24777 to your computer and use it in GitHub Desktop.
# Substrings that clean_data() blanks out of a name (presumably relation
# markers such as "son of" plus common separators -- confirm with the data).
# Order matters: the multi-character markers must be replaced before the
# bare '/' entry, otherwise they would never match.
repl_list = ['s/o', 'd/o', 'w/o', '/', '&', ',', '-']


def clean_data(name):
    """Return a normalised, lower-case, ASCII-only version of *name*.

    Steps, in order:
      1. Convert the value to ``str`` and lower-case it.
      2. Drop every character whose code point is 128 or above.
      3. Replace each entry of ``repl_list`` with a space.
      4. Keep only the text before the first ``'@'``, if any.
      5. Collapse every run of whitespace to a single space and trim
         both ends.

    Args:
        name: The raw name; any object accepted by ``str()``.

    Returns:
        The cleaned name as a ``str`` (possibly empty).
    """
    name = str(name).lower()
    name = ''.join(ch for ch in name if ord(ch) < 128)
    for repl in repl_list:
        name = name.replace(repl, " ")
    # partition() returns the whole string as its first item when there is
    # no '@', so no membership test is needed.
    name = name.partition('@')[0]
    # split() without an argument discards empty tokens, so the gaps left
    # by the replacements above do not survive as double/trailing spaces
    # (split(" ") would keep them and break later de-duplication).
    return " ".join(name.split())
def remove_records(merged_data):
    """Filter out rows whose name looks unusable.

    A row is flagged when any of the following holds:
      * its ``name`` contains the substring ``'with'`` (a missing name is
        flagged as well);
      * its ``count_words`` is 5 or more;
      * its ``count_words`` is 0;
      * its ``name`` contains a digit.

    Side effect: a ``delete`` column (1 = flagged, 0 = kept) is written to
    *merged_data* in place, overwriting any existing column of that name.

    Args:
        merged_data: DataFrame with a string column ``name`` and a numeric
            column ``count_words``.

    Returns:
        A new DataFrame holding only the unflagged rows. It still carries
        the ``delete`` column (all zeros) and is an independent copy, so
        callers can add columns to it without chained-assignment warnings.
    """
    names = merged_data['name']
    word_counts = merged_data['count_words']

    flagged = (
        # na=True keeps missing names flagged, as a plain substring test
        # (regex=False) rather than a pattern match.
        names.str.contains('with', regex=False, na=True)
        | (word_counts >= 5)
        | (word_counts == 0)
        # na=False: a missing name is not treated as containing a digit.
        | names.str.contains(r'\d', na=False)
    )

    merged_data['delete'] = flagged.astype('int64')
    cleaned_data = merged_data[merged_data['delete'] == 0].copy()
    return cleaned_data
# Stack male_data and female_data row-wise into a single frame.
merged_data = pd.concat((male_data, female_data), axis=0)
merged_data['name'] = merged_data['name'].apply(clean_data)
# Number of whitespace-separated words per cleaned name; remove_records()
# filters on this column.
merged_data['count_words'] = merged_data['name'].str.split().apply(len)
cleaned_data = remove_records(merged_data)
# Keep the first row for each distinct name. The label is attached with
# .assign(), which builds a new frame, instead of item assignment on a
# frame derived from a selection (that can raise SettingWithCopyWarning).
indian_cleaned_data = (
    cleaned_data[['name', 'count_words']]
    .drop_duplicates(subset='name', keep='first')
    .assign(label='indian')
)
# Bare expression: displays the row count in a notebook/REPL and has no
# effect when run as a script.
len(indian_cleaned_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment