Skip to content

Instantly share code, notes, and snippets.

@porimol
Created December 8, 2017 08:42
Show Gist options
  • Save porimol/57c1a10a0624166aa8aff0218d797d73 to your computer and use it in GitHub Desktop.
Save porimol/57c1a10a0624166aa8aff0218d797d73 to your computer and use it in GitHub Desktop.
import pandas as pd
# Fill the gaps in 'Column2' of test.csv and write the result to data.csv.
data = pd.read_csv('test.csv')

# The replacement value is the same for every row, so compute it once
# instead of once per row inside a row-wise apply().
column2_mean = round(data['Column2'].mean(), 2)

# Rows whose first column is exactly 'yes' get the rounded mean; every other
# row gets -1. The first column is addressed through .iloc because integer
# keys on a label-indexed row (cols[0]) are deprecated positional access.
# To compare against a different column, replace data.iloc[:, 0] with
# data['<column name>'].
# NOTE(review): the comparison is case-sensitive, so 'Yes' does not match --
# confirm against the spelling actually used in test.csv.
first_column_is_yes = data.iloc[:, 0] == 'yes'
fill_values = pd.Series(-1.0, index=data.index).mask(first_column_is_yes, column2_mean)

# fillna() only touches the missing cells; existing values are left as they are.
data['Column2'] = data['Column2'].fillna(fill_values)
data.to_csv(r'data.csv', index=False)
@iprocheta
Copy link

Where do I specify Column1? If I put data['column1'] in place of cols[0], I get an error. I want to make it dynamic so that I can compare Column2 with any column, not only the column at index zero.

@iprocheta
Copy link

In my code I calculated the trimmed mean of the values in Column2. I want to replace the missing values with that trimmed mean.

@iprocheta
Copy link

https://pastebin.com/kGQ6T3ET

I'm getting a syntax error. Why?

@porimol
Copy link
Author

porimol commented Dec 9, 2017

@pp227,
can you show me the error(s)?

@iprocheta
Copy link

File "", line 14
data.loc[mask,'Cholesterol medicine years']=pd.to_numeric(data['Cholesterol medicine years'],errors='coerce').fillna(trimmed)
^
SyntaxError: invalid syntax

@iprocheta
Copy link

capture

@porimol
Copy link
Author

porimol commented Dec 9, 2017

@pp227 have a look. I have added the solution here, along with two sample outputs.

from scipy import stats as st
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
# Fill missing 'Diabetes_medicine_years' values for diabetic patients with the
# 5% trimmed mean of the known values, then write the table to data.csv.
data = pd.read_csv('training.csv')

# Convert the column to floats. Every non-numeric marker ('VALID', 'No',
# 'NA', ...) becomes NaN, i.e. is treated as missing, instead of raising.
years = pd.to_numeric(data['Diabetes_medicine_years'], errors='coerce').astype(float)

# Trimmed mean over the known values only. dropna() removes the gaps directly,
# so no sentinel value is needed and genuine data can never be mistaken for one.
trimmed = st.trim_mean(years.dropna().to_numpy(), .05)

# Case-insensitive 'yes' check; a missing Diabetes entry becomes the string
# 'nan' and therefore does not match.
has_diabetes = data['Diabetes'].astype(str).str.lower() == 'yes'

# Only missing values of diabetic rows are replaced. Rows without diabetes
# keep NaN, and existing values are never overwritten.
data['Diabetes_medicine_years'] = years.mask(has_diabetes & years.isna(), trimmed)
data.to_csv('data.csv', index=False)

# Bare expression: displays the frame when run as a notebook cell.
data

Sample Output 1

Hypertension Hypertension_medicine_years Diabetes Diabetes_medicine_years
0 Yes 1 No NaN
1 Yes 6 Yes 13.00
2 Yes VALID No NaN
3 NaN NaN NaN NaN
4 Yes 2 No NaN
5 Yes 6 No NaN
6 Yes 14 Yes 14.00
7 Yes 5 No NaN
8 Yes 0.5 Yes 2.00
9 No NaN No NaN
10 Yes VALID Yes 3.00
11 No NaN No NaN
12 No NaN No NaN
13 Yes VALID No NaN
14 No NaN No NaN
15 Yes 2 No NaN
16 No NaN No NaN
17 No NaN No NaN
18 Yes VALID Yes 7.00
19 Yes VALID Yes 5.95
20 No NaN Yes 7.00
21 No NaN No NaN
22 Yes 4 No NaN
23 No NaN Yes 3.00
24 No NaN No NaN
25 Yes 2 Yes 5.95
26 Yes 5 No NaN
27 Yes 3 Yes 2.00
28 Yes 10 No NaN
29 Yes 2 Yes 8.00
30 Yes 0.5 Yes 0.50
31 No NaN No NaN
32 Yes 10 No NaN
33 No NaN No NaN
from scipy import stats as st
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
# Fill missing 'Diabetes_medicine_years' values: diabetic rows get the 5%
# trimmed mean of the known values, all other rows get an empty string.
# The result is written to data.csv.
data = pd.read_csv('training.csv')

# Convert the column to floats. Every non-numeric marker ('VALID', 'No',
# 'NA', ...) becomes NaN, i.e. is treated as missing, instead of raising.
years = pd.to_numeric(data['Diabetes_medicine_years'], errors='coerce').astype(float)

# Trimmed mean over the known values only. dropna() removes the gaps directly,
# so no sentinel value is needed and genuine data can never be mistaken for one.
trimmed = st.trim_mean(years.dropna().to_numpy(), .05)

# Case-insensitive 'yes' check; a missing Diabetes entry becomes the string
# 'nan' and therefore does not match.
has_diabetes = data['Diabetes'].astype(str).str.lower() == 'yes'
is_missing = years.isna()

# The column has to hold both numbers and '' afterwards, so switch it to
# object dtype before assigning. Existing values are never overwritten.
filled = years.astype(object)
filled.loc[is_missing & has_diabetes] = trimmed
# Drop the next line to keep NaN (instead of '') for rows without diabetes.
filled.loc[is_missing & ~has_diabetes] = ''

data['Diabetes_medicine_years'] = filled
data.to_csv('data.csv', index=False)

# Bare expression: displays the frame when run as a notebook cell.
data

Sample Output 2

Hypertension Hypertension_medicine_years Diabetes Diabetes_medicine_years
0 Yes 1 No
1 Yes 6 Yes 13
2 Yes VALID No
3 NaN NaN NaN
4 Yes 2 No
5 Yes 6 No
6 Yes 14 Yes 14
7 Yes 5 No
8 Yes 0.5 Yes 2
9 No NaN No
10 Yes VALID Yes 3
11 No NaN No
12 No NaN No
13 Yes VALID No
14 No NaN No
15 Yes 2 No
16 No NaN No
17 No NaN No
18 Yes VALID Yes 7
19 Yes VALID Yes 5.95
20 No NaN Yes 7
21 No NaN No
22 Yes 4 No
23 No NaN Yes 3
24 No NaN No
25 Yes 2 Yes 5.95
26 Yes 5 No
27 Yes 3 Yes 2
28 Yes 10 No
29 Yes 2 Yes 8
30 Yes 0.5 Yes 0.5
31 No NaN No
32 Yes 10 No
33 No NaN No

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment