Skip to content

Instantly share code, notes, and snippets.

@sallos-cyber
Last active February 21, 2022 17:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sallos-cyber/a410c8986eec29b14e2c9d039cab5d56 to your computer and use it in GitHub Desktop.
Save sallos-cyber/a410c8986eec29b14e2c9d039cab5d56 to your computer and use it in GitHub Desktop.
def _parse_byte_count(raw):
    """Convert one raw ``Bytes`` cell into an integer number of bytes.

    Missing values count as 0.  A value containing ``M`` is read as a
    megabyte figure and multiplied by 1024 * 1024; anything else is read
    as a plain number.
    """
    if pd.isna(raw):
        return 0
    text = str(raw).strip()
    if 'M' in text:
        # Remove the unit marker itself rather than slicing by fixed
        # positions, so the number survives whatever padding surrounds it.
        return int(float(text.replace('M', '')) * 1024 * 1024)
    try:
        return int(text)
    except ValueError:
        # A column that pandas parsed as float stringifies as "1234.0".
        return int(float(text))


def callPreprocessData(fn='/home/someusr/Downloads/flows.csv'):
    """Clean a flow CSV export and save it as ``flows_processed.csv``.

    The input must provide the columns ``Bytes``, ``Datefirstseen``,
    ``Datefirstseenunix``, ``Duration``, ``DstPt``, ``SrcIPAddr``,
    ``DstIPAddr`` and ``Proto``.

    Steps:
      * ``Bytes`` is normalised to integer bytes (missing -> 0, values
        written in megabytes are expanded).
      * Rows without ``Datefirstseen`` are dropped; the column is parsed
        as a datetime and becomes the index.
      * ``Duration``, ``DstPt`` and ``Datefirstseenunix`` are cast to int.
      * ``SrcIPAddr``, ``DstIPAddr`` and ``Proto`` are converted to
        strings with surrounding whitespace removed.

    Args:
        fn: Path of the CSV file to read.

    Returns:
        None. The result is written to ``flows_processed.csv`` in the
        current working directory, and the first three rows are printed
        before and after processing.
    """
    data = pd.read_csv(fn)
    print(data.head(3))

    # nfdump writes bytes as integers but sometimes switches to a
    # megabyte figure (marked with "M"); bring everything back to bytes.
    # NOTE(review): other unit markers are not handled here - confirm
    # whether the export can contain them.
    data['Bytes'] = data['Bytes'].map(_parse_byte_count).astype(int)

    data.dropna(subset=['Datefirstseen'], inplace=True)
    data['Datefirstseen'] = pd.to_datetime(data['Datefirstseen'])
    data = data.set_index('Datefirstseen')

    for column in ('Duration', 'DstPt', 'Datefirstseenunix'):
        data[column] = data[column].astype(int)

    # Strip surrounding whitespace from the address and protocol columns.
    for column in ('SrcIPAddr', 'DstIPAddr', 'Proto'):
        data[column] = data[column].astype(str).str.strip()

    print('I am now saving the file')
    print(data.head(3))
    data.to_csv('flows_processed.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment