# sallos-cyber/preprocess_flows.py

## preprocess_flows.py

def callPreprocessData(fn='/home/someusr/Downloads/flows.csv'):
    """Clean an nfdump flow export and save it as ``flows_processed.csv``.

    The CSV at *fn* is loaded, then:

    * ``Bytes`` is normalised to whole byte counts: missing values become 0
      and entries written with an ``M`` suffix (for example ``"  1.5 M"``)
      are multiplied by 1024 * 1024;
    * rows without a ``Datefirstseen`` value are dropped, the column is
      parsed as datetimes and becomes the index;
    * ``Duration``, ``DstPt`` and ``Datefirstseenunix`` are cast to int;
    * ``SrcIPAddr``, ``DstIPAddr`` and ``Proto`` are converted to strings
      with leading/trailing whitespace removed.

    The first three rows are printed before and after processing, and the
    result is written to ``flows_processed.csv`` in the current working
    directory.

    Args:
        fn: Path of the CSV file to read.

    Returns:
        None. The result is only written to disk.

    Raises:
        KeyError: If one of the columns named above is missing.
        ValueError: If a value cannot be converted to the required type.
    """
    # Imported here because no module-level pandas import is visible in
    # this file.
    import pandas as pd

    data = pd.read_csv(fn)
    print(data.head(3))

    # nfdump writes bytes as an integer but sometimes abbreviates the value
    # to megabytes ("<number> M"). Strip padding and the suffix instead of
    # slicing at fixed positions, so the result does not depend on how many
    # pad characters surround the number.
    # NOTE(review): only the 'M' suffix is handled, scaled by 1024 * 1024 as
    # before - confirm nfdump's scaling and whether other suffixes occur.
    byte_text = data['Bytes'].fillna(0).astype(str).str.strip()
    in_megabytes = byte_text.str.endswith('M', na=False)
    magnitude = pd.to_numeric(byte_text.str.rstrip('M').str.strip())
    byte_count = magnitude.where(~in_megabytes, magnitude * 1024 * 1024)
    # Fractional byte counts are truncated by the integer cast.
    data['Bytes'] = byte_count.astype(int)

    data = data.dropna(subset=['Datefirstseen'])
    data['Datefirstseen'] = pd.to_datetime(data['Datefirstseen'])
    data = data.set_index('Datefirstseen')

    for column in ('Duration', 'DstPt', 'Datefirstseenunix'):
        data[column] = data[column].astype(int)

    # Remove leading/trailing whitespace from the textual columns.
    for column in ('SrcIPAddr', 'DstIPAddr', 'Proto'):
        data[column] = data[column].astype(str).str.strip()

    print('I am now saving the file')
    print(data.head(3))

    data.to_csv('flows_processed.csv')