Skip to content

Instantly share code, notes, and snippets.

View elliotgunn's full-sized avatar

Elliot Gunn elliotgunn

View GitHub Profile
import dash
import dash_deck.DeckGL as dgl
import pydeck
# Create the Dash application object used by the rest of the script.
app = dash.Dash()
# Source of the input data: the Vancouver blocks GeoJSON file hosted in the
# deck.gl example-data repository.
DATA_URL = (
    "https://raw.githubusercontent.com/visgl/deck.gl-data/master/"
    "examples/geojson/vancouver-blocks.json"
)
# Generate polygon layer
import plotly.express as px
import geopandas as gpd
import shapely.geometry
import numpy as np
import wget
# Fetch the zipped rivers / lake-centerlines shapefile archive.
wget.download("https://plotly.github.io/datasets/ne_50m_rivers_lake_centerlines.zip")
import numpy as np
import pandas as pd
import dask.dataframe as dd
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.model_selection import train_test_split as tts
# Load the encoded trade dataframe from S3; the glob picks up every CSV part
# file stored under the trade_encoded.csv prefix.
df = dd.read_csv(
    "s3://folder/subfolder/trade_encoded.csv/*.csv",
)
# reads df to memory
from iso3166 import countries
# load trade data
# (parenthesised so the chained .parquet() call belongs to the same statement;
# the original line break had no continuation and did not parse)
df_trade = (
    spark.read.options(header=True, inferSchema=True)
    .parquet('s3://folder/subfolder/trade_data_clean.parquet')
)
# load known arms traders
df_arms = (
    spark.read.options(header=True, inferSchema=True)
    .parquet('s3://folder/subfolder/arms_traders_clean.parquet')
)
# creates a list of all CONSIGNEE_COUNTRY values for known arms dealers
# (collect() brings the distinct rows to the driver; iterate them directly)
arms_CONSIGNEE_COUNTRY = [
    row['CONSIGNEE_COUNTRY']
    for row in df_arms.select('CONSIGNEE_COUNTRY').distinct().collect()
]
# creates a new dataframe for all the encoded columns
# (plain assignment: df_encoded starts out referring to the same object as df_trade)
df_encoded = df_trade
# create user defined functions to apply to each column
# yields 1 when the value is one of the known arms-dealer countries, else 0
# NOTE(review): no returnType is passed to F.udf, so the result column uses the
# udf default type (StringType per the PySpark docs) - confirm that is intended
func_CONSIGNEE_COUNTRY = F.udf(lambda x: 1 if (x in arms_CONSIGNEE_COUNTRY) else 0)
# defines regex expressions to apply to trade data
# Raw strings are used so the backslash escapes reach the regex engine intact:
# in a plain literal '\b' is a backspace character (not a word boundary) and
# '\d' is an invalid string escape sequence.
regexINN = r'(\d{8,12}|None|null|0|00)'
regexNOTDIGIT = r'[^0-9]'
# [a-zA-Z] rather than [a-zA-z]: the A-z range also matched [ \ ] ^ _ and `
regexADDRESS = r'(null|None)|(\b[a-zA-Z]{1,3}\b)|(\d{2,})'
regex2CHAR = r'(None|[a-zA-Z]{2}|\d{2})'
regexDATE = r'(None|null|\d{4}-\d{2}-\d{2})'
# applying regex to the trade data
df_trade = df_trade.filter(df_trade['CONSIGNOR_NAME'].rlike(regexNOTDIGIT))\
.filter(df_trade['DECLARATION_NUMBER'].rlike(regexDECLARATION_NUMBER))\