Skip to content

Instantly share code, notes, and snippets.

View ivopbernardo's full-sized avatar

ivopbernardo

View GitHub Profile
@ivopbernardo
ivopbernardo / sakila_data_manipulation.py
Created August 20, 2019 19:44
Examples of extracting data using Pandas
# Extract data from the MySQL "sakila" sample database over ODBC.
import getpass

import pandas as pd
import pyodbc

# Ask for user and password input. getpass avoids echoing the password
# to the terminal (the original input() displayed it as it was typed).
user = input('Provide user: \n')
pwd = getpass.getpass('Provide password: \n')

# Make connection to MySQL on localhost (default port 3306).
# NOTE(review): pyodbc has no parameter substitution for connection
# strings, so the credentials are interpolated directly — never log the
# resulting string. The assembled string is byte-identical to the
# original %-formatted version.
mydb = pyodbc.connect(
    "DRIVER={MySQL ODBC 8.0 ANSI Driver};"
    " SERVER=localhost; PORT=3306;"
    "DATABASE=sakila;"
    f" UID={user}; PASSWORD={pwd};"
)
@ivopbernardo
ivopbernardo / cleaning_data.R
Last active January 3, 2021 13:41
cleaning FBI crime data
# Loading readxl library (provides read_xls for legacy .xls workbooks)
library(readxl)
# Clean an FBI crime-data .xls export given its file path.
# NOTE(review): the function body is truncated in this view — the closing
# brace and any further cleaning steps are not visible here.
clean_crime_data <- function(path) {
# Load the Data
crime_data <- read_xls(path)
# Assigning colnames: row 3 of the sheet is used as the header row
# (presumably the first rows are title/metadata lines — verify against
# the source spreadsheet).
colnames(crime_data) <- crime_data[3,]
@ivopbernardo
ivopbernardo / text_representation.py
Created April 23, 2021 16:10
Python Text Representation
# Import sklearn vectorizers and pandas.
# CountVectorizer builds bag-of-words counts; TfidfVectorizer builds
# TF-IDF-weighted vectors.
import pandas as pd
from sklearn.feature_extraction.text import (
CountVectorizer,
TfidfVectorizer
)
# Defining our sentence examples to vectorize.
# NOTE(review): the list contents are truncated in this view.
sentence_list = [
@ivopbernardo
ivopbernardo / stemming_example.py
Last active May 18, 2021 16:51
Examples around NLTK stemming
# Compare three NLTK stemming algorithms on the same example sentence.
from nltk.tokenize import word_tokenize
from nltk.stem import (
    PorterStemmer,
    SnowballStemmer,
    LancasterStemmer,
)

# One instance of each stemmer. Snowball is language-aware, so it is
# constructed for English explicitly; Porter and Lancaster are
# English-only by design.
porter = PorterStemmer()
snowball = SnowballStemmer(language='english')
lanc = LancasterStemmer()

# Example sentence containing several inflected forms of the same root.
sentence_example = 'This is definitely a controversy as the attorney labeled the case "extremely controversial"'
@ivopbernardo
ivopbernardo / cooccurrence_example.py
Created August 16, 2021 12:49
word_vectors_cooccurrence
# Dependencies for building word co-occurrence vectors from Wikipedia text.
import wikipedia
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the content of a Wikipedia page for downstream tokenization.
# NOTE(review): the function body and the remainder of the docstring are
# truncated in this view.
def retrieve_page(page_name: str) -> list:
'''
Retrieves page data from wikipedia
@ivopbernardo
ivopbernardo / rf_demo.R
Created February 4, 2022 18:18
Random Forests vs. Decision Trees
# Random Forests vs. Decision Trees demo.
# Prerequisites:
#   * train.csv downloaded from
#     https://www.kaggle.com/c/titanic/data?select=train.csv
#     and placed in the working directory
#   * the ROCR and rpart packages installed
# Load the Titanic training dataset from disk
titanic <- read.csv(file = './train.csv')
@ivopbernardo
ivopbernardo / geoprocess_dd_post.py
Last active March 11, 2022 14:00
Locate your Data and Boost it with Geo-Processing Post
# Getting Latitude and Longitude from Nominatim (OpenStreetMap's geocoder).
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
# Nominatim requires a distinctive user_agent identifying the application.
geocoder = Nominatim(user_agent="FindAddress")
# Wrap the geocode call so it fires at most once per second and returns
# None instead of raising when a lookup fails.
# NOTE(review): the closing parenthesis of this call is truncated in
# this view.
geocode = RateLimiter(
geocoder.geocode,
min_delay_seconds = 1,
return_value_on_exception = None
@ivopbernardo
ivopbernardo / convert_dataframe_to_geodataframe.py
Created May 5, 2022 21:34
DareData Blog Post about GeoData
import geopandas as gpd

# Promote the plain DataFrame to a GeoDataFrame: each row gets a Point
# geometry built from its (longitude, latitude) pair, declared in the
# WGS84 geographic coordinate system (epsg:4326).
house_points = gpd.points_from_xy(house_data.longitude, house_data.latitude)
house_data_gdf = gpd.GeoDataFrame(house_data, geometry=house_points, crs="epsg:4326")
# Read data directly from the portuguese gov website (zipped shapefile).
parishes_url = "zip+https://dados.gov.pt/s/resources/freguesias-de-portugal/20181112-195834/cont-aad-caop2017.zip"
parishes = gpd.read_file(parishes_url)
# Left Join the house data to the parishes data, if house is `within` parish.
# FIX: the `op=` keyword was deprecated in GeoPandas 0.10 and removed in
# 1.0; `predicate=` is the supported spelling with identical behavior.
house_data_gdf = gpd.sjoin(house_data_gdf, parishes, how="left", predicate="within")
# Public Hospitals in Lisbon, served as GeoJSON by the ArcGIS open-data portal.
hospitals_url = (
    "https://opendata.arcgis.com/datasets/172678f193144512860a397fde991361_4.geojson"
)
# Load the layer, then reproject it to Web Mercator (epsg:3857).
hospitals_gdf = gpd.read_file(hospitals_url)
hospitals_gdf = hospitals_gdf.to_crs(epsg=3857)
# Quick sanity peek at the first rows.
hospitals_gdf.head()
# Buffer the house locations by 1km
# NOTE(review): house_data_gdf is still in epsg:4326 at this point (set
# when the GeoDataFrame was built), so .buffer(1000) would buffer by
# 1000 *degrees* unless the frame is reprojected to a metric CRS in the
# truncated part of this snippet — confirm before relying on this.
# NOTE(review): the tail of this expression is truncated in this view.
house_data_gdf_buffer = (
house_data_gdf
.copy()
.assign(geometry_buffer = lambda d: d.buffer(1000))