ivopbernardo

## nltk_intro.py
# Getting started with NLTK scripts - used in blog post:
# https://towardsdatascience.com/getting-started-with-nltk-eb4ed6eb7a37

from nltk import tokenize

python_wiki = '''
Python is a high-level, interpreted, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.
Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.
Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.[33] Python 2.0 was released in 2000 and introduced new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support. Python 3.0, released in 2008, was a major revision that is not completely

## decisiontree.R
# Training a decision tree in R - used in blog post:
# https://medium.com/codex/data-science-tutorials-training-a-decision-tree-using-r-d6266936d86

library(dplyr)
library(rpart)
library(rpart.plot)
library(caret)
library(Metrics)
library(ggplot2)

## geoprocess_dd_post.py
# Getting Latitude and Longitude from Nominatim

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geocoder = Nominatim(user_agent="FindAddress")
geocode = RateLimiter(
    geocoder.geocode,
    min_delay_seconds = 1,
    return_value_on_exception = None

## xgboostr.r
# Training an XGBoost in R - used in blog post:
# https://towardsdatascience.com/data-science-tutorials-training-an-xgboost-using-r-cf3c00b1425

library(dplyr)
library(xgboost)
library(Metrics)
library(ggplot2)

# Read london_merged.csv (the London bike data) from the current
# working directory into a data frame.
london_bike <- read.csv(file = './london_merged.csv')

## randomforests.r
# Training a Random Forest in R - used in blog post:
# https://towardsdatascience.com/data-science-tutorials-training-a-random-forest-in-r-a883cc1bacd1

library(dplyr)
library(randomForest)
library(ranger)
library(Metrics)

# Read london_merged.csv (the London bike data) from the current
# working directory into a data frame.
london_bike <- read.csv(file = './london_merged.csv')

## rf_demo.R
# Don't forget to download the train.csv file
# to make this gist work.

# Download it at: https://www.kaggle.com/c/titanic/data?select=train.csv

# You also need to install the ROCR and rpart packages

# Read the Titanic training set (train.csv, expected in the current
# working directory) into a data frame.
titanic <- read.csv(file = './train.csv')

## cooccurrence_example.py
import wikipedia
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_page(page_name: str) -> list:
    '''
    Retrieves page data from wikipedia

## stemming_example.py
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

# Create one instance of each NLTK stemmer imported above.
porter = PorterStemmer()
lanc = LancasterStemmer()
# The Snowball stemmer is configured for English.
snowball = SnowballStemmer('english')

# Example sentence; it contains the related word forms
# "controversy" and "controversial".
sentence_example = 'This is definitely a controversy as the attorney labeled the case "extremely controversial"'

## text_representation.py
# Import sklearn vectorizers and pandas
import pandas as pd
from sklearn.feature_extraction.text import (
  CountVectorizer,
  TfidfVectorizer
)


# Defining our sentence examples
sentence_list = [

## cleaning_data.R
# Loading readxl library
library(readxl)

clean_crime_data <- function(path) {
  # Load the Data
  crime_data <- read_xls(path)

  # Assigning colnames
  colnames(crime_data) <- crime_data[3,]
	# Getting started with NLTK scripts - used in blog post:
	# https://towardsdatascience.com/getting-started-with-nltk-eb4ed6eb7a37

	from nltk import tokenize

	python_wiki = '''
	Python is a high-level, interpreted, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.
	Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.
	Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.[33] Python 2.0 was released in 2000 and introduced new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support. Python 3.0, released in 2008, was a major revision that is not completely
	# Training a decision tree in R - used in blog post:
	# https://medium.com/codex/data-science-tutorials-training-a-decision-tree-using-r-d6266936d86

	library(dplyr)
	library(rpart)
	library(rpart.plot)
	library(caret)
	library(Metrics)
	library(ggplot2)
	# Getting Latitude and Longitude from Nominatim

	from geopy.geocoders import Nominatim
	from geopy.extra.rate_limiter import RateLimiter

	geocoder = Nominatim(user_agent="FindAddress")
	geocode = RateLimiter(
	geocoder.geocode,
	min_delay_seconds = 1,
	return_value_on_exception = None
	# Training an XGBoost in R - used in blog post:
	# https://towardsdatascience.com/data-science-tutorials-training-an-xgboost-using-r-cf3c00b1425

	library(dplyr)
	library(xgboost)
	library(Metrics)
	library(ggplot2)

	# Load london bike csv
	london_bike <- read.csv('./london_merged.csv')
	# Training a Random Forest in R - used in blog post:
	# https://towardsdatascience.com/data-science-tutorials-training-a-random-forest-in-r-a883cc1bacd1

	library(dplyr)
	library(randomForest)
	library(ranger)
	library(Metrics)

	# Load london bike csv
	london_bike <- read.csv('./london_merged.csv')
	# Don't forget to download the train.csv file
	# to make this gist work.

	# Download it at: https://www.kaggle.com/c/titanic/data?select=train.csv

	# You also need to install ROCR and rpart libraries

	# Reading the titanic train dataset
	titanic <- read.csv('./train.csv')
	import wikipedia
	import pandas as pd
	import numpy as np
	import string
	from nltk.tokenize import word_tokenize
	from sklearn.metrics.pairwise import cosine_similarity

	def retrieve_page(page_name: str) -> list:
	'''
	Retrieves page data from wikipedia
	from nltk.tokenize import word_tokenize
	from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

	porter = PorterStemmer()
	snowball = SnowballStemmer(language='english')
	lanc = LancasterStemmer()

	sentence_example = (
	'This is definitely a controversy as the attorney labeled the case "extremely controversial"'
	)
	# Import sklearn vectorizers and pandas
	import pandas as pd
	from sklearn.feature_extraction.text import (
	CountVectorizer,
	TfidfVectorizer
	)


	# Defining our sentence examples
	sentence_list = [
	# Loading readxl library
	library(readxl)

	clean_crime_data <- function(path) {
	# Load the Data
	crime_data <- read_xls(path)

	# Assigning colnames
	colnames(crime_data) <- crime_data[3,]