Ednalyn C. De Dios (ecdedios)
@ecdedios
ecdedios / README.md
Created May 4, 2017 08:31 — forked from zenorocha/README.md
A template for GitHub READMEs (Markdown) + Sublime Snippet

Project Name

TODO: Write a project description

Installation

TODO: Describe the installation process

Usage

TODO: Write usage instructions
@ecdedios
ecdedios / show_missing.py
Created May 25, 2020 02:07
A function to show the total number of missing values and their percentages.

import pandas as pd

def show_missing(df):
    """
    Return the total missing values and the percentage of
    missing values by column.
    """
    null_count = df.isnull().sum()
    null_percentage = (null_count / df.shape[0]) * 100
    empty_count = pd.Series(((df == ' ') | (df == '')).sum())
    empty_percentage = (empty_count / df.shape[0]) * 100
    nan_count = pd.Series(((df == 'nan') | (df == 'NaN')).sum())
    nan_percentage = (nan_count / df.shape[0]) * 100
    # The gist is truncated above; collecting the counts into a summary
    # DataFrame is an assumed completion that matches the docstring.
    return pd.DataFrame({'num_missing': null_count, 'missing_percentage': null_percentage,
                         'num_empty': empty_count, 'empty_percentage': empty_percentage,
                         'nan_count': nan_count, 'nan_percentage': nan_percentage})
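
A quick usage sketch for context (the toy DataFrame below is hypothetical, not from the gist):

toy = pd.DataFrame({'Dept': ['Streets', None, ''],
                    'Category': ['Pothole', 'nan', 'Graffiti']})
show_missing(toy)  # one row per column with null/empty/'nan' counts and percentages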
@ecdedios
ecdedios / npr_coronavirus_extract.py
Created May 25, 2020 17:46
Get coronavirus-related articles from npr.org using the newspaper library.

import requests
import json
import time
import newspaper
import pickle

npr = newspaper.build('https://www.npr.org/sections/coronavirus-live-updates')

corpus = []
count = 0
# The gist is truncated here; a plausible completion of the scraping loop,
# using the standard newspaper Article API (download / parse / .text):
for article in npr.articles:
    time.sleep(1)  # be polite to the server
    article.download()
    article.parse()
    corpus.append(article.text)
    count += 1
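
The gist imports pickle but the captured portion never uses it; presumably the corpus was serialized to disk afterward. A minimal sketch, with a hypothetical filename:

with open('npr_corpus.pkl', 'wb') as f:
    pickle.dump(corpus, f)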
@ecdedios
ecdedios / fuzzywuzzy_early_attempt
Created May 25, 2020 19:05
An early attempt at using fuzzywuzzy.

from fuzzywuzzy import process

# Pool every entity from every article into one set of candidate strings.
choices = set([item for sublist in articles for item in sublist])

cleaned_articles = []
for article in articles:
    article_entities = []
    for entity in set(article):
        # Replace each entity with its best fuzzy match among the choices.
        article_entities.append(process.extractOne(entity, choices)[0])
    cleaned_articles.append(article_entities)
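
Since the description calls this an early attempt, one common refinement (not in the gist) is to accept a fuzzy match only when its score clears a threshold; the cutoff of 90 below is an assumption:

match, score = process.extractOne(entity, choices)
article_entities.append(match if score >= 90 else entity)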
@ecdedios
ecdedios / ngram_functions.py
Last active May 30, 2020 20:12
Basic cleaning and n-gram helper functions.

import re
import unicodedata
import nltk

ADDITIONAL_STOPWORDS = []  # assumed defined elsewhere; extend as needed

def clean(text):
    """
    A simple function to clean up the data. Every word that is not
    designated as a stop word is lemmatized after encoding and basic
    regex parsing are performed.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # The gist is truncated above; stripping punctuation and filtering
    # stop words before lemmatizing is an assumed completion.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]
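
The description promises n-gram helpers, but the capture cuts off after clean(). A minimal sketch of such a helper built on nltk.ngrams (the function name and the use of pandas here are assumptions):

import pandas as pd

def ngram_counts(words, n=2):
    # Count n-grams in a list of cleaned words, most frequent first.
    return pd.Series(list(nltk.ngrams(words, n))).value_counts()

ngram_counts(clean('the quick brown fox jumps over the lazy dog'), n=2)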
@ecdedios
ecdedios / joblib_parallel_processing.py
Created May 31, 2020 03:20
Using joblib to process chunks in parallel.

from joblib import Parallel, delayed

def chunker(iterable, total_length, chunksize):
    # Yield successive chunksize-long slices of the iterable.
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    """Flatten a list of lists to a combined list."""
    return [item for sublist in list_of_lists for item in sublist]

def process_chunk(texts):
    # The gist is truncated here; applying the clean() helper from the
    # earlier gist to each text in the chunk is an assumed completion.
    return [clean(text) for text in texts]
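
A hedged sketch of a driver that ties the three helpers together (the function name, worker count, and default chunk size are assumptions, not from the gist):

def preprocess_parallel(texts, chunksize=100):
    executor = Parallel(n_jobs=4, backend='multiprocessing', pre_dispatch='2*n_jobs')
    tasks = (delayed(process_chunk)(chunk)
             for chunk in chunker(texts, len(texts), chunksize))
    return flatten(executor(tasks))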
@ecdedios
ecdedios / manual_combined_ner_freq.py
Last active May 31, 2020 03:48
Manually constructing a list of tuples.

# Merge aliases of the same entity by summing their counts in df_counts
# (assumed to have 'entity' and 'count' columns).
entity_counts = []
entity_counts.append(('Democrats', df_counts.loc[df_counts.entity.isin(['Democrats', 'Dems', 'Democrat'])]['count'].sum()))
entity_counts.append(('Americans', df_counts.loc[df_counts.entity.isin(['American', 'Americans'])]['count'].sum()))
entity_counts.append(('Congress', df_counts.loc[df_counts.entity.isin(['House', 'Senate', 'Congress'])]['count'].sum()))
entity_counts.append(('America', df_counts.loc[df_counts.entity.isin(['U.S.', 'the United States', 'America'])]['count'].sum()))
entity_counts.append(('Republicans', df_counts.loc[df_counts.entity.isin(['Republican', 'Republicans'])]['count'].sum()))
# These two totals are hardcoded in the original gist.
entity_counts.append(('China', 533))
entity_counts.append(('FBI', 316))
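
A natural next step (not shown in the gist) is to load the tuples into a sorted DataFrame for inspection or plotting; a minimal sketch:

import pandas as pd

df_top = (pd.DataFrame(entity_counts, columns=['entity', 'count'])
          .sort_values('count', ascending=False))
df_top.head(10)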
@ecdedios
ecdedios / import_pandas_display.py
Created May 31, 2020 15:34
Importing pandas and setting the display options.

import pandas as pd

# print every expression's output in a notebook cell, not just the last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options: show all columns and rows, never truncate cell text
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)  # the gist's -1 is deprecated in newer pandas
@ecdedios
ecdedios / read_311.py
Last active May 31, 2020 15:51
Loading the San Antonio 311 service calls into a dataframe.

df = pd.read_csv('allservicecalls.csv')

df.head()
df.info()

# summarize missing and empty values with the show_missing() helper above
show_missing(df)
@ecdedios
ecdedios / isnull_notnull.py
Last active May 31, 2020 16:06
Selecting rows where the column is null or not.

# include NaN in the counts so missing departments are visible
df['Dept'].value_counts(dropna=False)

# rows where Dept is missing
df_null = df.loc[df['Dept'].isnull()]
df_null.head()
df_null.shape

# rows where Dept is present
df_notnull = df.loc[df['Dept'].notnull()]
df_notnull.head()
df_notnull.shape