Ednalyn C. De Dios (ecdedios)
@ecdedios
ecdedios / README.md
Created May 4, 2017 08:31 — forked from zenorocha/README.md
A template for GitHub READMEs (Markdown) + Sublime Snippet

Project Name

TODO: Write a project description

Installation

TODO: Describe the installation process

Usage

TODO: Write usage instructions
@ecdedios
ecdedios / show_missing.py
Created May 25, 2020 02:07
A function to show the total number of missing values and their percentages.

import pandas as pd

def show_missing(df):
    """
    Return the total missing values and the percentage of
    missing values by column.
    """
    null_count = df.isnull().sum()
    null_percentage = (null_count / df.shape[0]) * 100
    empty_count = pd.Series(((df == ' ') | (df == '')).sum())
    empty_percentage = (empty_count / df.shape[0]) * 100
    nan_count = pd.Series(((df == 'nan') | (df == 'NaN')).sum())
    nan_percentage = (nan_count / df.shape[0]) * 100
    # The gist is truncated above; collecting the counts into a summary
    # DataFrame is an assumed completion that matches the docstring.
    return pd.DataFrame({'num_missing': null_count, 'missing_percentage': null_percentage,
                         'num_empty': empty_count, 'empty_percentage': empty_percentage,
                         'nan_count': nan_count, 'nan_percentage': nan_percentage})
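
A quick usage sketch for context (the toy DataFrame below is hypothetical, not from the gist):

toy = pd.DataFrame({'Dept': ['Streets', None, ''],
                    'Category': ['Pothole', 'nan', 'Graffiti']})
show_missing(toy)  # one row per column with null/empty/'nan' counts and percentages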
@ecdedios
ecdedios / npr_coronavirus_extract.py
Created May 25, 2020 17:46
Get coronavirus-related articles from npr.org using the newspaper library.

import requests
import json
import time
import newspaper
import pickle

npr = newspaper.build('https://www.npr.org/sections/coronavirus-live-updates')

corpus = []
count = 0
# The gist is truncated here; a plausible completion of the scraping loop,
# using the standard newspaper Article API (download / parse / .text):
for article in npr.articles:
    time.sleep(1)  # be polite to the server
    article.download()
    article.parse()
    corpus.append(article.text)
    count += 1
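
The gist imports pickle but the captured portion never uses it; presumably the corpus was serialized to disk afterward. A minimal sketch, with a hypothetical filename:

with open('npr_corpus.pkl', 'wb') as f:
    pickle.dump(corpus, f)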
@ecdedios
ecdedios / fuzzywuzzy_early_attempt
Created May 25, 2020 19:05
An early attempt at using fuzzywuzzy.

from fuzzywuzzy import process

# Pool every entity from every article into one set of candidate strings.
choices = set([item for sublist in articles for item in sublist])

cleaned_articles = []
for article in articles:
    article_entities = []
    for entity in set(article):
        # Replace each entity with its best fuzzy match among the choices.
        article_entities.append(process.extractOne(entity, choices)[0])
    cleaned_articles.append(article_entities)
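
Since the description calls this an early attempt, one common refinement (not in the gist) is to accept a fuzzy match only when its score clears a threshold; the cutoff of 90 below is an assumption:

match, score = process.extractOne(entity, choices)
article_entities.append(match if score >= 90 else entity)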
@ecdedios
ecdedios / ngram_functions.py
Last active May 30, 2020 20:12
Basic cleaning and n-gram helper functions.

import re
import unicodedata
import nltk

ADDITIONAL_STOPWORDS = []  # assumed defined elsewhere; extend as needed

def clean(text):
    """
    A simple function to clean up the data. Every word that is not
    designated as a stop word is lemmatized after encoding and basic
    regex parsing are performed.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # The gist is truncated above; stripping punctuation and filtering
    # stop words before lemmatizing is an assumed completion.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]
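
The description promises n-gram helpers, but the capture cuts off after clean(). A minimal sketch of such a helper built on nltk.ngrams (the function name and the use of pandas here are assumptions):

import pandas as pd

def ngram_counts(words, n=2):
    # Count n-grams in a list of cleaned words, most frequent first.
    return pd.Series(list(nltk.ngrams(words, n))).value_counts()

ngram_counts(clean('the quick brown fox jumps over the lazy dog'), n=2)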
@ecdedios
ecdedios / joblib_parallel_processing.py
Created May 31, 2020 03:20
Using joblib to process chunks in parallel.

from joblib import Parallel, delayed

def chunker(iterable, total_length, chunksize):
    # Yield successive chunksize-long slices of the iterable.
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    """Flatten a list of lists to a combined list."""
    return [item for sublist in list_of_lists for item in sublist]

def process_chunk(texts):
    # The gist is truncated here; applying the clean() helper from the
    # earlier gist to each text in the chunk is an assumed completion.
    return [clean(text) for text in texts]
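
A hedged sketch of a driver that ties the three helpers together (the function name, worker count, and default chunk size are assumptions, not from the gist):

def preprocess_parallel(texts, chunksize=100):
    executor = Parallel(n_jobs=4, backend='multiprocessing', pre_dispatch='2*n_jobs')
    tasks = (delayed(process_chunk)(chunk)
             for chunk in chunker(texts, len(texts), chunksize))
    return flatten(executor(tasks))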
@ecdedios
ecdedios / manual_combined_ner_freq.py
Last active May 31, 2020 03:48
Manually constructing a list of tuples.

# Merge aliases of the same entity by summing their counts in df_counts
# (assumed to have 'entity' and 'count' columns).
entity_counts = []
entity_counts.append(('Democrats', df_counts.loc[df_counts.entity.isin(['Democrats', 'Dems', 'Democrat'])]['count'].sum()))
entity_counts.append(('Americans', df_counts.loc[df_counts.entity.isin(['American', 'Americans'])]['count'].sum()))
entity_counts.append(('Congress', df_counts.loc[df_counts.entity.isin(['House', 'Senate', 'Congress'])]['count'].sum()))
entity_counts.append(('America', df_counts.loc[df_counts.entity.isin(['U.S.', 'the United States', 'America'])]['count'].sum()))
entity_counts.append(('Republicans', df_counts.loc[df_counts.entity.isin(['Republican', 'Republicans'])]['count'].sum()))
# These two totals are hardcoded in the original gist.
entity_counts.append(('China', 533))
entity_counts.append(('FBI', 316))
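
A natural next step (not shown in the gist) is to load the tuples into a sorted DataFrame for inspection or plotting; a minimal sketch:

import pandas as pd

df_top = (pd.DataFrame(entity_counts, columns=['entity', 'count'])
          .sort_values('count', ascending=False))
df_top.head(10)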
@ecdedios
ecdedios / import_pandas_display.py
Created May 31, 2020 15:34
Importing pandas and setting the display options.

import pandas as pd

# print every expression's output in a notebook cell, not just the last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# set display options: show all columns and rows, never truncate cell text
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)  # the gist's -1 is deprecated in newer pandas
@ecdedios
ecdedios / read_311.py
Last active May 31, 2020 15:51
Loading the San Antonio 311 service calls into a dataframe.

df = pd.read_csv('allservicecalls.csv')

df.head()
df.info()

# summarize missing and empty values with the show_missing() helper above
show_missing(df)
@ecdedios
ecdedios / isnull_notnull.py
Last active May 31, 2020 16:06
Selecting rows where the column is null or not.

# include NaN in the counts so missing departments are visible
df['Dept'].value_counts(dropna=False)

# rows where Dept is missing
df_null = df.loc[df['Dept'].isnull()]
df_null.head()
df_null.shape

# rows where Dept is present
df_notnull = df.loc[df['Dept'].notnull()]
df_notnull.head()
df_notnull.shape