Zaynaib (Ola) Giwa (zaynaib)
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Fetch the AKC dog registration statistics page.
url = "https://www.akc.org/reg/dogreg_stats.cfm"
r = requests.get(url)
data = r.text

# Parse the HTML; naming a parser explicitly avoids bs4's "no parser specified" warning.
soup = BeautifulSoup(data, "html.parser")

# The registration statistics live in the second table on the page.
table = soup.find_all('table')[1]
rows = table.find_all('tr')
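pandas is imported but never used in the preview; a minimal sketch of the likely next step, flattening the scraped rows into a DataFrame (the cell handling below is an assumption, not part of the original gist):

# Hypothetical continuation: turn the scraped <tr> elements into a DataFrame.
records = []
for row in rows:
    cells = [td.get_text(strip=True) for td in row.find_all('td')]
    if cells:  # header rows use <th> and yield no <td> cells; skip them
        records.append(cells)

df = pd.DataFrame(records)  # rename columns once the real header row is known
print(df.head())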
@zaynaib
zaynaib / tf_idf.py
Last active August 29, 2015 14:11 — forked from vineetrok/tf_idf.py
import glob
import math

line = ''
s = set()

# Get all the .txt files from the corpus directory, then open each file,
# tokenize its content, and store the words in a set.
flist = glob.glob(r'E:\PROGRAMMING\PYTHON\programs\corpus2\*.txt')
for fname in flist:
    tfile = open(fname, "r")
    line = tfile.read()  # read the content of the file into "line"
    tfile.close()        # close the file
    s = s.union(set(line.split(' ')))  # add this file's words to the vocabulary
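The preview ends at the vocabulary set; a minimal sketch of how such a set typically feeds the term-frequency side of tf-idf, counting vocabulary words per file (this loop is an assumption, not part of the original gist):

# Hypothetical continuation: raw term counts per file over the vocabulary s.
from collections import Counter

counts_by_file = {}
for fname in flist:
    with open(fname, "r") as tfile:
        words = tfile.read().split(' ')
    counts = Counter(words)
    counts_by_file[fname] = {w: counts[w] for w in s}  # Counter returns 0 for absent words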
@zaynaib
zaynaib / tfidf.py
Last active August 29, 2015 14:11 — forked from sloria/tfidf.py
import math
from text.blob import TextBlob as tb  # older package name; current installs use: from textblob import TextBlob

def tf(word, blob):
    # term frequency: the share of the blob's words that are this word
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    # number of documents whose text contains the word
    return sum(1 for blob in bloblist if word in blob)

def idf(word, bloblist):
    # inverse document frequency; the preview cut the body off, restored here
    # from the upstream sloria/tfidf.py gist (+1 avoids division by zero)
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

# Given a list of words, remove any that are in a list of stop words.
def removeStopwords(wordlist, stopwords):
    return [w for w in wordlist if w not in stopwords]
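A short usage sketch for these functions, assuming the modern textblob package name and two made-up example documents (nothing below appears in the preview):

# Hypothetical usage: rank each document's words by tf-idf score.
from textblob import TextBlob as tb

bloblist = [
    tb("python is a great language for text processing"),
    tb("the quick brown fox jumps over the lazy dog"),
]
for i, blob in enumerate(bloblist):
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    top = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:3]
    print("document {}: {}".format(i, top))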
@zaynaib
zaynaib / idf.py
Created December 21, 2014 19:59
Code to calculate the inverse document frequency
from string import digits
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import glob
import math
import csv
import string
from collections import Counter
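Only the imports survive in this preview; a minimal sketch of an IDF calculation consistent with them (the corpus path and tokenization details are assumptions, not the gist's actual code):

# Hypothetical IDF computation over a folder of .txt files.
# Requires nltk.download('punkt') and nltk.download('stopwords') once.
import glob
import math
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
files = glob.glob('corpus/*.txt')  # placeholder path

df = Counter()  # document frequency: how many files contain each word
for fname in files:
    with open(fname, 'r') as f:
        tokens = {w.lower() for w in word_tokenize(f.read())}
    df.update(w for w in tokens if w.isalpha() and w not in stop)

idf = {w: math.log(len(files) / n) for w, n in df.items()}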
library(twitteR)
library(tm)
library(wordcloud)
library(RColorBrewer)

# Fetch up to 1,000 tweets tagged #MakeAMovieSmarter.
midnight <- searchTwitter("#MakeAMovieSmarter", n = 1000)
midnight_text <- sapply(midnight, function(x) x$getText())
midnight_corpus <- Corpus(VectorSource(midnight_text))

# The preview is cut off mid-call; a minimal completion so the line parses.
# (The original presumably passed cleaning options and went on to wordcloud().)
tdm <- TermDocumentMatrix(midnight_corpus)
@zaynaib
zaynaib / Twitch.markdown
Last active August 29, 2015 14:27
Twitch