This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
import pandas as pd | |
# Download the AKC registration statistics page and collect the rows of the
# second <table> element found in it.
url = "https://www.akc.org/reg/dogreg_stats.cfm"
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
r.raise_for_status()
data = r.text
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (this also silences bs4's warning).
soup = BeautifulSoup(data, "html.parser")
table = soup.find_all('table')[1]
rows = table.find_all('tr')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import math | |
line = ''
s = set()
# Collect every .txt file in the corpus directory.
flist = glob.glob(r'E:\PROGRAMMING\PYTHON\programs\corpus2\*.txt')
# Open each file, split its content on single spaces, and accumulate the
# distinct tokens of all files in the set "s".
for fname in flist:
    # "with" guarantees the handle is closed even if read() raises.
    with open(fname, "r") as tfile:
        line = tfile.read()  # whole content of the current file
    # In-place union: adds the new tokens without rebuilding the set per file.
    s.update(line.split(' '))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
from text.blob import TextBlob as tb | |
def tf(word, blob):
    """Return the term frequency of ``word`` within ``blob``.

    The result is the number of occurrences of ``word`` in ``blob.words``
    divided by the total number of entries in ``blob.words``.

    Args:
        word: The term to count.
        blob: Any object exposing a ``words`` sequence with ``count`` and
            ``len`` support (presumably a TextBlob; only ``.words`` is used).

    Returns:
        The relative frequency as a float, or ``0.0`` when ``blob.words``
        is empty.
    """
    words = blob.words  # look the attribute up once and reuse it
    total = len(words)
    if not total:
        # An empty document contains no terms; avoid dividing by zero.
        return 0.0
    return words.count(word) / total
def n_containing(word, bloblist):
    """Return how many documents in ``bloblist`` contain ``word``.

    Membership is tested against each document's ``words`` sequence, the
    same token view that ``tf`` counts in, so a document is counted only
    when ``word`` is one of its tokens.

    Args:
        word: The term to look for.
        bloblist: Iterable of objects exposing a ``words`` sequence.

    Returns:
        The number of documents whose ``words`` include ``word``.
    """
    # NOTE(review): the previous ``word in blob`` test presumably matched
    # substrings of the raw text for TextBlob objects - confirm against the
    # TextBlob version in use.
    return sum(1 for blob in bloblist if word in blob.words)
def idf(word, bloblist): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Given a list of words, remove any that are
# in a list of stop words.
def removeStopwords(wordlist, stopwords):
    """Return the words of ``wordlist`` that are not stop words.

    Order and duplicates of the surviving words are preserved.

    Args:
        wordlist: Iterable of words to filter.
        stopwords: Iterable of words to drop; its items must be hashable.

    Returns:
        A new list with every word that does not appear in ``stopwords``.
    """
    # Build the lookup set once: each membership test is then O(1) instead
    # of a scan, and a one-shot iterator of stop words is consumed only once.
    stop_set = set(stopwords)
    return [w for w in wordlist if w not in stop_set]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import math | |
line = ''
s = set()
# Collect every .txt file in the corpus directory.
flist = glob.glob(r'E:\PROGRAMMING\PYTHON\programs\corpus2\*.txt')
# Open each file, split its content on single spaces, and accumulate the
# distinct tokens of all files in the set "s".
for fname in flist:
    # "with" guarantees the handle is closed even if read() raises.
    with open(fname, "r") as tfile:
        line = tfile.read()  # whole content of the current file
    # In-place union: adds the new tokens without rebuilding the set per file.
    s.update(line.split(' '))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from string import digits | |
import nltk | |
from nltk import word_tokenize | |
from nltk.corpus import stopwords | |
import glob | |
import math | |
import csv | |
import string | |
from collections import Counter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(twitteR) | |
library(tm) | |
library(wordcloud) | |
library(RColorBrewer) | |
# Search Twitter for up to 1000 tweets tagged #MakeAMovieSmarter.
midnight <- searchTwitter("#MakeAMovieSmarter", n = 1000)
# Pull the text out of every tweet. vapply() pins the result to a character
# vector, whereas sapply() returns an empty list when no tweets come back.
midnight_text <- vapply(midnight, function(x) x$getText(), character(1))
# Build a tm corpus from the tweet texts.
midnight_corpus <- Corpus(VectorSource(midnight_text))
tdm = TermDocumentMatrix( |
OlderNewer