# johncalabrese/webscrap_public.py

## webscrap_public.py
def _extract_headlines(soup):
    """Return the text of the first child of the first link in each news row.

    A news row is a ``<tr>`` element whose class list contains ``nn``.
    Rows that have no ``<a>`` element, or whose ``<a>`` is empty, are
    skipped instead of raising.
    """
    headlines = []
    for row in soup.find_all('tr', {'class': ['nn']}):
        link = row.find('a')
        if link is None or not link.contents:
            continue
        # str() detaches the value from the parse tree so the resulting
        # Series holds plain strings.
        headlines.append(str(link.contents[0]))
    return headlines


def _count_words(words, headlines):
    """Count whole-word, case-insensitive occurrences of each word.

    Args:
        words: Iterable of words to look for.
        headlines: ``pandas.Series`` of headline strings.

    Returns:
        ``pandas.DataFrame`` with columns ``word`` and ``count``, one row
        per entry of ``words`` in the given order.
    """
    records = []
    for word in words:
        # Escape the word so characters such as "+" or "." are matched
        # literally rather than interpreted as regex syntax.
        pattern = r'\b' + re.escape(str(word)) + r'\b'
        hits = int(headlines.str.count(pattern, flags=re.I).sum())
        records.append((str(word), hits))
    return pd.DataFrame.from_records(records, columns=['word', 'count'])


def scrap_webpage(webpage_url="WEBPAGE_URL", words_of_interest=()):
    """Scrape headline links from a page and count words of interest.

    Downloads ``webpage_url``, collects the link text of every
    ``<tr class="nn">`` row, and counts how often each word of interest
    appears across those headlines (whole word, case-insensitive).

    NOTE(review): relies on ``urlopen``, ``BeautifulSoup``, ``pd`` and
    ``re`` being imported at module level; presumably ``urlopen`` is
    ``urllib.request.urlopen`` -- confirm.

    Args:
        webpage_url: Address of the page to scrape. The default is the
            template placeholder and must be replaced with a real URL.
        words_of_interest: Iterable of words to count. A single string is
            treated as one word, not as a sequence of characters.

    Returns:
        The summary as CSV text with the header ``word,count`` and one
        line per word.
    """
    if isinstance(words_of_interest, str):
        words_of_interest = [words_of_interest]

    # The parser consumes the response while building the tree, so the
    # connection can be released as soon as the soup exists.
    with urlopen(webpage_url) as page:
        soup = BeautifulSoup(page, "html.parser")

    # An explicit object dtype keeps the .str accessor usable even when
    # the page yields no headlines at all.
    headlines = pd.Series(_extract_headlines(soup), dtype=object)

    summary = _count_words(words_of_interest, headlines)
    return summary.to_csv(index=False)