Skip to content

Instantly share code, notes, and snippets.

@johncalabrese
Last active December 28, 2018 01:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johncalabrese/ee1c5b324b0c61b0b737306d29f62140 to your computer and use it in GitHub Desktop.
Save johncalabrese/ee1c5b324b0c61b0b737306d29f62140 to your computer and use it in GitHub Desktop.
def dataframe_sum_words(words, dataframe):
    """Count case-insensitive whole-word occurrences of each word.

    Args:
        words: Iterable of words or phrases to look for. Each one is matched
            literally (regex metacharacters are escaped).
        dataframe: DataFrame whose column ``0`` holds the text to search.

    Returns:
        A DataFrame with columns ``word`` and ``count``, one row per entry
        of ``words`` in input order. ``count`` is the total number of
        matches across all rows of ``dataframe``.
    """
    texts = dataframe[0]
    rows = []
    for word in words:
        word = str(word)
        # Lookarounds instead of \b: identical for ordinary words, but they
        # also anchor correctly when the word starts or ends with punctuation
        # (e.g. "c++"), where \b can never match.
        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        total = texts.str.count(pattern, flags=re.I).sum()
        # int() normalises numpy ints and the float sum produced when some
        # rows are missing values.
        rows.append([word, int(total)])
    return pd.DataFrame.from_records(rows, columns=['word', 'count'])


def scrap_webpage(webpage_url="WEBPAGE_URL", words=None):
    """Scrape headlines from a page and return word counts as CSV text.

    Args:
        webpage_url: Address of the page to download. Defaults to the
            template placeholder string "WEBPAGE_URL".
        words: Iterable of words to count in the headlines. When omitted,
            the template placeholder ``ENTER_WORDS_OF_INTEREST`` is used.

    Returns:
        CSV text (no index column) with a ``word,count`` header and one line
        per word.
    """
    if words is None:
        # Placeholder kept from the original template; it must be defined
        # (or replaced) by whoever fills in the template.
        words = [ENTER_WORDS_OF_INTEREST]

    page = urlopen(webpage_url)
    try:
        soup = BeautifulSoup(page, "html.parser")
    finally:
        # Release the connection even when parsing fails.
        page.close()

    # Collect the first child of the first link in every <tr class="nn"> row.
    news = []
    for story in soup.find_all('tr', {'class': ['nn']}):
        anchor = story.find('a')
        if anchor is None or not anchor.contents:
            # Row without a usable link: skip it instead of crashing.
            continue
        news.append(str(anchor.contents[0]))

    # An explicit object-dtype column 0 keeps the frame usable (and the .str
    # accessor valid) even when no headline was found.
    df = pd.DataFrame({0: pd.Series(news, dtype=object)})
    summary = dataframe_sum_words(words, df)
    return summary.to_csv(index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment