@chuck0523
Last active April 15, 2018 06:53
Point this at any Hatena Blog and count how many times a given word appears in its entries.
# coding: UTF-8
import urllib2
# "pip install beautifulsoup4" needs to be done first
from bs4 import BeautifulSoup

# const
baseUrl = "your blog URL here"
url = baseUrl + "/archive"
linkClass = "hatena-star-permalink"
entryClass = "entry-content"
searchWord = "the word to search for in entry bodies"

# let
count = 0

# func
def getSoup(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    return soup

# Main
print "Counting occurrences of [" + searchWord + "] in the body text of [" + baseUrl + "]...\n"
while url:
    soup = getSoup(url)
    links = soup.find_all("a", class_=linkClass)
    # Iteration over one archive page
    for link in links:
        page = getSoup(link.get('href'))
        entry = page.find(attrs={"class": entryClass})
        # Iteration over one entry's content
        for content in entry.contents:
            # content.string is a NavigableString; pass it to unicode() for string matching
            text = unicode(content.string)
            if searchWord in text:
                # Show the head of the matched text
                # print text[0:30] + "...\n"
                # This is the place to actually work on the matched text
                count += 1
    try:
        nextPage = soup.find(attrs={"class": "pager-next"}).a.get('href')
        url = nextPage
    except:
        # No link to a next page: we have reached the end of the blog archives
        print "[" + searchWord + "] appeared " + str(count) + " times on [" + baseUrl + "]"
        break
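
For reference, a minimal Python 3 sketch of the same approach (the script above relies on urllib2 and the print statement, which are Python 2 only). It assumes the same Hatena default class names used above (hatena-star-permalink, entry-content, pager-next); the function names get_soup/count_word and the example URL are mine, not part of the original gist.

import urllib.request
from bs4 import BeautifulSoup

def get_soup(url):
    # Fetch a page and parse it with BeautifulSoup
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response.read(), "html.parser")

def count_word(base_url, search_word):
    url = base_url + "/archive"
    count = 0
    while url:
        soup = get_soup(url)
        # Each archive page links to individual entries
        for link in soup.find_all("a", class_="hatena-star-permalink"):
            entry = get_soup(link.get("href")).find(attrs={"class": "entry-content"})
            for content in entry.contents:
                # As above, count one hit per top-level content node containing the word
                if search_word in str(content.string):
                    count += 1
        # Follow the "next page" link until the archive runs out
        pager = soup.find(attrs={"class": "pager-next"})
        url = pager.a.get("href") if pager else None
    return count

# Example usage (hypothetical URL and word):
# print(count_word("https://example.hatenablog.com", "Python"))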