@chuck0523
Last active April 15, 2018 06:53
Point this at any Hatena Blog and count how many times a given word appears in its entries.
# coding: UTF-8
import urllib2
# "pip install beautifulsoup4" needs to be done first
from bs4 import BeautifulSoup

# const
baseUrl = "your blog URL here"
url = baseUrl + "/archive"
linkClass = "hatena-star-permalink"
entryClass = "entry-content"
searchWord = "the word to search for in entry bodies"

# let
count = 0

# func
def getSoup(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    return soup

# Main
print "Counting occurrences of [" + searchWord + "] in the body text of [" + baseUrl + "]...\n"
while url:
    soup = getSoup(url)
    links = soup.find_all("a", class_=linkClass)
    # Iteration over one archive page
    for link in links:
        page = getSoup(link.get('href'))
        entry = page.find(attrs={"class": entryClass})
        # Iteration over one entry's content
        for content in entry.contents:
            # content.string is a NavigableString; pass it to unicode() for string matching
            text = unicode(content.string)
            if searchWord in text:
                # Show the head of the matched text
                # print text[0:30] + "...\n"
                # This is the place to actually work on the matched text
                count += 1
    try:
        nextPage = soup.find(attrs={"class": "pager-next"}).a.get('href')
        url = nextPage
    except:
        # No link to a next page: we have reached the end of the blog archives
        print "[" + searchWord + "] appeared " + str(count) + " times on [" + baseUrl + "]"
        break
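
For reference, a minimal Python 3 sketch of the same approach (the script above relies on urllib2 and the print statement, which are Python 2 only). It assumes the same Hatena default class names used above (hatena-star-permalink, entry-content, pager-next); the function names get_soup/count_word and the example URL are mine, not part of the original gist.

import urllib.request
from bs4 import BeautifulSoup

def get_soup(url):
    # Fetch a page and parse it with BeautifulSoup
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response.read(), "html.parser")

def count_word(base_url, search_word):
    url = base_url + "/archive"
    count = 0
    while url:
        soup = get_soup(url)
        # Each archive page links to individual entries
        for link in soup.find_all("a", class_="hatena-star-permalink"):
            entry = get_soup(link.get("href")).find(attrs={"class": "entry-content"})
            for content in entry.contents:
                # As above, count one hit per top-level content node containing the word
                if search_word in str(content.string):
                    count += 1
        # Follow the "next page" link until the archive runs out
        pager = soup.find(attrs={"class": "pager-next"})
        url = pager.a.get("href") if pager else None
    return count

# Example usage (hypothetical URL and word):
# print(count_word("https://example.hatenablog.com", "Python"))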