Skip to content

Instantly share code, notes, and snippets.

@ymotongpoo
Created September 21, 2018 16:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ymotongpoo/a994268a8daed56e0973a45298604a26 to your computer and use it in GitHub Desktop.
Save ymotongpoo/a994268a8daed56e0973a45298604a26 to your computer and use it in GitHub Desktop.
sample scraping script
import requests
import bs4
import time
# URL template for one listing page of the name dictionary.
# The first placeholder receives a single hiragana character and the second
# receives the 1-based page number (see the .format() call in start_request).
base_url_format = "https://b-name.jp/赤ちゃん名前辞典/all/{}/?p={}"
def start_request(*, delay=5, timeout=30):
    """Crawl the name dictionary page by page and print every entry found.

    For each character in the code-point range 'あ'..'ん', listing pages are
    requested in order (p=1, 2, ...) until a page yields no data rows. Every
    table row that contains a ``cell-name`` element is printed as
    ``<index> <gender> <name> <reading>``. The index restarts at 0 for each
    kana; gender is '女', '男', or None when neither icon is present.

    Args:
        delay: Seconds to sleep after every HTTP request (rate limiting).
        timeout: Seconds to wait on the server before ``requests`` gives up.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # The contiguous code-point range also contains small and voiced kana
    # (ぃ, が, っ, ...), not only the basic syllabary.
    kana_chars = [chr(cp) for cp in range(ord('あ'), ord('ん') + 1)]
    for kana in kana_chars:
        page = 1
        count = 0
        while True:
            url = base_url_format.format(kana, page)
            # Without a timeout a stalled connection would hang the crawl forever.
            resp = requests.get(url, timeout=timeout)
            # Throttle after every request, including the last page of a kana,
            # so the first request for the next kana is not fired back-to-back.
            time.sleep(delay)

            soup = bs4.BeautifulSoup(resp.content, 'html.parser')
            namelist = soup.find(class_='namelist')
            # A page without the table (e.g. an error page) is treated like an
            # empty page instead of crashing on None.
            rows = namelist.find_all('tr') if namelist is not None else []
            # A single row is presumably just the table header; zero or one
            # rows therefore means there is nothing more to read for this kana.
            if len(rows) <= 1:
                break

            for tr in rows:
                cellname = tr.find(class_='cell-name')
                if cellname is None:
                    # Rows without a name cell (such as the header) carry no entry.
                    continue
                cellyomi = tr.find(class_='cell-yomi')
                if tr.find(class_='icon-woman') is not None:
                    gender = '女'
                elif tr.find(class_='icon-man') is not None:
                    gender = '男'
                else:
                    gender = None
                # The reading cell may be absent; print None rather than fail.
                yomi = cellyomi.string if cellyomi is not None else None
                print(count, gender, cellname.string, yomi)
                count += 1
            page += 1
def main():
    """Script entry point: run the scraper with its default settings."""
    start_request()
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment