Skip to content

Instantly share code, notes, and snippets.

@kakarukeys
Last active May 12, 2021 06:59
Show Gist options
  • Save kakarukeys/f93d59f77ba7211bac8bea3abc5cf462 to your computer and use it in GitHub Desktop.
Save kakarukeys/f93d59f77ba7211bac8bea3abc5cf462 to your computer and use it in GitHub Desktop.
requests crawler (tutorial 1)
import time
import zlib
from io import BytesIO
from zipfile import ZipFile
import requests
from bs4 import BeautifulSoup
# Browser-like headers sent with the initial GET of the download page.
# NOTE(review): advertising "br" assumes the installed HTTP stack can decode
# Brotli-compressed responses -- confirm for the target environment.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:88.0) Gecko/20100101 Firefox/88.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
}

# Headers for the form POST: everything in HEADERS plus the form content type
# and a Referer pointing back at the download page.
POST_HEADERS = {
    **HEADERS,
    "Content-Type": "application/x-www-form-urlencoded",
    "Referer": "https://www.houjin-bangou.nta.go.jp/download/sabun/",
}

# Name of the token <input> inside the page's #appForm; its value is read from
# the page and echoed back in every download POST.
TOKEN_NAME = (
    "jp.go.nta.houjin_bangou.framework.web.common."
    "CNSFWTokenProcessor.request.token"
)
DOWNLOAD_PAGE_URL = "https://www.houjin-bangou.nta.go.jp/download/sabun/"
DOWNLOAD_POST_URL = "https://www.houjin-bangou.nta.go.jp/download/sabun/index.html"
DOWNLOAD_CALL_PREFIX = "return doDownload("
# Pause between consecutive requests to the site, in seconds.
REQUEST_DELAY_SECONDS = 5
# (connect, read) timeouts so a stalled server cannot hang the script forever.
REQUEST_TIMEOUT_SECONDS = (10, 120)


def parse_download_file_no(onclick):
    """Return the argument of a ``return doDownload(<arg>);`` onclick handler.

    The prefix is removed as a literal prefix.  ``str.lstrip`` must not be used
    for this: it strips a *set of characters*, so an argument that happens to
    start with one of the letters in "return doDownload(" would be truncated.

    Args:
        onclick: Raw value of the anchor's ``onclick`` attribute.

    Returns:
        The text between the opening parenthesis and the trailing ``);``,
        with surrounding whitespace removed.
    """
    call = onclick.strip()
    if call.startswith(DOWNLOAD_CALL_PREFIX):
        call = call[len(DOWNLOAD_CALL_PREFIX):]
    # Only ")" and ";" can legitimately trail the argument, so a character-set
    # strip is the intended operation on this side.
    return call.rstrip(");").strip()


def read_first_csv_line(zip_bytes, encoding="Shift_JIS"):
    """Return the first line of the first ``.csv`` member of a zip archive.

    Only one line is decompressed instead of the whole member, and both the
    archive and the member handle are closed before returning.

    Args:
        zip_bytes: Complete zip archive as a bytes object.
        encoding: Text encoding used to decode the CSV line.

    Returns:
        The decoded first line, including its line terminator, or an empty
        string when the CSV member is empty.

    Raises:
        ValueError: If the archive has no member whose name ends in ".csv".
        zipfile.BadZipFile: If ``zip_bytes`` is not a valid zip archive.
    """
    with ZipFile(BytesIO(zip_bytes)) as archive:
        csv_name = next(
            (name for name in archive.namelist() if name.endswith(".csv")),
            None,
        )
        if csv_name is None:
            raise ValueError("downloaded archive contains no .csv member")
        with archive.open(csv_name) as csv_file:
            return csv_file.readline().decode(encoding)


def main():
    """Fetch the download page, then print the first CSV line of each file."""
    with requests.Session() as session:
        # Use the session (not requests.get) so cookies set by the page are
        # sent along with the token in the download POSTs below.
        page = session.get(
            DOWNLOAD_PAGE_URL,
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        page.raise_for_status()
        time.sleep(REQUEST_DELAY_SECONDS)

        soup = BeautifulSoup(page.content, features="html.parser")
        token_value = soup.select(f'#appForm input[name="{TOKEN_NAME}"]')[0]["value"]
        dl_file_nums = [
            parse_download_file_no(el["onclick"])
            for el in soup.select('#appForm a[onclick^="return doDownload"]')
        ]

        for file_num in dl_file_nums:
            data = {
                TOKEN_NAME: token_value,
                "event": "download",
                "selDlFileNo": file_num,
            }
            resp = session.post(
                DOWNLOAD_POST_URL,
                data=data,
                headers=POST_HEADERS,
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            # Fail on an HTTP error here rather than as a confusing BadZipFile.
            resp.raise_for_status()

            first_line = read_first_csv_line(resp.content)
            if first_line:
                print(first_line)
            time.sleep(REQUEST_DELAY_SECONDS)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment