Skip to content

Instantly share code, notes, and snippets.

@AkiyonKS
Created August 9, 2022 01:59
Show Gist options
  • Save AkiyonKS/db5e92eb1c0ead23f963d2f34d629121 to your computer and use it in GitHub Desktop.
Save AkiyonKS/db5e92eb1c0ead23f963d2f34d629121 to your computer and use it in GitHub Desktop.
extract raillab text using Beautiful Soup
import numpy as np
import pandas as pd
import requests
import os
from bs4 import BeautifulSoup
import neologdn
import re
# Show the current working directory so relative paths can be checked.
path = os.getcwd()
print(path)

# Number of the last listing page; pages are numbered starting at 1.
last_page = 6518
pages = [*range(1, last_page + 1)]
print(pages)

# Empty data frames that accumulate the scraped data:
#   df_all  - image attributes (src / alt) plus a photo_id column
#   df_all2 - table cells (title / value) plus a photo_id column
data = {column: [] for column in ("photo_id", "src", "alt")}
df_all = pd.DataFrame(data)
data2 = {column: [] for column in ("photo_id", "title", "value")}
df_all2 = pd.DataFrame(data2)
# Scrape every listing page with a plain for loop (the author notes that
# `apply` did not work for this).

# The URL prefix and the CSS selector prefix are the same for every page,
# so they are built once instead of on each iteration.
purl = "https://raillab.jp/photos/list?photo_type=all&sort=new&info_type=max&pageid="
str1 = "li.photo-list-max__item > div.row > div.col-md-6:nth-child(2) > table > tbody > tr > "

# Per-page frames are collected here and concatenated once after the loop;
# calling pd.concat inside the loop re-copies every accumulated row per page.
photo_frames = []
detail_frames = []

for page in pages:
    # Build the URL of the page to read.
    url = purl + str(page)
    print(url)

    # Issue the GET request. The timeout keeps a single stalled connection
    # from hanging the whole run.
    r = requests.get(url, timeout=30)
    if not r.ok:
        # Make failed pages visible instead of silently parsing an error
        # document (which would just yield no rows).
        print(f"skipped (HTTP {r.status_code}): {url}")
        continue

    # Parse the response with Beautiful Soup.
    soup = BeautifulSoup(r.text, "html5lib")

    # Collect the src and alt attributes of each photo image (for df_all).
    imgs = soup.select("li.photo-list-max__item > div.row > div.col-md-6:nth-child(1) > a > img")
    list_src = [img.get('src') for img in imgs]
    list_alt = [img.get('alt') for img in imgs]
    data = {"src": list_src, "alt": list_alt}
    photo_frames.append(pd.DataFrame(data))

    # Collect the th (title) and td (value) cells of each info table
    # (for df_all2). Values are normalized with neologdn and stripped of
    # newlines.
    titles = soup.select(str1 + "th")
    list_titles = [th.text for th in titles]
    values = soup.select(str1 + "td")
    list_values = [neologdn.normalize(td.text).replace("\n", "") for td in values]
    data2 = {"title": list_titles, "value": list_values}
    detail_frames.append(pd.DataFrame(data2))

# Append everything in one pass. ignore_index gives each frame a unique
# 0..n-1 index instead of repeating every page's own 0..k labels.
df_all = pd.concat([df_all, *photo_frames], axis=0, ignore_index=True)
df_all2 = pd.concat([df_all2, *detail_frames], axis=0, ignore_index=True)
# Give every row of df_all a sequential photo_id.
df_all["photo_id"] = list(range(len(df_all)))

# df_all2 spreads the information for one photo over several rows, so it is
# reshaped to one row per photo. First, tag each row with its photo_id:
# every row whose title is '鉄道会社' is treated as the start of the next
# photo's group of rows.
nums = [i for i, x in enumerate(df_all2['title']) if x == '鉄道会社']
nums2 = nums + [len(df_all2)]
# Write through a single positional .iloc call. The chained form
# df_all2["photo_id"][a:b] = i may assign into a temporary copy and does not
# update df_all2 when pandas Copy-on-Write is active.
photo_id_col = df_all2.columns.get_loc("photo_id")
for i, (start, stop) in enumerate(zip(nums2, nums2[1:])):
    df_all2.iloc[start:stop, photo_id_col] = i

# Pivot from long format (one row per title/value) to wide format
# (one row per photo_id, one column per title).
df_all2_r = df_all2.pivot(index="photo_id", columns="title", values="value")

# Combine both data frames into one (outer join on photo_id).
df_last = pd.merge(df_all, df_all2_r, on="photo_id", how="outer")

# Save to file. Create the target directory first so the write does not fail
# after the whole scrape has already completed.
output_path = "../csv/trains.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_last.to_csv(output_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment