-
-
Save AkiyonKS/db5e92eb1c0ead23f963d2f34d629121 to your computer and use it in GitHub Desktop.
Extract RailLab photo metadata (image URLs, alt text, and vehicle details) using Beautiful Soup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import requests | |
import os | |
from bs4 import BeautifulSoup | |
import neologdn | |
import re | |
# Confirm the current working directory
path = os.getcwd()
print(path)

# Last result page of the photo listing; build the full range of page numbers
last_page = 6518
pages = list(range(1, last_page + 1))
print(pages)

# Empty accumulator frames: df_all collects one row per photo (image src/alt),
# df_all2 collects one row per detail-table cell (title/value pairs).
df_all = pd.DataFrame({"photo_id": [], "src": [], "alt": []})
df_all2 = pd.DataFrame({"photo_id": [], "title": [], "value": []})
# Scrape every listing page with a plain for-loop (DataFrame.apply did not work here).
for page in pages:
    # Build the URL of the listing page to fetch
    purl = "https://raillab.jp/photos/list?photo_type=all&sort=new&info_type=max&pageid="
    url = purl + str(page)
    print(url)
    # Issue the GET request. The timeout is the fix: without one, a single
    # stalled connection hangs the whole 6518-page crawl indefinitely.
    r = requests.get(url, timeout=30)
    # Parse the page with Beautiful Soup
    soup = BeautifulSoup(r.text, "html5lib")
    # Left column of each list item: the photo thumbnail. Save src and alt into df_all.
    imgs = soup.select("li.photo-list-max__item > div.row > div.col-md-6:nth-child(1) > a > img")
    list_src = [img.get('src') for img in imgs]
    list_alt = [img.get('alt') for img in imgs]
    df = pd.DataFrame({"src": list_src, "alt": list_alt})
    df_all = pd.concat([df_all, df], axis=0)
    # Right column: the detail table. Save th (row label) and td (vehicle info) into df_all2.
    str1 = "li.photo-list-max__item > div.row > div.col-md-6:nth-child(2) > table > tbody > tr > "
    titles = soup.select(str1 + "th")
    list_titles = [th.text for th in titles]
    values = soup.select(str1 + "td")
    # Normalize the Japanese text with neologdn (NFKC etc.), then drop embedded newlines
    list_values = [re.sub(r"\n", "", neologdn.normalize(td.text)) for td in values]
    df2 = pd.DataFrame({"title": list_titles, "value": list_values})
    df_all2 = pd.concat([df_all2, df2], axis=0)
# Number the photos sequentially. reset_index first: the pd.concat calls above
# left duplicate row indices (0..n per page) behind.
df_all = df_all.reset_index(drop=True)
df_all["photo_id"] = list(range(len(df_all)))

# df_all2 holds several title/value rows per photo; tag each row with its photo_id
# so the long table can be reshaped to one row per photo. Every photo's detail
# table starts with a '鉄道会社' (railway company) row, which marks the boundary.
df_all2 = df_all2.reset_index(drop=True)
nums = [i for i, x in enumerate(df_all2['title']) if x == '鉄道会社']
nums2 = nums + [len(df_all2)]
for i in range(len(nums2) - 1):
    # Use .loc (label slice is inclusive, hence the -1) instead of the original
    # chained assignment df_all2["photo_id"][a:b] = i, which raises
    # SettingWithCopyWarning and silently writes to a temporary under
    # pandas copy-on-write, leaving photo_id unset.
    df_all2.loc[nums2[i]:nums2[i + 1] - 1, "photo_id"] = i

# Pivot from long to wide format: one column per detail title, one row per photo
df_all2_r = df_all2.pivot(index="photo_id", columns="title", values="value")

# Combine the image data and the pivoted details into one frame (outer join)
df_last = pd.merge(df_all, df_all2_r, on="photo_id", how="outer")

# Persist the final table
df_last.to_csv("../csv/trains.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment