TomoG29/Blog_ImportHTMLBySelenium.js

## Blog_ImportHTMLBySelenium.js
from selenium import webdriver
from selenium.webdriver.common.by import By
import os
import getpass
import csv
import time

# Folder paths
# NOTE(review): the arguments below are bare identifiers that are not defined
# anywhere in the visible file; they look like placeholders to be replaced with
# real path strings (HTML input folder / analysis output folder) -- confirm.
html_track_dir = os.path.join(HTMLが保存されているフォルダ)
html_analysis_dir = os.path.join(解析結果を保存するフォルダ)

# Class name and tag name to analyze (used with By.CLASS_NAME / By.TAG_NAME)
ANALYSIS_CLASS = '解析したいクラス'
TARGET_TAG = '解析したいタグ'

# Wait time for processing, passed to time.sleep() before each page load
SLEEP_TIME = 2

# Extract the data of the target tags found inside the target class
def extract_data(driver):
    """Collect ``(title, url)`` pairs from the page currently loaded in *driver*.

    Every element matching ``ANALYSIS_CLASS`` (by class name) is searched for
    descendants matching ``TARGET_TAG`` (by tag name). For each such tag the
    stripped text and the value of its ``href`` attribute are recorded.

    Args:
        driver: Object exposing ``find_elements(by, value)``, as used by the
            Selenium WebDriver created in ``main``.

    Returns:
        A list of ``(title, url)`` tuples in the order the tags were found.
    """
    return [
        (anchor.text.strip(), anchor.get_attribute('href'))
        for container in driver.find_elements(By.CLASS_NAME, ANALYSIS_CLASS)
        for anchor in container.find_elements(By.TAG_NAME, TARGET_TAG)
    ]

# Write the collected data to a CSV file
def write_to_csv(data):
    """Write ``(title, url)`` pairs to ``analysis.csv`` in ``html_analysis_dir``.

    Each pair produces one row with the two cells ``タイトル=<title>`` and
    ``URL=<url>``, followed by an empty row as a separator. The file is
    overwritten on every call. Nothing is written (and no directory is
    created) when *data* is empty.

    Args:
        data: Sequence of ``(title, url)`` pairs.

    Returns:
        None.
    """
    if not data:
        return  # nothing to write

    # Create the output folder on demand so open() does not fail with
    # FileNotFoundError on a fresh setup. An empty directory string means
    # "current directory", which os.makedirs() would reject, so skip it.
    if html_analysis_dir:
        os.makedirs(html_analysis_dir, exist_ok=True)

    file_path = os.path.join(html_analysis_dir, 'analysis.csv')

    # newline='' lets the csv module control line endings itself.
    with open(file_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        for title, url in data:
            writer.writerow([f"タイトル={title}", f"URL={url}"])
            writer.writerow([])  # blank separator row

# Main function
def main():
    """Load every ``.html`` file under ``html_track_dir`` and export the results.

    Walks ``html_track_dir`` recursively, opens each file whose name ends with
    ``.html`` in a Chrome WebDriver, gathers the pairs returned by
    ``extract_data`` and, after all files have been processed, passes them to
    ``write_to_csv``. Prints a completion message at the end.
    """
    from pathlib import Path  # local import: only needed to build file URLs

    extracted_data = []  # accumulates the pairs from all pages

    # The driver is used as a context manager so it is released on exit.
    with webdriver.Chrome() as driver:
        for root, _dirs, files in os.walk(html_track_dir):
            for filename in files:
                if not filename.endswith('.html'):
                    continue
                file_path = os.path.join(root, filename)
                # Build the file:// URL with pathlib instead of string
                # concatenation: as_uri() requires an absolute path,
                # percent-encodes characters such as spaces, '#' and
                # non-ASCII text, and emits the right number of slashes on
                # both Windows and POSIX.
                file_url = Path(os.path.abspath(file_path)).as_uri()
                # Wait before opening the next HTML file.
                time.sleep(SLEEP_TIME)
                driver.get(file_url)
                extracted_data.extend(extract_data(driver))

    # Write the CSV only once, after every file has been processed.
    write_to_csv(extracted_data)
    print("プログラム終了")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	import os
	import getpass
	import csv
	import time

	# Folder paths
	# NOTE(review): the arguments below are bare identifiers that are not defined
	# anywhere in the visible file; they look like placeholders to be replaced with
	# real path strings (HTML input folder / analysis output folder) -- confirm.
	html_track_dir = os.path.join(HTMLが保存されているフォルダ)
	html_analysis_dir = os.path.join(解析結果を保存するフォルダ)

	# Class name and tag name to analyze (used with By.CLASS_NAME / By.TAG_NAME)
	ANALYSIS_CLASS = '解析したいクラス'
	TARGET_TAG = '解析したいタグ'

	# Wait time for processing, passed to time.sleep() before each page load
	SLEEP_TIME = 2

	# Extract the data of the target tags found inside the target class
	def extract_data(driver):
	    """Collect ``(title, url)`` pairs from the page currently loaded in *driver*.

	    Every element matching ``ANALYSIS_CLASS`` (by class name) is searched
	    for descendants matching ``TARGET_TAG`` (by tag name). For each such
	    tag the stripped text and the value of its ``href`` attribute are
	    recorded.

	    Args:
	        driver: Object exposing ``find_elements(by, value)``.

	    Returns:
	        A list of ``(title, url)`` tuples in the order the tags were found.
	    """
	    # The original paste had lost its nesting; the loop structure is
	    # restored here as a nested comprehension.
	    return [
	        (anchor.text.strip(), anchor.get_attribute('href'))
	        for container in driver.find_elements(By.CLASS_NAME, ANALYSIS_CLASS)
	        for anchor in container.find_elements(By.TAG_NAME, TARGET_TAG)
	    ]

	# Write the collected data to a CSV file
	def write_to_csv(data):
	    """Write ``(title, url)`` pairs to ``analysis.csv`` in ``html_analysis_dir``.

	    Each pair produces one row with the two cells ``タイトル=<title>`` and
	    ``URL=<url>``, followed by an empty row as a separator. The file is
	    overwritten on every call. Nothing is written when *data* is empty.

	    Args:
	        data: Sequence of ``(title, url)`` pairs.

	    Returns:
	        None.
	    """
	    if not data:
	        return  # nothing to write

	    # Create the output folder on demand so open() does not fail with
	    # FileNotFoundError. An empty directory string means "current
	    # directory", which os.makedirs() would reject, so skip it.
	    if html_analysis_dir:
	        os.makedirs(html_analysis_dir, exist_ok=True)

	    file_path = os.path.join(html_analysis_dir, 'analysis.csv')

	    # newline='' lets the csv module control line endings itself.
	    with open(file_path, 'w', newline='', encoding='utf-8') as csv_file:
	        writer = csv.writer(csv_file)
	        for title, url in data:
	            writer.writerow([f"タイトル={title}", f"URL={url}"])
	            writer.writerow([])  # blank separator row

	# Main function
	def main():
	    """Load every ``.html`` file under ``html_track_dir`` and export the results.

	    Walks ``html_track_dir`` recursively, opens each file whose name ends
	    with ``.html`` in a Chrome WebDriver, gathers the pairs returned by
	    ``extract_data`` and, after all files have been processed, passes them
	    to ``write_to_csv``. Prints a completion message at the end.
	    """
	    from pathlib import Path  # local import: only needed to build file URLs

	    extracted_data = []  # accumulates the pairs from all pages

	    # The driver is used as a context manager so it is released on exit.
	    with webdriver.Chrome() as driver:
	        for root, _dirs, files in os.walk(html_track_dir):
	            for filename in files:
	                if not filename.endswith('.html'):
	                    continue
	                file_path = os.path.join(root, filename)
	                # as_uri() requires an absolute path, percent-encodes
	                # special characters and emits a well-formed file:// URL
	                # on both Windows and POSIX.
	                file_url = Path(os.path.abspath(file_path)).as_uri()
	                # Wait before opening the next HTML file.
	                time.sleep(SLEEP_TIME)
	                driver.get(file_url)
	                extracted_data.extend(extract_data(driver))

	    # Write the CSV only once, after every file has been processed.
	    write_to_csv(extracted_data)
	    print("プログラム終了")

	# Run main() only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()