Last active
July 14, 2024 07:16
-
-
Save yeiichi/dfe85f140e5b2c4a40b316a23d5ba274 to your computer and use it in GitHub Desktop.
Extract url strings from the webloc files and save them as a CSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import hashlib | |
# noinspection PyPep8Naming | |
import xml.etree.ElementTree as ET | |
from datetime import datetime | |
from pathlib import Path | |
from time import sleep | |
import pandas as pd | |
import requests | |
from bs4 import BeautifulSoup | |
from htmldate import find_date | |
from file_utils2 import find_files, save_df_as_csv | |
CWD = Path('.') | |
def check_structure(webloc_fpath):
    """Print every second-level plist element for manual inspection.

    Walks two levels below the XML root and shows each element's
    position, tag objects, and text so the right indices can be verified.

    Args:
        webloc_fpath (path-like): Path to the webloc file.
    """
    tree_root = ET.parse(webloc_fpath).getroot()
    for i, child in enumerate(tree_root):
        for j, grandchild in enumerate(child):
            print(f'root[{i}][{j}] {child} {grandchild}: {grandchild.text}')
def extract_url_fm_webloc(webloc_fpath):
    """Extract the URL string from a .webloc (XML plist) file.

    Args:
        webloc_fpath (path-like): Path to the webloc file.

    Returns:
        str: The value stored under the plist's ``URL`` key.

    Raises:
        ValueError: If the plist contains no ``URL`` key.
    """
    root = ET.parse(webloc_fpath).getroot()
    # A plist <dict> serializes as alternating <key>/<value> children.
    # Scan for the <key>URL</key> element and return its paired value's
    # text instead of relying on fixed positions (the old root[0][1]
    # broke, or silently returned the wrong value, whenever the dict
    # held extra keys or a different key order).
    for dict_elem in root.iter('dict'):
        children = list(dict_elem)
        for key_elem, val_elem in zip(children[::2], children[1::2]):
            if key_elem.tag == 'key' and key_elem.text == 'URL':
                return val_elem.text  # A URL string
    raise ValueError(f'No URL key found in {webloc_fpath}')
def download_a_page(url):
    """Fetch *url* pretending to be a desktop browser.

    Sends a GET request with a Chrome User-Agent header, then sets the
    response encoding to the one guessed from the content.

    Args:
        url (str): The page to download.

    Returns:
        requests.Response: The completed response object.
    """
    # User Agents Set up.
    user_agents = {
        'safari': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) '
                  'AppleWebKit/605.1.15 (KHTML, like Gecko) '
                  'Version/14.0.2 Safari/605.1.15',
        'firefox': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:85.0) '
                   'Gecko/20100101 Firefox/85.0',
        'chrome': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/81.0.4044.138 Safari/537.36',
    }
    chosen = 'chrome'
    request_headers = {'user-agent': user_agents.get(chosen.lower())}
    # Download: (connect, read) timeouts keep a dead host from hanging us.
    response = requests.get(url, headers=request_headers, timeout=(3.05, 27))
    response.encoding = response.apparent_encoding
    return response
def main():
    """Extract url strings from the webloc files
    and save them as a CSV file.

    Workflow: prompt for a directory, collect its *.webloc files,
    extract each URL, download the page for its <title>, estimate the
    publication date, and save everything as a timestamped CSV.
    """
    # Load source webloc files.
    src_dir = input('source dir? >> ')  # A DIR containing webloc files.
    weblocs = find_files('*.webloc', src_dir)
    print(f'Found {len(weblocs)} webloc files')
    # Extract URLs from the source. First row is the CSV header.
    result_list = [['md5_url', 'url', 'title', 'date(may differ)', 'err_msg']]
    for webloc in weblocs:
        sleep(1)  # Throttle: be polite to the remote hosts.
        print(f'Processing: {webloc.name[:32]}...')
        err_msg = []
        url = extract_url_fm_webloc(webloc)  # <str>
        # MD5 of the URL: a stable, filename-safe row identifier.
        # noinspection PyArgumentEqualDefault
        md5 = hashlib.md5(url.encode('utf-8')).hexdigest()
        # Download and parse. The fetch itself must sit inside the try:
        # one unreachable URL (timeout, DNS error) must not abort the
        # whole batch — previously only the parsing was guarded.
        try:
            res = download_a_page(url)
            soup = BeautifulSoup(res.text, 'lxml')
        except Exception as e:  # network or parser failure -> note & go on
            err_msg.append(f'SOUP::{str(e)}')
            # noinspection PyArgumentEqualDefault
            soup = BeautifulSoup('', 'lxml')  # Fallback: empty document
        # Items. <title> may be absent or empty; guard both before .strip().
        title_tag = soup.find('title')
        title = title_tag.string.strip() if title_tag and title_tag.string else 'not found'
        # Estimated dttm of publication. htmldate fetches the URL itself,
        # so shield against any failure, not just ValueError.
        try:
            est_dt = find_date(url)
        except Exception as e:
            err_msg.append(f'FIND_DATE::{str(e)}')
            est_dt = 'not found'
        # Results
        result_list.append([md5, url, title, est_dt, ';'.join(err_msg)])
    # DataFrame for the output
    df = pd.DataFrame(result_list[1:], columns=result_list[0])
    # Save as a CSV file with a timestamped name.
    fname = f'urls_{datetime.now().strftime("%Y%m%d%H%M%S")}.csv'
    fpath = CWD / fname
    save_df_as_csv(df, fpath)
# Script entry point: run the batch extraction when executed directly.
if __name__ == '__main__':
    main()
# Tester
# check_structure('Path/to/foobar.webloc')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment