Extract URL strings from .webloc files and save them as a CSV file.
#!/usr/bin/env python3
import hashlib
# noinspection PyPep8Naming
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from htmldate import find_date

from file_utils2 import find_files, save_df_as_csv
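
# NOTE: `file_utils2` is a companion helper module not included in this
# gist. For reference, minimal stand-ins could look like the sketch below
# (assumptions: find_files(pattern, src_dir) returns the Paths under
# src_dir matching the glob pattern, and save_df_as_csv(df, fpath)
# writes an index-free UTF-8 CSV):
#
#     def find_files(pattern, src_dir):
#         return sorted(Path(src_dir).rglob(pattern))
#
#     def save_df_as_csv(df, fpath):
#         df.to_csv(fpath, index=False, encoding='utf-8')
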
CWD = Path('.')


def check_structure(webloc_fpath):
    """Check if the right elements are extracted.

    Args:
        webloc_fpath (path-like): Path to the webloc file.
    """
    root = ET.parse(webloc_fpath).getroot()
    for i, elems in enumerate(root):
        for j, elem in enumerate(elems):
            print(f'root[{i}][{j}] {elems} {elem}: {elem.text}')
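
# For a canonical webloc file, check_structure() prints something like
# (element addresses vary):
#   root[0][0] <Element 'dict' at 0x...> <Element 'key' at 0x...>: URL
#   root[0][1] <Element 'dict' at 0x...> <Element 'string' at 0x...>: https://example.com/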


def extract_url_fm_webloc(webloc_fpath):
    """Extract a URL string from the webloc file."""
    root = ET.parse(webloc_fpath).getroot()  # <xml.etree.ElementTree.Element>
    return root[0][1].text  # A URL string
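
# The indexing above assumes the canonical .webloc layout: an Apple XML
# property list whose <dict> pairs <key>URL</key> with a <string>
# holding the address, e.g.:
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <plist version="1.0">
#     <dict>
#       <key>URL</key>
#       <string>https://example.com/</string>
#     </dict>
#   </plist>
#
# Note: webloc files saved as binary plists cannot be parsed by
# ElementTree; the stdlib `plistlib` module handles both forms.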


def download_a_page(url):
    """Download a web page using a browser-like User-Agent header.

    Args:
        url (str): URL of the page to fetch.

    Returns:
        requests.Response: Response with encoding set to the
            apparent (detected) encoding.
    """
    # User-Agent strings to choose from.
    ua_dict = {
        'safari': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) '
                  'AppleWebKit/605.1.15 (KHTML, like Gecko) '
                  'Version/14.0.2 Safari/605.1.15',
        'firefox': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:85.0) '
                   'Gecko/20100101 Firefox/85.0',
        'chrome': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/81.0.4044.138 Safari/537.36',
    }
    user_agent = 'chrome'
    my_headers = {'user-agent': ua_dict.get(user_agent.lower())}
    # Download; timeout=(connect, read) in seconds.
    res = requests.get(url, headers=my_headers, timeout=(3.05, 27))
    res.encoding = res.apparent_encoding
    return res
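
# Illustrative usage (requires network access):
#   res = download_a_page('https://example.com/')
#   print(res.status_code)  # 200 on success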


def main():
    """Extract URL strings from webloc files
    and save them as a CSV file.
    """
    # Load source webloc files.
    src_dir = input('source dir? >> ')  # A dir containing webloc files.
    weblocs = find_files('*.webloc', src_dir)
    print(f'Found {len(weblocs)} webloc files')

    # Extract URLs from the source.
    result_list = [['md5_url', 'url', 'title', 'date(may differ)', 'err_msg']]
    for webloc in weblocs:
        sleep(1)  # Be polite: pause between requests.
        print(f'Processing: {webloc.name[:32]}...')
        err_msg = []
        url = extract_url_fm_webloc(webloc)  # <str>
        # Download and parse; network errors must not abort the run.
        try:
            res = download_a_page(url)
            soup = BeautifulSoup(res.text, 'lxml')
        except Exception as e:
            err_msg.append(f'SOUP::{str(e)}')
            # noinspection PyArgumentEqualDefault
            soup = BeautifulSoup('', 'lxml')  # Fallback: empty document.

        # Items
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else 'not found'
        # noinspection PyArgumentEqualDefault
        md5 = hashlib.md5(url.encode('utf-8')).hexdigest()

        # Estimated dttm of publication.
        try:
            est_dt = find_date(url) or 'not found'
        except ValueError as e:
            err_msg.append(f'FIND_DATE::{str(e)}')
            est_dt = 'not found'

        # Results
        result_list.append([md5, url, title, est_dt, ';'.join(err_msg)])

    # DataFrame for the output.
    df = pd.DataFrame(result_list[1:], columns=result_list[0])

    # Save as a CSV file.
    fname = f'urls_{datetime.now().strftime("%Y%m%d%H%M%S")}.csv'
    fpath = CWD / fname
    save_df_as_csv(df, fpath)


if __name__ == '__main__':
    main()

# Tester
# check_structure('Path/to/foobar.webloc')
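
# The output CSV has the columns
#   md5_url, url, title, date(may differ), err_msg
# with one row per webloc file, e.g. (illustrative values only):
#   <md5-of-url>,https://example.com/,Example Domain,2024-01-01,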