Last active
May 23, 2024 09:15
-
-
Save daddiofaddio/d8215d76e07ed6d1a9bb03ddc005426a to your computer and use it in GitHub Desktop.
Python script - Renames extracted JSON data files to match the sanitized names of the scraped URLs; uses regex for batch handling of name suffixes, specific first names, etc.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import json | |
# Locations of the scraped JSON files and of the text file listing the URLs.
json_dir = '/path/to/dir'
urls_file_path = '/path/to/urls.txt'

# Read URLs from urls.txt: keep every non-blank line, trimmed of whitespace.
with open(urls_file_path, 'r') as url_source:
    trimmed_lines = (raw_line.strip() for raw_line in url_source)
    urls = [entry for entry in trimmed_lines if entry]
def is_roman_numeral(s):
    """Report whether ``s`` is an uppercase Roman numeral from I through X."""
    numeral_pattern = r'^(I{1,3}|II|III|IV|V|VI|VII|VIII|IX|X)$'
    if re.match(numeral_pattern, s) is None:
        return False
    return True
def sanitize_for_filename(url):
    """Convert a URL into a flat ``.json`` filename.

    A leading ``https://`` or ``http://`` scheme is dropped, every ``/`` is
    replaced with ``_`` and ``.json`` is appended, for example
    ``https://www.example.com/a/b.html`` -> ``www.example.com_a_b.html.json``.

    Args:
        url: The URL to convert.

    Returns:
        The derived filename.
    """
    # Strip the scheme for any host; leaving it in would put "https:" into
    # the filename, and ':' is not a valid filename character on Windows.
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    return url.replace('/', '_') + '.json'
def find_matching_url(name, candidates=None):
    """Return the first URL that contains ``name`` as whole words.

    Matching is case-insensitive. The words of ``name`` must appear in order,
    separated by one or more non-alphanumeric characters, so a name such as
    ``"robert mitchell"`` matches a URL slug such as ``robert-mitchell``.

    Args:
        name: The (already adjusted) name to look for.
        candidates: Iterable of URLs to search. Defaults to the module-level
            ``urls`` list.

    Returns:
        The first matching URL, or ``None`` when ``name`` is empty or no URL
        matches.
    """
    words = name.split() if name else []
    if not words:
        # An empty pattern would degenerate to r'\b\b' and match almost any
        # URL, silently pairing the file with the first URL in the list.
        return None
    if candidates is None:
        candidates = urls
    # Escape each word separately and allow any run of separators between
    # them, since URLs rarely contain literal spaces.
    pattern = re.compile(
        r'\b' + r'[\W_]+'.join(re.escape(word) for word in words) + r'\b',
        re.IGNORECASE,
    )
    for url in candidates:
        if pattern.search(url):
            return url
    return None
def adjust_name(name):
    """Normalise a person's name for matching against the URL list.

    The following adjustments are applied, in order:

    * a leading initial such as ``"J."`` loses its period;
    * a trailing ``Jr``/``Sr`` (with or without period, any case) or a Roman
      numeral is dropped, provided at least one other name part remains;
    * a first name of ``Ann`` or ``Anne`` (any case) is replaced by ``"a"``.

    Args:
        name: The raw name; may be empty or ``None``.

    Returns:
        The adjusted name with parts joined by single spaces, or ``''`` when
        ``name`` is empty, ``None`` or only whitespace.
    """
    name_parts = name.split() if name else []
    if not name_parts:
        return ''

    # Initial handling: "J." -> "J"
    first = name_parts[0]
    if len(first) == 2 and first.endswith('.'):
        name_parts[0] = first[0]

    # Only strip a suffix when something is left afterwards; otherwise a
    # one-word name such as "X" would be emptied out entirely.
    if len(name_parts) > 1:
        last = name_parts[-1]
        if last.lower() in ('jr', 'jr.', 'sr', 'sr.') or is_roman_numeral(last):
            name_parts.pop()

    # 'a' is the shorthand used for 'Ann' or 'Anne'.
    if name_parts[0].lower() in ('ann', 'anne'):
        name_parts[0] = 'a'

    return ' '.join(name_parts)
# Main renaming logic: rename each JSON file in json_dir after the URL that
# matches the "name" value stored inside the file.
for filename in os.listdir(json_dir):
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(json_dir, filename)

    # One unreadable or malformed file should not abort the whole batch.
    # json.JSONDecodeError and UnicodeDecodeError both subclass ValueError.
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
    except (OSError, ValueError) as err:
        print(f'Skipping "{filename}": {err}')
        continue

    name = data.get('name', '') if isinstance(data, dict) else ''
    if not isinstance(name, str) or not name.strip():
        print(f'No usable "name" field in "{filename}"')
        continue

    adjusted_name = adjust_name(name)
    matching_url = find_matching_url(adjusted_name)
    if not matching_url:
        print(f'No matching URL found for "{name}"')
        continue

    new_filename = sanitize_for_filename(matching_url)
    new_file_path = os.path.join(json_dir, new_filename)
    # On POSIX os.rename silently replaces an existing file, so two files
    # resolving to the same URL would destroy one another.
    if os.path.exists(new_file_path):
        print(f'Skipping "{filename}": "{new_filename}" already exists')
        continue
    os.rename(file_path, new_file_path)
    print(f'Renamed "{filename}" to "{new_filename}"')
def rename_files_to_final_format(directory, prefix='www.example.com_url_string_'):
    """Prefix files that still carry the bare scraped name.

    Files named like ``98104-wa-robert-mitchell-10136.html.json`` or
    ``ernest-matthews-8046.html.json`` are renamed to ``<prefix><filename>``,
    e.g. ``www.example.com_url_string_ernest-matthews-8046.html.json``.
    Entries that do not match either format are left untouched, and a file is
    skipped (with a message) when its target name already exists.

    Args:
        directory: Directory whose entries are examined.
        prefix: Text prepended to every matching filename.
    """
    # Optional "<5-digit zip>-<2-letter state>-" lead-in, then a hyphenated
    # name, a numeric id and the ".html.json" extension.
    pattern = re.compile(r'^(?:\d{5}-[a-z]{2}-)?[a-z-]+-\d+\.html\.json$', re.I)
    for filename in os.listdir(directory):
        if not pattern.match(filename):
            continue
        new_filename = f'{prefix}{filename}'
        target = os.path.join(directory, new_filename)
        # os.rename overwrites an existing target on POSIX; never clobber.
        if os.path.lexists(target):
            print(f'Skipped "{filename}": "{new_filename}" already exists')
            continue
        os.rename(os.path.join(directory, filename), target)
        print(f'Renamed "{filename}" to "{new_filename}"')
# Last step of the script: run the final-format rename over json_dir.
rename_files_to_final_format(directory=json_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment