Skip to content

Instantly share code, notes, and snippets.

@daddiofaddio
Last active May 23, 2024 09:15
Show Gist options
  • Save daddiofaddio/d8215d76e07ed6d1a9bb03ddc005426a to your computer and use it in GitHub Desktop.
Save daddiofaddio/d8215d76e07ed6d1a9bb03ddc005426a to your computer and use it in GitHub Desktop.
Python script: renames extracted JSON data files to match the sanitized names of the scraped URLs, using regexes to batch-handle name suffixes, specific first names, etc.
import os
import re
import json
# Directory containing the JSON files
json_dir = '/path/to/dir'

# Text file holding the scraped URLs
urls_file_path = '/path/to/urls.txt'

# Read URLs from urls.txt: every line is stripped of surrounding whitespace
# and lines that end up empty are dropped.
with open(urls_file_path, 'r') as file:
    urls = [entry for entry in map(str.strip, file) if entry]
# Upper-case Roman numerals I through X (the range used for name suffixes).
# Compiled once because the check runs for every processed name.
_ROMAN_NUMERAL_RE = re.compile(r'I{1,3}|IV|VI{0,3}|IX|X')


def is_roman_numeral(s):
    """Return True if *s* is exactly one of the Roman numerals I through X.

    The check is case-sensitive (only upper-case numerals are accepted) and
    the whole string must be the numeral: ``fullmatch`` is used instead of
    ``match`` with a ``$`` anchor because ``$`` also matches just before a
    trailing newline, which would let ``"IV\\n"`` through.

    Args:
        s: The string to test.

    Returns:
        bool: True for 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
        'IX' and 'X'; False for anything else, including the empty string.
    """
    return _ROMAN_NUMERAL_RE.fullmatch(s) is not None
def sanitize_for_filename(url):
    """Build a JSON filename from a URL.

    Each occurrence of the literal text 'https://www.example.com/' is replaced
    by 'www.example_com', every remaining '/' is turned into '_', and '.json'
    is appended to the result.

    NOTE(review): the replacement text carries no trailing separator, so the
    host part runs straight into the first path segment (for example
    'https://www.example.com/a/b' becomes 'www.example_coma_b.json'), and URLs
    with any other scheme or host keep their 'scheme:' prefix. Confirm this is
    the intended naming before relying on it.
    """
    host_collapsed = url.replace('https://www.example.com/', 'www.example_com')
    flattened = '_'.join(host_collapsed.split('/'))
    return f'{flattened}.json'
def find_matching_url(name, candidates=None):
    """Return the first URL that contains *name* as a whole word.

    The match is case-insensitive and *name* is treated as literal text
    (regex metacharacters are escaped). Word boundaries are required on both
    sides, so 'smit' does not match a URL containing 'smith'.

    NOTE(review): the spaces inside *name* are matched literally. If the URLs
    separate words with '-' (as the example filenames elsewhere in this
    script suggest), multi-word names will never match -- confirm the URL
    format against real data.

    Args:
        name: The (already adjusted) name to look for.
        candidates: Optional iterable of URL strings to search. Defaults to
            the module-level ``urls`` list loaded from the URLs file.

    Returns:
        The first matching URL, or None when nothing matches or *name* is
        empty/blank.
    """
    # An empty name would compile to r'\b\b', which matches at any word
    # boundary and therefore "finds" the very first URL. Refuse it instead.
    if not name or not name.strip():
        return None

    if candidates is None:
        candidates = urls

    # Prepare the name for pattern matching
    pattern = re.compile(r'\b' + re.escape(name) + r'\b', re.IGNORECASE)
    return next((url for url in candidates if pattern.search(url)), None)
def adjust_name(name):
    """Normalise a person's name before it is matched against the URLs.

    The name is split on whitespace and then, in this order:

    1. A leading initial written as a letter plus a period ('J.') loses the
       period.
    2. A trailing 'Jr', 'Jr.', 'Sr', 'Sr.' (any case) or a Roman numeral
       accepted by ``is_roman_numeral`` is dropped.
    3. A first name of 'Ann' or 'Anne' (any case) is replaced by 'a'.

    The remaining parts are joined back together with single spaces.

    Raises:
        IndexError: if *name* contains no words, or consists solely of a
            suffix so that nothing is left after step 2.
    """
    generational_suffixes = ('jr', 'jr.', 'sr', 'sr.')
    ann_variants = ('ann', 'anne')

    tokens = name.split()

    # Initial handling: 'J.' -> 'J'
    leading = tokens[0]
    if leading.endswith('.') and len(leading) == 2:
        tokens[0] = leading[0]

    # Suffix handling: read the last token only now, because for a one-word
    # name it is the token that may just have been shortened above.
    if tokens[-1].lower() in generational_suffixes or is_roman_numeral(tokens[-1]):
        del tokens[-1]

    # 'Ann' / 'Anne' are abbreviated to 'a'
    if tokens[0].lower() in ann_variants:
        tokens[0] = 'a'

    return ' '.join(tokens)
# Main renaming logic
# For every '*.json' file in json_dir: read its "name" field, normalise it,
# look up the first URL containing that name and rename the file after the URL.
# Files that cannot be processed are reported and skipped so that one bad file
# does not abort the whole batch.
for filename in os.listdir(json_dir):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(json_dir, filename)

    # Unreadable entries (including directories named '*.json') and invalid
    # JSON are skipped; json.JSONDecodeError is a subclass of ValueError.
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        print(f'Skipping "{filename}": could not read JSON ({exc})')
        continue

    # The top-level JSON value may not be an object, and "name" may be
    # missing, null or not a string.
    name = data.get('name', '') if isinstance(data, dict) else ''
    if not isinstance(name, str) or not name.strip():
        print(f'Skipping "{filename}": no usable "name" field')
        continue

    # adjust_name indexes into the split name and raises IndexError when
    # nothing is left (e.g. the name is only a suffix such as 'Jr.').
    try:
        adjusted_name = adjust_name(name)
    except IndexError:
        adjusted_name = ''

    matching_url = find_matching_url(adjusted_name) if adjusted_name else None
    if not matching_url:
        print(f'No matching URL found for "{name}"')
        continue

    new_filename = sanitize_for_filename(matching_url)
    if new_filename == filename:
        # Already carries its final name; nothing to do.
        continue

    new_file_path = os.path.join(json_dir, new_filename)
    # os.rename silently replaces an existing file on POSIX; never clobber
    # data when two files resolve to the same URL.
    if os.path.lexists(new_file_path):
        print(f'Skipping "{filename}": target "{new_filename}" already exists')
        continue

    os.rename(file_path, new_file_path)
    print(f'Renamed "{filename}" to "{new_filename}"')
def rename_files_to_final_format(directory, prefix='www.example.com_url_string_'):
    """Prefix scraped-result files in *directory* with *prefix*.

    Renames files from formats like '98104-wa-robert-mitchell-10136.html.json' and
    'ernest-matthews-8046.html.json' to 'www.example.com_url_string_98104-wa-robert-mitchell-10136.html.json'
    and 'www.example.com_url_string_ernest-matthews-8046.html.json'.

    Only names that match one of those two formats in full (case-insensitive)
    are touched. A file is left alone, with a message, when the prefixed name
    already exists, because ``os.rename`` would silently replace the existing
    file on POSIX and raise on Windows.

    Args:
        directory: Path of the directory whose entries are renamed in place.
        prefix: Text prepended to each matching filename. Defaults to the
            prefix this script has always used.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Optional '<5-digit zip>-<2-letter state>-' head, then a hyphenated name,
    # a numeric id and the '.html.json' extension.
    pattern = re.compile(r'(?:\d{5}-[a-z]{2}-)?[a-z-]+-\d+\.html\.json', re.I)

    for filename in os.listdir(directory):
        # fullmatch: '$' with match() would also accept a trailing newline.
        if not pattern.fullmatch(filename):
            continue

        # Construct new filename format
        new_filename = f'{prefix}{filename}'
        target_path = os.path.join(directory, new_filename)

        if os.path.lexists(target_path):
            print(f'Skipping "{filename}": "{new_filename}" already exists')
            continue

        os.rename(os.path.join(directory, filename), target_path)
        print(f'Renamed "{filename}" to "{new_filename}"')
# Run the prefixing pass over the same directory (json_dir) once the
# URL-based renaming loop above has finished.
rename_files_to_final_format(json_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment