Skip to content

Instantly share code, notes, and snippets.

@RuizSerra
Created March 21, 2024 06:31
Show Gist options
  • Save RuizSerra/d729a96b9e0e699daa42555138ad212c to your computer and use it in GitHub Desktop.
Save RuizSerra/d729a96b9e0e699daa42555138ad212c to your computer and use it in GitHub Desktop.
Script to redact student names from `.html` comparison files.
from bs4 import BeautifulSoup
import re
import os
import zipfile
def _extract_unikey(header_text):
    """Return the part of *header_text* between ``'- '`` and ``'@'``.

    If the header does not contain that pattern the text is returned
    unchanged (same fallback as a non-matching ``re.sub``).
    """
    # Raw replacement string: '\g<1>' in a normal string literal is an
    # invalid escape sequence and triggers a warning on current Python.
    return re.sub(r'^.*- (.*)@.*', r'\g<1>', header_text)


def _save_copy(soup, html_filename, unikey, OUTPUT_DIR):
    """Write the current state of *soup* to ``OUTPUT_DIR/unikey/``."""
    # Create the directory where the file is actually written; previously
    # it was always created relative to the current working directory,
    # which only worked for OUTPUT_DIR='.'.
    target_dir = os.path.join(OUTPUT_DIR, unikey)
    os.makedirs(target_dir, exist_ok=True)

    # Work on the basename so that directory components of the input path
    # can never end up in the output name when the pattern does not match.
    # A callable replacement keeps `unikey` from being interpreted as a
    # regex replacement template.
    out_filename = re.sub(
        r'.*(\.q.\.html)',
        lambda match: unikey + match.group(1),
        os.path.basename(html_filename),
    )

    out_path = os.path.join(target_dir, out_filename)
    with open(out_path, 'w') as f:
        f.write(str(soup))
    print('Written: ', out_path)


def redact_names(html_filename, OUTPUT_DIR='.'):
    """Write two redacted copies of a comparison page, one per student.

    The page is parsed, references starting with ``../assets/`` are
    rewritten to ``./assets/``, and the ``'<a> x <b> : '`` prefix is removed
    from the title. Then, for each of the two headers selected by
    ``.split .header`` and ``.split-right .header``, a copy is saved in which
    the *other* header reads ``'OTHER STUDENT'``. Each copy goes to
    ``OUTPUT_DIR/<unikey>/<unikey>.q?.html``, where the unikey is taken from
    the header that was left intact.

    Args:
        html_filename: Path of the HTML comparison file to process.
        OUTPUT_DIR: Base directory for the per-student output directories.

    Raises:
        ValueError: If either header element is missing or has no single
            text string to read the student name from.
    """
    ## Load html file
    with open(html_filename, 'r') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Change path references (all href attributes first, then all src)
    for attr in ('href', 'src'):
        for tag in soup.find_all(**{attr: True}):
            if tag[attr].startswith('../assets/'):
                tag[attr] = './' + tag[attr][3:]

    ## Redact student names from title (skipped when the page has no title)
    title = soup.find('title')
    if title is not None:
        title.string = re.sub(r'^.* x .* : ', '', title.text)

    ## Get student names
    student_left = soup.select_one('.split .header')
    student_right = soup.select_one('.split-right .header')
    if student_left is None or student_right is None:
        raise ValueError(
            f'{html_filename}: could not find both student headers')
    if student_left.string is None or student_right.string is None:
        raise ValueError(
            f'{html_filename}: student header does not contain plain text')

    # First the copy for the right-hand student (left name hidden), then the
    # copy for the left-hand student (right name hidden).
    for owner, other in ((student_right, student_left),
                         (student_left, student_right)):
        other_original = str(other.string)
        unikey = _extract_unikey(str(owner.string))
        other.string = 'OTHER STUDENT'
        try:
            _save_copy(soup, html_filename, unikey, OUTPUT_DIR)
        finally:
            # Restore the hidden name so the next pass sees the real header.
            other.string = other_original
# ----------------------------------------------------------------------
# Base directory that is scanned for input directories.
INPUT_BASE_DIR = '../path-of-dir'

# Keep entries whose name starts with four lowercase letters, four digits and
# a hyphen. The pattern is a raw string ('\d' in a normal literal is an
# invalid escape sequence), and non-directories are skipped so that a stray
# file with a matching name does not make os.listdir() fail below.
input_dirs = [
    os.path.join(INPUT_BASE_DIR, d) for d in os.listdir(INPUT_BASE_DIR)
    if re.match(r'^[a-z]{4}\d{4}-', d)
    and os.path.isdir(os.path.join(INPUT_BASE_DIR, d))
]

# Redact every HTML file found directly inside each input directory.
for input_dir in input_dirs:
    for filename in os.listdir(input_dir):
        if filename.endswith('.html'):
            html_filename = os.path.join(input_dir, filename)
            print('Input: ', html_filename)
            redact_names(html_filename)
def _add_tree(archive, top, relative_to):
    """Add every file below *top* to *archive*, named relative to *relative_to*."""
    for current_dir, _subdirs, names in os.walk(top):
        for name in names:
            path = os.path.join(current_dir, name)
            print(path)
            archive.write(path, os.path.relpath(path, relative_to))


# Entries of the current directory named exactly four lowercase letters
# followed by four digits. The listing is taken once, before any zip file
# is created.
filtered_list = []
for entry in os.listdir('.'):
    if re.match(r'^[a-z]{4}[\d]{4}$', entry):
        filtered_list.append(entry)

# One archive per matching directory: its files at the archive root, plus the
# `assets` tree under `assets/`.
for directory in filtered_list:
    zip_filename = f'{directory}-files.zip'
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        # Copy HTML files into zip file
        _add_tree(zipf, directory, directory)
        # Copy assets into zip file
        _add_tree(zipf, 'assets', '.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment