# RuizSerra/redact-student-names.py

## redact-student-names.py
from bs4 import BeautifulSoup
import re
import os
import zipfile

def _write_redacted_copy(soup, owner_header, html_filename, output_dir):
    """Write the current state of *soup* into the owner's output folder.

    The unikey is taken from *owner_header* (the text between the last
    "- " and the "@"). It names the sub-directory created under
    *output_dir* and prefixes the output file name.
    """
    unikey = re.sub(r'^.*- (.*)@.*', r'\g<1>', owner_header)

    # Create the folder where the file is actually written, i.e. below
    # output_dir and not below the current working directory.
    target_dir = f'{output_dir}/{unikey}'
    os.makedirs(target_dir, exist_ok=True)

    # Keep only the ".q?.html" suffix of the input name and prefix it with
    # the unikey. Working on the basename keeps the input's directory path
    # out of the output name when the suffix is absent; the callable
    # replacement stops unikey being parsed as a replacement template.
    out_filename = re.sub(
        r'.*(\.q.\.html)',
        lambda match: unikey + match.group(1),
        os.path.basename(html_filename),
    )
    out_path = f'{target_dir}/{out_filename}'
    with open(out_path, 'w') as f:
        f.write(str(soup))
    print('Written: ', out_path)


def redact_names(html_filename, OUTPUT_DIR='.'):
    """Write two copies of an HTML page, each hiding one student's name.

    The page is expected to contain one header under ``.split`` and one
    under ``.split-right``. For each student a copy is written to
    ``<OUTPUT_DIR>/<unikey>/<unikey>.q?.html`` in which the *other*
    student's header reads 'OTHER STUDENT'. In both copies the leading
    "<name> x <name> : " part of the title is removed and '../assets/'
    references are rewritten to './assets/'.

    Args:
        html_filename: Path of the HTML file to process.
        OUTPUT_DIR: Directory under which the per-student folders are
            created. Defaults to the current directory.
    """
    ## Load html file
    with open(html_filename, 'r') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Change path references: '../assets/...' -> './assets/...'
    for attr in ('href', 'src'):
        for tag in soup.find_all(**{attr: True}):
            if tag[attr].startswith('../assets/'):
                tag[attr] = './' + tag[attr][3:]

    ## Redact student names from title (nothing to do without a title)
    title = soup.find('title')
    if title is not None:
        title.string = re.sub(r'^.* x .* : ', '', title.text)

    ## Get student names
    student_left = soup.select_one('.split .header')
    student_left_original = student_left.string

    student_right = soup.select_one('.split-right .header')
    student_right_original = student_right.string

    ## Redact student left; this copy goes to the right-hand student
    student_left.string = 'OTHER STUDENT'
    _write_redacted_copy(
        soup, student_right_original, html_filename, OUTPUT_DIR
    )

    ## Redact student right; this copy goes to the left-hand student
    student_left.string = student_left_original
    student_right.string = 'OTHER STUDENT'
    _write_redacted_copy(
        soup, student_left_original, html_filename, OUTPUT_DIR
    )

    ## Reset
    student_right.string = student_right_original


# ----------------------------------------------------------------------
INPUT_BASE_DIR = '../path-of-dir'

# Input folders start with four lowercase letters, four digits and a dash.
# Plain files with such a name are skipped, since they cannot be listed.
input_dirs = [
    os.path.join(INPUT_BASE_DIR, d) for d in os.listdir(INPUT_BASE_DIR)
    if re.match(r'^[a-z]{4}\d{4}-', d)
    and os.path.isdir(os.path.join(INPUT_BASE_DIR, d))
]

# Produce the redacted copies for every HTML file of every input folder.
for input_dir in input_dirs:
    for filename in os.listdir(input_dir):
        if filename.endswith('.html'):
            html_filename = os.path.join(input_dir, filename)
            print('Input: ', html_filename)
            redact_names(html_filename)

# Output folders in the current directory are named exactly like a unikey
# (four lowercase letters followed by four digits).
filtered_list = [
    d for d in os.listdir('.')
    if re.match(r'^[a-z]{4}[\d]{4}$', d) and os.path.isdir(d)
]

for directory in filtered_list:
    zip_filename = f'{directory}-files.zip'

    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        # Each pair is (tree to walk, base the archive names are relative
        # to): the student's files land at the archive root, the shared
        # assets keep their 'assets/...' path.
        for tree, arc_base in ((directory, directory), ('assets', '.')):
            for root, dirs, files in os.walk(tree):
                for file in files:
                    file_path = os.path.join(root, file)
                    print(file_path)
                    zipf.write(
                        file_path, os.path.relpath(file_path, arc_base)
                    )
	from bs4 import BeautifulSoup
	import re
	import os
	import zipfile

	def _write_redacted_copy(soup, owner_header, html_filename, output_dir):
	    """Write the current state of *soup* into the owner's output folder.

	    The unikey is taken from *owner_header* (the text between the last
	    "- " and the "@"). It names the sub-directory created under
	    *output_dir* and prefixes the output file name.
	    """
	    # The quantifiers are required: without them only one character
	    # before "- " and one before "@" would be matched.
	    unikey = re.sub(r'^.*- (.*)@.*', r'\g<1>', owner_header)

	    # Create the folder where the file is actually written, i.e. below
	    # output_dir and not below the current working directory.
	    target_dir = f'{output_dir}/{unikey}'
	    os.makedirs(target_dir, exist_ok=True)

	    # Keep only the ".q?.html" suffix of the input name and prefix it
	    # with the unikey. Working on the basename keeps the input's
	    # directory path out of the output name when the suffix is absent.
	    out_filename = re.sub(
	        r'.*(\.q.\.html)',
	        lambda match: unikey + match.group(1),
	        os.path.basename(html_filename),
	    )
	    out_path = f'{target_dir}/{out_filename}'
	    with open(out_path, 'w') as f:
	        f.write(str(soup))
	    print('Written: ', out_path)


	def redact_names(html_filename, OUTPUT_DIR='.'):
	    """Write two copies of an HTML page, each hiding one student's name.

	    The page is expected to contain one header under ``.split`` and one
	    under ``.split-right``. For each student a copy is written to
	    ``<OUTPUT_DIR>/<unikey>/<unikey>.q?.html`` in which the *other*
	    student's header reads 'OTHER STUDENT'. In both copies the leading
	    "<name> x <name> : " part of the title is removed and '../assets/'
	    references are rewritten to './assets/'.

	    Args:
	        html_filename: Path of the HTML file to process.
	        OUTPUT_DIR: Directory under which the per-student folders are
	            created. Defaults to the current directory.
	    """
	    ## Load html file
	    with open(html_filename, 'r') as f:
	        soup = BeautifulSoup(f.read(), 'html.parser')

	    # Change path references: '../assets/...' -> './assets/...'
	    for attr in ('href', 'src'):
	        for tag in soup.find_all(**{attr: True}):
	            if tag[attr].startswith('../assets/'):
	                tag[attr] = './' + tag[attr][3:]

	    ## Redact student names from title (nothing to do without a title)
	    title = soup.find('title')
	    if title is not None:
	        title.string = re.sub(r'^.* x .* : ', '', title.text)

	    ## Get student names
	    student_left = soup.select_one('.split .header')
	    student_left_original = student_left.string

	    student_right = soup.select_one('.split-right .header')
	    student_right_original = student_right.string

	    ## Redact student left; this copy goes to the right-hand student
	    student_left.string = 'OTHER STUDENT'
	    _write_redacted_copy(
	        soup, student_right_original, html_filename, OUTPUT_DIR
	    )

	    ## Redact student right; this copy goes to the left-hand student
	    student_left.string = student_left_original
	    student_right.string = 'OTHER STUDENT'
	    _write_redacted_copy(
	        soup, student_left_original, html_filename, OUTPUT_DIR
	    )

	    ## Reset
	    student_right.string = student_right_original


	# ----------------------------------------------------------------------
	INPUT_BASE_DIR = '../path-of-dir'

	# Input folders start with four lowercase letters, four digits and a
	# dash. Plain files with such a name are skipped, since they cannot be
	# listed.
	input_dirs = [
	    os.path.join(INPUT_BASE_DIR, d) for d in os.listdir(INPUT_BASE_DIR)
	    if re.match(r'^[a-z]{4}\d{4}-', d)
	    and os.path.isdir(os.path.join(INPUT_BASE_DIR, d))
	]

	# Produce the redacted copies for every HTML file of every input folder.
	for input_dir in input_dirs:
	    for filename in os.listdir(input_dir):
	        if filename.endswith('.html'):
	            html_filename = os.path.join(input_dir, filename)
	            print('Input: ', html_filename)
	            redact_names(html_filename)

	# Output folders in the current directory are named exactly like a
	# unikey (four lowercase letters followed by four digits).
	filtered_list = [
	    d for d in os.listdir('.')
	    if re.match(r'^[a-z]{4}[\d]{4}$', d) and os.path.isdir(d)
	]

	for directory in filtered_list:
	    zip_filename = f'{directory}-files.zip'

	    with zipfile.ZipFile(zip_filename, 'w') as zipf:
	        # Each pair is (tree to walk, base the archive names are
	        # relative to): the student's files land at the archive root,
	        # the shared assets keep their 'assets/...' path.
	        for tree, arc_base in ((directory, directory), ('assets', '.')):
	            for root, dirs, files in os.walk(tree):
	                for file in files:
	                    file_path = os.path.join(root, file)
	                    print(file_path)
	                    zipf.write(
	                        file_path, os.path.relpath(file_path, arc_base)
	                    )