Last active
January 5, 2023 17:37
-
-
Save nonchris/3cdf506575069991181f0f265ddb059b to your computer and use it in GitHub Desktop.
Extract notebooks from nested zip-files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env zsh
# tool to collect jupyter notebooks from the "download all zip from eCampus"
# options
# -z: zip folder with extracted content at the end
# -p: persistent -> unzipped folder will not be deleted
# -t [file]: include full content of other zip in target folder (e.g. the tutor zip)
# positional arguments (need to be positioned after the optional ones)
# arg1: zip to start with
# arg2: folder to copy notebooks to
# example usage:
# ./extract.sh -z -t 05_tutor.zip 07-Exercise.zip extracted-nbs
# most basic:
# ./extract.sh Blatt-04.zip blatt-04
# Credits:
# Chris G. (https://github.com/nonchris/)
#######################################
# Recursively walk a directory tree, unpack every zip archive found on the
# way (recursing into the extracted folder) and copy every Jupyter notebook
# (*.ipynb) into the target folder, prefixed with the team id or student name.
# Globals:
#   target_folder (read) - absolute path the notebooks are copied to
# Arguments:
#   $1 - directory to enter (relative to the current working directory)
#   $2 - id/name of the team the directory belongs to (may be empty)
# Outputs:
#   progress and error messages on stdout
# Returns:
#   0 on success, 1 if the directory could not be entered
#######################################
walk_folder () {
  local dir="$1"
  # remember where we came from so we return to exactly this directory
  local origin="$PWD"
  # All state must be local: this function is recursive, and without `local`
  # a nested call overwrites elem/current_team of the calling iteration.
  # That is why notebooks "that don't exist" used to fail to copy - after a
  # zip was walked, elem still held the last file name of the inner folder.
  local elem current_team extract_to name_of_student new_name

  # zsh aborts with "no matches found" when `*` matches nothing (e.g. a folder
  # that only holds dotfiles); make the glob expand to nothing instead
  if [[ -n "${ZSH_VERSION:-}" ]]; then
    setopt local_options null_glob
  fi

  # enter directory to inspect - never continue in the wrong directory
  if ! cd "$dir"; then
    echo "Can't CD into $dir"
    return 1
  fi

  # nothing to do for an empty directory
  if [ -z "$(ls -A .)" ]; then
    echo "$dir is empty"
    cd "$origin"
    return 0
  fi

  # loop over all elements in directory
  for elem in *; do
    # skip an unexpanded glob pattern (shells without null_glob)
    [[ -e "$elem" || -L "$elem" ]] || continue

    # check which team we are in - folders are named "Team XXXXX" with X as a digit
    # we can rely on this when downloading from ecampus, the id is used to
    # better identify the copied files
    current_team="$2"
    if [[ "$elem" == Team* ]]; then
      current_team="${elem#* }"
    fi

    # macOS metadata folder - contains nothing of interest
    if [[ "$elem" == "__MACOSX" ]]; then
      continue
    fi

    # a zip file: extract it and recurse into the extracted folder right away,
    # because that new folder is not part of the list this loop iterates over
    # (a repair attempt via `zip -FF` for broken archives was tried here once
    # and is disabled)
    if [[ "$elem" == *.zip ]]; then
      extract_to="${elem%.zip}"
      if unzip -oq "$elem" -d "$extract_to"; then
        walk_folder "$extract_to" "$current_team"
      else
        echo "ERROR: Can't unzip $elem Team: $current_team"
      fi
    fi

    # catch a jupyter notebook
    if [[ "$elem" == *.ipynb ]]; then
      echo "copy: '$PWD/$elem'"
      if [[ -z "$current_team" ]]; then
        # no team known: this is a single person's submission
        # paths look like $zip-dir-name/Abgaben/$Student_Name_s000000_Team-ID/$folders
        # we rely on "Abgaben/" and the student id (_s...) enclosing the name
        name_of_student="${PWD##*Abgaben/}"
        name_of_student="${name_of_student%_s*}"
        new_name="${name_of_student}-${elem}"
      else
        # group submission: prefix with the team id
        new_name="${current_team}-${elem}"
      fi

      if cp -- "$elem" "$target_folder/$new_name"; then
        echo "---> $new_name"
      else
        echo "Can't copy $PWD/$elem"
      fi
      echo
    fi

    # recurse into deeper folder
    if [[ -d "$elem" ]]; then
      walk_folder "$elem" "$current_team"
    fi
  done

  # go back to where we started
  cd "$origin"
}
########### | |
## START ## | |
########### | |
## PARSING ##
# Parse the optional flags; they have to precede the positional arguments.
#   -z         zip the target folder at the end
#   -p         persistent: keep the extracted working folder
#   -t [file]  additional zip whose content is moved into the target folder
zip=false
tutor_zip=false
# the extracted folder is removed afterwards unless -p (persistent) is given
clear_zip_afterwards=true
# leading ':' selects silent error reporting, handled by the \? and : arms;
# 'p' has to be listed here, otherwise -p is rejected as an invalid option
while getopts ":zpt:" opt; do
  case "$opt" in
    z) # the folder copied to shall be zipped at the end
      zip=true
      ;;
    p) # keep the unzipped folder
      clear_zip_afterwards=false
      ;;
    t) # path of the tutor file or other zip whose content shall be added
      tutor_zip="$OPTARG"
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
  esac
done
# shift the parsed options away - only the positional args are left now
shift $((OPTIND-1))
# ensure that both positional args are given - their validity is not checked here
# (diagnostics go to stderr, like the option parsing errors)
if [[ -z "$1" ]]; then
  echo "No target to start with given - exiting" >&2
  exit 1
fi
if [[ -z "$2" ]]; then
  echo "No target directory to copy to given - exiting" >&2
  exit 1
fi
## SETUP ##
# Resolve both positional arguments to absolute paths. Relative paths are
# anchored at the current directory; absolute paths are taken as they are
# (prefixing them with $(pwd) would produce a path that does not exist).
case "$1" in
  /*) start="$1" ;;              # path where start zip is located
  *)  start="$(pwd)/$1" ;;
esac
case "$2" in
  /*) target_folder="$2" ;;      # 'global' var that specifies target folder for notebooks
  *)  target_folder="$(pwd)/$2" ;;
esac
# create target directory (including missing parents) if not existing
if [[ ! -d "$target_folder" ]]; then
  if ! mkdir -p "$target_folder"; then
    echo "Can't create target directory $target_folder - exiting" >&2
    exit 1
  fi
fi
## MAIN ##
# Unpack the start zip into the current directory and walk the result.
# The archive is expected to contain a top-level folder named like the zip
# file without its .zip extension.
if ! unzip -oq "$start"; then
  echo "WARNING: unzip reported problems with $start" >&2
fi
unzipped_dir="${start##*/}"          # file name without leading directories
unzipped_dir="${unzipped_dir%.zip}"  # ... and without the extension
if [[ ! -d "$unzipped_dir" ]]; then
  echo "ERROR: expected extracted folder '$unzipped_dir' does not exist - exiting" >&2
  exit 1
fi
echo
walk_folder "${unzipped_dir}" # start recursion
if [[ "$clear_zip_afterwards" != "false" ]]; then
  # never delete the collected notebooks together with the working folder:
  # -ef also catches names that only differ in case on case-insensitive
  # file systems, the pattern catches a target nested inside the folder
  if [[ "$unzipped_dir" -ef "$target_folder" || "$target_folder/" == "$(pwd)/$unzipped_dir/"* ]]; then
    echo "Keeping $unzipped_dir - it contains the target folder" >&2
  else
    rm -r -- "${unzipped_dir:?}" # remove extracted folder
  fi
fi
## OPTIONS ##
# if path to tutor zip is given:
# unzip, move content to target_folder and delete extracted folder
if [[ "$tutor_zip" != "false" ]]; then
  echo "Tying to extract content from $tutor_zip"
  tutor_dir="${tutor_zip%.zip}" # strip the .zip extension
  tutor_dir="${tutor_dir##*/}" # keep only the part after the last slash
  # -o: overwrite without prompting, like the other unzip calls of this script;
  # report success only if every step of the chain worked
  if unzip -oq "$tutor_zip" && mv "$tutor_dir"/* "$target_folder" && rm -r "$tutor_dir"; then
    echo "Tutor content is copied to directory"
  else
    echo "ERROR: Can't copy tutor content from $tutor_zip" >&2
  fi
fi
# if content shall be zipped afterwards:
# pack the target folder into <folder name>.zip in the current directory and
# delete the folder - but only after the archive was written successfully
if [[ "$zip" != "false" ]]; then
  echo "Zipping $target_folder"
  cleaned_dir="${target_folder%/}" # get rid of last slash
  # directory that contains the target folder; zip runs from there so the
  # archive holds just "<folder name>/..." wherever the folder is located
  case "$cleaned_dir" in
    */*) parent_dir="${cleaned_dir%/*}" ;;
    *)   parent_dir="." ;;
  esac
  if [[ -z "$parent_dir" ]]; then
    parent_dir="/"
  fi
  cleaned_dir="${cleaned_dir##*/}" # cut only content after last slash
  archive="$(pwd)/${cleaned_dir}.zip"
  # remove and then zip new, it's more silent than overwriting with zip cmd
  rm -f -- "$archive" # remove old zip if zip with this name exists
  # subshell: the directory change must not leak into the rest of the script
  if (cd "$parent_dir" && zip -r "$archive" "$cleaned_dir/"); then
    rm -r -- "${target_folder:?}"
    echo "zip is created: ${cleaned_dir}.zip - original folder was deleted"
  else
    echo "ERROR: Can't zip $target_folder - folder is kept" >&2
  fi
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment