Skip to content

Instantly share code, notes, and snippets.

@nonchris
Last active January 5, 2023 17:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nonchris/3cdf506575069991181f0f265ddb059b to your computer and use it in GitHub Desktop.
Save nonchris/3cdf506575069991181f0f265ddb059b to your computer and use it in GitHub Desktop.
Extract notebooks from nested zip-files
#!/usr/bin/env zsh
# tool to collect jupyter notebooks from the "download all zip from eCampus"
# options
# -z: zip folder with extracted content at the end
# -p: persistent -> the unzipped folder is not deleted at the end
# -t [file]: include full content of other zip in target folder (e.g. the tutor zip)
# positional arguments (must be placed after the optional ones)
# arg1: zip to start with
# arg2: folder to copy notebooks to
# example usage:
# ./extract.sh -z -t 05_tutor.zip 07-Exercise.zip extracted-nbs
# most basic:
# ./extract.sh Blatt-04.zip blatt-04
# Credits:
# Chris G. (https://github.com/nonchris/)
# Recursively walks through all folders of a given directory and collects
# every jupyter notebook on its way.
#
# Every zip file that is found is extracted next to itself (into a folder named
# like the archive without ".zip") and the extracted folder is walked as well.
# Each notebook is copied to "$target_folder" and prefixed with the team number
# it was found in or - for solo submissions - with the student's name.
#
# Globals:
#   target_folder (read) - directory the notebooks are copied to
# Arguments:
#   $1 - directory to cd into
#   $2 - (optional) ID/name of the team the sub-dirs are from, used to name
#        the copied files
# Returns:
#   1 if the directory can't be entered, otherwise the status of changing back
#   to the directory the function was called from
walk_folder () {
  # Everything the loop touches must be local: the function recurses, and a
  # shared `elem` would be overwritten by the inner call, so the outer loop
  # would afterwards test (and try to copy) a name that only exists deeper down.
  local elem current_team extract_to name_of_student new_name
  local origin_dir="$PWD" # where to return to when this level is done

  # echo "enter: $1"
  # never inspect the wrong directory: give up on this branch if cd fails
  if ! cd "$1"; then
    echo "Can't CD into $1" >&2
    return 1
  fi

  # nothing to do in an empty directory
  if [[ -z "$(ls -A .)" ]]; then
    echo "$1 is empty"
    cd "$origin_dir"
    return
  fi

  # loop over all elements in directory
  for elem in *; do
    # echo " elem: $elem, $PWD"
    # check which team we are in - folders are named "Team XXXXX" with X as a digit
    # we can rely on this when downloading from ecampus, the ID is used to
    # better identify the copied files
    current_team="${2-}"
    if [[ "$elem" == Team* ]]; then
      current_team="${elem#* }" # keep only the part after the first space
    fi
    # echo "$current_team" TEAM

    # skip the __MACOSX folder (presumably archive metadata added by macOS)
    if [[ "$elem" == "__MACOSX" ]]; then
      continue
    fi

    # a new zip file: extract it and recurse into the extracted folder right
    # away, because that folder is not part of the list this loop iterates over
    if [[ "$elem" == *.zip ]]; then
      extract_to="${elem%.zip}"
      if unzip -oq "$elem" -d "$extract_to"; then
        walk_folder "$extract_to" "$current_team"
      else
        # disabled repair attempt, kept for reference:
        # zip -FF "$elem" --out "$elem-fixed" -fz && unzip -o "$elem-fixed" && ...
        echo "ERROR: Can't unzip $elem Team: $current_team" >&2
      fi
    fi

    # catch a jupyter notebook
    if [[ "$elem" == *.ipynb ]]; then
      echo "copy: '$PWD/$elem'"
      if [[ -z "$current_team" ]]; then
        # no team -> single person's submission: take the name from the path
        # paths look like $zip-dir-name/Abgaben/$Student_Name_s000000_Team-ID/$folders
        # we rely on "Abgaben/" and the student ID ("_s...") enclosing the name
        name_of_student="${PWD##*Abgaben/}"
        name_of_student="${name_of_student%_s*}"
        new_name="$name_of_student-$elem"
      else
        # group submission: prefix with the team ID
        new_name="$current_team-$elem"
      fi

      if cp -- "$elem" "$target_folder/$new_name"; then
        echo "---> $new_name"
      else
        echo "Can't copy $PWD/$elem" >&2
      fi
      echo
    fi

    # recurse into deeper folder
    if [[ -d "$elem" ]]; then
      # echo "Recursing to ${elem}!"
      walk_folder "$elem" "$current_team"
    fi
  done

  cd "$origin_dir" # go up again
}
###########
## START ##
###########
## PARSING ##
# parse options
#   zip                  - "true" if the folder copied to shall be zipped at the end (-z)
#   tutor_zip            - path of another zip whose content is added to the target
#                          folder (-t file), "false" if not given
#   clear_zip_afterwards - anything but "false" makes the script remove the
#                          extracted start folder at the end; this is the default,
#                          -p (persistent) turns it off
zip=false
tutor_zip=false
clear_zip_afterwards=true
# leading ':' = silent mode, so the '\?' and ':' arms below do the error reporting
# 'p' must be listed here, otherwise getopts rejects -p as an invalid option
while getopts ":zpt:" opt; do
  case "$opt" in
    z) # if folder copied to shall be zipped at the end
      zip=true
      ;;
    p) # persistent: keep the unzipped folder
      clear_zip_afterwards=false
      ;;
    t) # the tutor file or other zip that shall be copied
      tutor_zip=$OPTARG
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
  esac
done
# drop the options that were already parsed - only the positional args remain
shift "$(( OPTIND - 1 ))"
# both positional args are mandatory - only their presence is checked here,
# not whether they point to something valid
[[ -n $1 ]] || {
  echo "No target to start with given - exiting"
  exit 1
}
[[ -n $2 ]] || {
  echo "No target directory to copy to given - exiting"
  exit 1
}
## SETUP ##
# path where the start zip is located
# an absolute path is taken as it is, a relative one is resolved against the
# current directory (blindly prefixing the cwd would break absolute paths)
case "$1" in
  /*) start="$1" ;;
  *) start="$(pwd)/$1" ;;
esac
# 'global' var that specifies the target folder for the notebooks
# it has to be absolute because walk_folder changes the working directory;
# the target is always interpreted relative to the current directory
target_folder="$(pwd)/$2"
# create target directory if not existing
if [[ ! -d "$2" ]]; then
  mkdir "$2"
fi
## MAIN ##
# extract the given file, the zip is extracted to the local dir
unzip -oq "$start"
# name of the extracted folder: the file name of the zip without ".zip"
unzipped_dir="${start%.zip}"
unzipped_dir="${unzipped_dir##*/}"
echo
# kick off the recursion at the extracted folder
walk_folder "${unzipped_dir}"
# get rid of the extracted folder unless clearing is switched off
[[ $clear_zip_afterwards == "false" ]] || rm -r "${unzipped_dir}"
## OPTIONS ##
# if path to tutor zip is given:
# unzip, move content to target_folder and delete extracted folder
# (an unset tutor_zip counts as "not given")
if [[ ${tutor_zip:-false} != "false" ]]; then
  echo "Trying to extract content from $tutor_zip"
  tutor_dir="${tutor_zip%.zip}" # get rid of the .zip extension
  tutor_dir="${tutor_dir##*/}"  # cut only content after last slash
  # report success only if every single step worked
  if unzip -q "$tutor_zip" && mv -- "$tutor_dir"/* "$target_folder" && rm -r -- "$tutor_dir"; then
    echo "Tutor content is copied to directory"
  else
    echo "ERROR: Can't copy content of $tutor_zip to $target_folder" >&2
  fi
fi
# if content shall be zipped afterwards
# (an unset zip flag counts as "don't zip")
if [[ ${zip:-false} != "false" ]]; then
  # never build paths for zip/rm from an empty target
  : "${target_folder:?target folder is not set}"
  echo "Zipping $target_folder"
  target_path="${target_folder%/}" # get rid of last slash
  cleaned_dir="${target_path##*/}" # cut only content after last slash
  # directory that contains the target folder; zip has to run from there so
  # that the archive holds "$cleaned_dir/..." even if the target is nested
  case "$target_path" in
    */*) parent_dir="${target_path%/*}" ;;
    *) parent_dir="." ;;
  esac
  archive="$(pwd)/${cleaned_dir}.zip" # the zip is created in the current directory
  # remove and then zip new, it's more silent than overwriting with zip cmd
  rm -f -- "$archive" # remove old zip if zip with this name exists
  # the folder is deleted ONLY if the archive was written - otherwise a failing
  # zip would cost us all collected notebooks
  if (cd "${parent_dir:-/}" && zip -r "$archive" "$cleaned_dir/"); then
    rm -r -- "$target_folder"
    echo "zip is created: ${cleaned_dir}.zip - original folder was deleted"
  else
    echo "ERROR: Can't zip $target_folder - the folder was kept" >&2
    exit 1
  fi
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment