Created
April 25, 2025 09:13
-
-
Save limcheekin/013768ca293eb928f04188275ed7b168 to your computer and use it in GitHub Desktop.
Convert docx to markdown using pandoc
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Convert a DOCX file to Markdown with pandoc and extract its embedded media.
#
# Usage: <script> input.docx
#
# Outputs:
#   <base>.md      - the generated Markdown file
#   <base>_media   - directory passed to pandoc's --extract-media
# where <base> is the input path with the extension of its final path
# component removed.
#
# Exit status: 1 on usage error, missing input file, or when the output path
# would equal the input path; 127 when pandoc is not available; otherwise the
# status of pandoc.
#
# Post-processing note (kept from the original header): the RegEx
#   \s*\{width="[^"]*"\s*height="[^"]*"\}
# matches an entire {width="..." height="..."} attribute block, so it can be
# used in a search-and-replace on the generated Markdown to replace those
# blocks.

# Exit on any failing command, on use of an unset variable, and when any
# stage of a pipeline fails.
set -euo pipefail

# Check if an input file was provided. "${1:-}" keeps this test working
# under `set -u` when the script is called without arguments.
if [[ -z "${1:-}" ]]; then
  echo "Usage: $0 input.docx" >&2
  exit 1
fi

# Assign the input file
INPUT_FILE="$1"

# Verify that the input file exists (and is a regular file)
if [[ ! -f "$INPUT_FILE" ]]; then
  echo "Error: File '$INPUT_FILE' not found." >&2
  exit 1
fi

# Fail early with a clear message instead of a bare "command not found".
if ! command -v pandoc >/dev/null 2>&1; then
  echo "Error: pandoc is not installed or not on PATH." >&2
  exit 127
fi

# A relative path that starts with '-' would be parsed by pandoc as an
# option; anchoring it to the current directory makes it unambiguous.
if [[ "$INPUT_FILE" == -* ]]; then
  INPUT_FILE="./$INPUT_FILE"
fi

# Extract the base name (without extension) for output naming.
# The extension is stripped from the final path component only, so a dot in
# a directory name (e.g. ./dir.v2/report) is never mistaken for an extension.
INPUT_DIR=""
if [[ "$INPUT_FILE" == */* ]]; then
  INPUT_DIR="${INPUT_FILE%/*}/"
fi
INPUT_NAME="${INPUT_FILE##*/}"
STEM="${INPUT_NAME%.*}"
# A dotfile such as ".docx" would otherwise produce an empty stem.
if [[ -z "$STEM" ]]; then
  STEM="$INPUT_NAME"
fi
BASE_NAME="${INPUT_DIR}${STEM}"

# Define output Markdown file and media directory
OUTPUT_MD="${BASE_NAME}.md"
MEDIA_DIR="${BASE_NAME}_media"

# Refuse to run when the output would overwrite the input (input ends in .md).
if [[ "$OUTPUT_MD" == "$INPUT_FILE" ]]; then
  echo "Error: Output file '$OUTPUT_MD' would overwrite the input file." >&2
  exit 1
fi

# Run Pandoc to convert DOCX to Markdown and extract media
pandoc "$INPUT_FILE" -o "$OUTPUT_MD" --extract-media="$MEDIA_DIR"

echo "Conversion complete:"
echo "Markdown file: $OUTPUT_MD"
echo "Media directory: $MEDIA_DIR"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment