Skip to content

Instantly share code, notes, and snippets.

@SimonSchubert
Last active February 25, 2017 17:47
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SimonSchubert/6f0715610cb37f4fda12 to your computer and use it in GitHub Desktop.
Save SimonSchubert/6f0715610cb37f4fda12 to your computer and use it in GitHub Desktop.
Loop over Linux man pages and save them to SQLite
#!/bin/bash
#
# Import Linux man pages into an SQLite database.
#
# For every gzip-compressed page found in the selected man sections the
# script renders the page to HTML with groff, strips the parts that are not
# wanted (stylesheet, <head>, comments, table of contents), recolours the
# <h2> headings and stores the section number, command name, one-line
# description and HTML in the `commands` table. The table is dropped and
# recreated on every run.
#
# Requires: bash, GNU find, GNU sed, awk, zcat, groff, sqlite3.

OUT_DIR="/home/simon/manpages"
DATABASE="/home/simon/commands.db"
MAN_ROOT="/usr/share/man"

# Man sections to import. Pages are read from "$MAN_ROOT/man<N>" and <N> is
# stored as the row's category, so directory and category cannot get out of
# step with each other.
categories=(1 2 8 6)

# A single quote, kept in a variable so it can be used inside parameter
# expansions without version-dependent backslash escaping.
SQ="'"

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print $1 with every run of blanks, tabs and newlines collapsed to a single
# space (leading/trailing whitespace removed). The page must be one line so
# that the sed expressions below can match across what groff emitted as
# separate lines. Runs in a subshell so that the IFS and noglob changes do
# not leak; noglob keeps characters such as `*` in the page text from being
# expanded to file names.
flatten() (
  IFS=$' \t\n'
  set -f
  # shellcheck disable=SC2086  # word splitting is the point here
  set -- $1
  printf '%s\n' "$*"
)

# Print $1 as an SQL string literal: wrapped in single quotes, with embedded
# single quotes doubled.
sql_quote() {
  printf "'%s'" "${1//$SQ/$SQ$SQ}"
}

# Render the gzip-compressed man page file $1 to a single line of simplified
# HTML on stdout.
render_page() {
  local html
  html=$(zcat -- "$1" | groff -mandoc -Thtml)

  # Expressions, in order:
  #   1. double quotes -> single quotes
  #   2. drop the <style> element
  #   3. drop the <head> element
  #   4. drop HTML comments (bounded per comment; ERE has no lazy `.*?`)
  #   5. drop the table of contents; `.*` is greedy, so this removes
  #      everything from the <h1> up to the LAST "<br> <hr>" on the line
  #   6-7. wrap <h2> headings in a coloured <font> element
  #   8-9. replace dash entities with a plain minus
  flatten "$html" | sed -r \
    -e "s/\"/'/g" \
    -e 's/<style([^<]|<[^\/]|<\/[^s]|<\/s[^t])*<\/style>//g' \
    -e 's/<head([^<]|<[^\/]|<\/[^s]|<\/s[^t])*<\/head>//g' \
    -e 's/<!--([^-]|-[^-]|--[^>])*-->//g' \
    -e 's/<h1 align.*<br> <hr>//g' \
    -e "s/<h2/<font color='#FFD9D9'><h2/g" \
    -e 's/<\/h2>/<\/h2><\/font>/g' \
    -e 's/&mdash;/-/g' \
    -e 's/&minus;/-/g'
}

# Print the one-line description extracted from the rendered page $1.
# The description is the text between the first " - " (or "</p>") separator
# and the next one, with HTML tags removed. If that yields the literal
# heading text "NAME", "failed" is printed and a second pattern is tried:
# the text between the last " - " and its closing "</p>".
extract_description() {
  local page=$1 desc
  desc=$(flatten "$page" | awk -v FS="( - |</p>)" '{print $2}')
  desc=$(flatten "$desc" | sed -r 's/<[^<]*>//g')
  if [[ "$desc" == "NAME" ]]; then
    # Diagnostic only; sent to stderr so it is not captured as description.
    echo "failed" >&2
    desc=$(flatten "$page" | sed -n 's:.* - \([^<]*\)<\/p>.*:\1:p')
    desc=$(flatten "$desc" | sed -r 's/<[^<]*>//g')
  fi
  printf '%s\n' "${desc//&rsquo;/$SQ}"
}

main() {
  local category dir file base title manpage description

  # The output directory is created for compatibility with earlier runs of
  # this script; nothing below writes into it.
  mkdir -p -- "$OUT_DIR" \
    || printf 'warning: cannot create %s\n' "$OUT_DIR" >&2

  # One sqlite3 invocation so the PRAGMA applies to the same connection that
  # creates the schema.
  sqlite3 "$DATABASE" <<'SQL' || die "cannot initialise database $DATABASE"
PRAGMA encoding='UTF-8';
drop table if exists commands;
create table commands (_id integer primary key, category numeric, name text, description text, manpage text);
SQL

  for category in "${categories[@]}"; do
    dir="$MAN_ROOT/man$category"
    if [[ ! -d "$dir" ]]; then
      printf 'skipping missing section directory: %s\n' "$dir" >&2
      continue
    fi

    # NUL-delimited full paths: safe for unusual file names and for pages
    # located in subdirectories of the section.
    while IFS= read -r -d '' file; do
      # "ls.1.gz" -> "ls", "foo.1ssl.gz" -> "foo"
      base=${file##*/}
      base=${base%.gz}
      title=${base%.*}

      printf 'COMMAND: %s\n' "$title"

      manpage=$(render_page "$file")
      description=$(extract_description "$manpage")

      # The statement is fed on stdin rather than as an argument: a rendered
      # page can exceed the kernel's per-argument size limit. Values are
      # single-quoted SQL literals; double-quoted literals are rejected by
      # recent sqlite3 shells.
      printf 'INSERT INTO commands (category, name, description, manpage) VALUES (%s, %s, %s, %s);\n' \
        "$category" \
        "$(sql_quote "$title")" \
        "$(sql_quote "$description")" \
        "$(sql_quote "$manpage")" \
        | sqlite3 "$DATABASE" \
        || printf 'insert failed for %s\n' "$title" >&2
    done < <(find "$dir" -name '*.gz' -type f -print0)
  done
}

main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment