Last active
February 25, 2017 17:47
-
-
Save SimonSchubert/6f0715610cb37f4fda12 to your computer and use it in GitHub Desktop.
Loop over Linux man pages and save them to SQLite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Convert the gzipped man pages of selected sections to HTML with groff,
# clean the HTML up, and store one row per page
# (category, name, description, manpage) in the `commands` table of an
# SQLite database. The table is dropped and recreated on every run.

# Disable pathname expansion. The clean-up steps below deliberately use an
# unquoted `echo $manpage` to squeeze the page onto a single line; without
# this, a lone `*` in the page text would be replaced by file names.
set -f

OUT_DIR="/home/simon/manpages"
DATABASE="/home/simon/commands.db"

# Literal pieces used in the ${var//pattern/replacement} expansions below.
# Keeping them in variables avoids backslash escapes in the replacement text.
sq="'"
h2_open="<h2"
h2_close="</h2>"
font_open="<font color='#FFD9D9'>"
font_close="</font>"

sqlite3 "$DATABASE" "PRAGMA encoding='UTF-8';"
sqlite3 "$DATABASE" "drop table if exists commands;"
sqlite3 "$DATABASE" "create table commands (_id integer primary key, category numeric, name text, description text, manpage text);"

# path[n] is the directory below /usr/share/man, categories[n] is the value
# stored in the `category` column for pages found in that directory.
path=( "man1" "man2" "man8" "man6" )
categories=(1 2 8 6)

# The script itself writes nothing here; the directory is only created.
mkdir -p "$OUT_DIR"

for index in "${!path[@]}"; do
p="${path[$index]}"
# Look the category up by position in the array (not by its previous value).
category="${categories[$index]}"
# Skip the section if its directory is missing instead of scanning the
# directory we happen to be in.
cd "/usr/share/man/$p" || continue

# find prints bare file names (%f), one per line.
while IFS= read -r i; do
# "ls.1.gz" -> "ls": drop ".gz", then the section suffix, whatever its length.
title="${i%.gz}"
title="${title%.*}"

# Fetch man page as html
manpage="$(zcat "$i" | groff -mandoc -Thtml)"
# Replace double quotes with single quotes
manpage="${manpage//\"/$sq}"

# NOTE: the unquoted $manpage in the echo calls is intentional: word
# splitting joins the page into one line so the sed expressions can match
# across what used to be line breaks.
# NOTE(review): `sed -r` has no lazy quantifier, so the `.*?` in the comment
# and structure-list patterns matches greedily - confirm that is acceptable.

# Remove style
manpage="$(echo $manpage | sed -r 's/<style([^<]|<[^\/]|<\/[^s]|<\/s[^t])*<\/style>//g')"
# Remove header
manpage="$(echo $manpage | sed -r 's/<head([^<]|<[^\/]|<\/[^s]|<\/s[^t])*<\/head>//g')"
# Remove comments
manpage="$(echo $manpage | sed -r 's/<!--.*?-->//g')"
# Remove structure list
manpage="$(echo $manpage | sed -r 's/<h1 align.*?<br> <hr>//g')"

# Change header colors by wrapping every <h2>...</h2> in a <font> element
manpage="${manpage//$h2_open/$font_open$h2_open}"
manpage="${manpage//$h2_close/$h2_close$font_close}"

# Replace dash with minus
manpage="${manpage//—/-}"
manpage="${manpage//−/-}"

# Strip out description: the text between the first " - " and the next </p>
description="$(echo $manpage | awk -v FS="( - |</p>)" '{print $2}')"
# Remove html tags from description
description="$(echo $description | sed -r 's/<[^<]*>//g')"

echo "COMMAND: $title"

if [[ "$description" == "NAME" ]]; then
echo "failed"
# Fallback: take the text between the last " - " and a following </p>
description="$(echo $manpage | sed -n 's:.* - \([^<]*\)<\/p>.*:\1:p')"
# Remove html tags from description again
description="$(echo $description | sed -r 's/<[^<]*>//g')"
fi
description="${description//’/$sq}"

# Build the INSERT with single-quoted SQL string literals, doubling any
# embedded single quote. The statement is piped to sqlite3 on stdin rather
# than passed as an argument so large pages are not limited by the
# command-line argument size.
title_sql="${title//$sq/$sq$sq}"
description_sql="${description//$sq/$sq$sq}"
manpage_sql="${manpage//$sq/$sq$sq}"
printf '%s\n' "INSERT INTO commands (category, name, description, manpage) VALUES ('$category','$title_sql','$description_sql','$manpage_sql');" | sqlite3 "$DATABASE"
done < <(find . -name '*.gz' -type f -printf "%f\n")
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment