Skip to content

Instantly share code, notes, and snippets.

@hideshi
Created March 24, 2015 11:24
Show Gist options
  • Save hideshi/50484e31f5e12045e223 to your computer and use it in GitHub Desktop.
Save hideshi/50484e31f5e12045e223 to your computer and use it in GitHub Desktop.
#!/usr/local/bin/gawk -f
# Opening <page> tag: switch collection on. Because this rule precedes the
# "flg == 1" rule, the record holding the tag itself is collected as well.
$0 ~ /<page>/ { flg = 1 }
# Closing </page> tag: tidy the text accumulated in s, emit it as a single
# output line, then reset the accumulator and switch collection off so the
# closing record itself is not collected.
/<\/page>/ {
    sub(/^\s+/, "", s)                # remove leading whitespace
    s = gensub(/\s+/, " ", "g", s)    # squeeze each whitespace run to one space
    print s
    s = ""
    flg = 0
}
# Runs for every record while collection is on (flg == 1). The record is
# cleaned in place: XML entities are decoded, dump metadata elements are
# dropped with their content, inline markup tags and some wiki syntax are
# stripped, and the text of <title> and <redirect title="..."/> is kept.
# The cleaned record is then appended to the accumulator s, which the
# </page> rule prints.
flg == 1 {
    # --- Decode XML entities -------------------------------------------
    gsub(/&lt;/, "<")
    gsub(/&gt;/, ">")
    gsub(/&quot;/, "\"")
    # In a gsub replacement a bare "&" stands for the matched text, which
    # would make this substitution a no-op; "\\&" yields a literal ampersand.
    gsub(/&amp;amp;/, "\\&")

    # --- Metadata elements: removed together with their content --------
    gsub(/<ns>.+<\/ns>/, "")
    gsub(/<ip>.+<\/ip>/, "")
    gsub(/<id>.+<\/id>/, "")
    gsub(/<parentid>.+<\/parentid>/, "")
    gsub(/<restrictions>.+<\/restrictions>/, "")
    gsub(/<timestamp>.+<\/timestamp>/, "")
    gsub(/<username>.+<\/username>/, "")
    gsub(/<comment>.+<\/comment>/, "")
    gsub(/<model>.+<\/model>/, "")
    gsub(/<format>.+<\/format>/, "")
    gsub(/<sha1>.+<\/sha1>/, "")

    # --- Inline tags: only the tag is removed, its content is kept -----
    # [^>]* keeps each match inside a single tag. A greedy ".*" here would
    # swallow everything from the first "<ref"/"<span" up to the last ">"
    # on the line, including ordinary text between two tags.
    gsub(/<ref[^>]*\/>/, "")
    gsub(/<ref[^>]*>/, "")
    gsub(/<\/ref>/, "")
    gsub(/<span[^>]*>/, "")
    gsub(/<\/span>/, "")
    gsub(/<br\s*\/?>/, "")
    gsub(/<\/?(page|revision|contributor)>/, "")
    gsub(/<text xml:space="preserve">/, "")
    gsub(/<\/text>/, "")
    gsub(/<\/?(code|em|ul|li|small)>/, "")
    gsub(/<\/redirect>/, "")
    gsub(/<minor \/>/, "")

    # --- Wiki syntax -----------------------------------------------------
    gsub(/#REDIRECT( Wikipedia:)?/, "")
    gsub(/\{\{[^{]+\}\}/, "")      # {{...}} spans that contain no "{"
    gsub(/\[\[|\]\]/, "")          # link brackets; the link text stays
    gsub(/\s+/, " ")
    gsub(/[|]/, "")

    # --- Keep title text -------------------------------------------------
    # These must run before the punctuation strip below: that strip removes
    # '=' and '"', after which the redirect pattern could never match.
    # gensub() without a target operates on $0; 1 = replace the first match.
    $0 = gensub(/<title>(.+)<\/title>/, "\\1", 1)
    $0 = gensub(/<redirect title="(.+)"\s*\/>/, "\\1", 1)

    # Strip leftover markup punctuation (list/bold/heading/quote characters).
    gsub(/[*='"]+/, "")

    # Join with a space so the last word of one input line is not fused with
    # the first word of the next; the </page> rule squeezes repeated spaces
    # and trims the leading one.
    s = s " " $0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment