JKirchartz/convert.sh

## convert.sh
#! /bin/sh
#
# convert.sh
#
# Copyleft (ↄ) 2015 jkirchartz <jkirchartz@gmail.com>
#
# Distributed under terms of the NPL (Necessary Public License) license.
#

# Strip the Project Gutenberg boilerplate from every text in ./sources and
# write the result to ./corpora/<Title>.txt, where <Title> is taken from the
# first "Title:" header line of the source file.
mkdir -p ./corpora
for file in ./sources/*.txt; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -f "$file" ] || continue
  # Keep only the first "Title:" header, drop the CR left over from CRLF line
  # endings, and replace spaces and slashes so the title is a single path
  # component.
  title=$(grep '^Title:' "$file" | head -n 1 | cut -d' ' -f2- | tr -d '\r' | tr ' /' '__')
  # No usable Title: header: fall back to the source file's own name.
  [ -n "$title" ] || title=$(basename "$file" .txt)
  stripgutenberg.pl < "$file" > "./corpora/$title.txt"
done

## download.sh
#! /bin/sh
#
# download.sh
# download everything in ./doyle.list
# Copyleft (ↄ) 2015 jkirchartz <jkirchartz@gmail.com>
#
# Distributed under terms of the NPL (Necessary Public License) license.
#
# Project Gutenberg doesn't really like this, so you only have 1 shot before you get 403'd
#

# Fetch every URL listed in doyle.list and store the files under ./sources.
wget --input-file=doyle.list --directory-prefix=./sources

## doyle.list
http://www.gutenberg.org/cache/epub/1661/pg1661.txt
http://www.gutenberg.org/cache/epub/2852/pg2852.txt
http://www.gutenberg.org/cache/epub/244/pg244.txt
http://www.gutenberg.org/cache/epub/2097/pg2097.txt
http://www.gutenberg.org/cache/epub/108/pg108.txt
http://www.gutenberg.org/cache/epub/834/pg834.txt
http://www.gutenberg.org/cache/epub/3289/pg3289.txt
http://www.gutenberg.org/cache/epub/2350/pg2350.txt
http://www.gutenberg.org/cache/epub/2344/pg2344.txt
http://www.gutenberg.org/cache/epub/2347/pg2347.txt
http://www.gutenberg.org/cache/epub/2343/pg2343.txt
http://www.gutenberg.org/cache/epub/2346/pg2346.txt
http://www.gutenberg.org/cache/epub/2349/pg2349.txt
http://www.gutenberg.org/cache/epub/2345/pg2345.txt
http://www.gutenberg.org/cache/epub/903/pg903.txt
http://www.gutenberg.org/cache/epub/126/pg126.txt
http://www.gutenberg.org/cache/epub/139/pg139.txt

## stripgutenberg.pl
#!/usr/bin/perl

# stripgutenberg.pl < in.txt > out.txt
#
# designed for piping
# Written by Andrew Dunbar (hippietrail), released into the public domain, Dec 2010

use strict;
use warnings;

# Copies a Project Gutenberg (or Project Gutenberg of Australia) etext from
# STDIN / the files named in @ARGV to STDOUT, leaving out the header and
# footer boilerplate.  The loop below is a line-driven state machine: $state
# names the part of the file currently being read and $print says whether the
# current line is echoed.  Unrecognised input at the top of the file is fatal.

my $debug = 0;    # set to 1 to trace state changes and skipped lines on STDOUT

my $state   = 'beginning';
my $print   = 0;    # true while inside the etext body
my $printed = 0;    # number of lines written so far

# Test for definedness, not truth: a final line consisting of "0" with no
# newline is false in Perl and must not be mistaken for end of input.
while (defined(my $raw = <>)) {

    # strip UTF-8 BOM
    if ($. == 1 && index($raw, "\xef\xbb\xbf") == 0) {
        $raw = substr($raw, 3);
    }

    # Classify a copy with the line terminator removed so that /^$/ and the
    # other $-anchored patterns also match on CRLF input ("\r\n" does not
    # match /^$/).  The untouched $raw is what gets printed.
    (my $line = $raw) =~ s/[\r\n]+\z//;

    if ($state eq 'beginning') {
        if ($line =~ /^(The Project Gutenberg [Ee]Book( of|,)|Project Gutenberg's )/) {
            $state = 'normal pg header';
            $debug && print "state: beginning -> normal pg header\n";
            $print = 0;
        } elsif ($line =~ /^$/) {
            $state = 'beginning blanks';
            $debug && print "state: beginning -> beginning blanks\n";
        } else {
            die "unrecognized beginning: $raw";
        }
    } elsif ($state eq 'normal pg header') {
        if ($line =~ /^\*\*\*\ ?START OF TH(IS|E) PROJECT GUTENBERG EBOOK,? /) {
            $state = 'end of normal header';
            $debug && print "state: normal pg header -> end of normal pg header\n";
        } else {
            # body of normal pg header
        }
    } elsif ($state eq 'end of normal header') {
        if ($line =~ /^(Produced by|Transcribed from)/) {
            $state = 'post header';
            $debug && print "state: end of normal pg header -> post header\n";
        } elsif ($line =~ /^$/) {
            # blank lines
        } else {
            $state = 'etext body';
            $debug && print "state: end of normal header -> etext body\n";
            $print = 1;
        }
    } elsif ($state eq 'post header') {
        if ($line =~ /^$/) {
            $state = 'blanks after post header';
            $debug && print "state: post header -> blanks after post header\n";
        } else {
            # multiline Produced / Transcribed
        }
    } elsif ($state eq 'blanks after post header') {
        if ($line =~ /^$/) {
            # more blank lines
        } else {
            $state = 'etext body';
            $debug && print "state: blanks after post header -> etext body\n";
            $print = 1;
        }
    } elsif ($state eq 'beginning blanks') {
        if ($line =~ /<!-- #INCLUDE virtual=\"\/include\/ga-books-texth\.html\" -->/) {
            $state = 'header include';
            $debug && print "state: beginning blanks -> header include\n";
        } elsif ($line =~ /^Title: /) {
            $state = 'aus header';
            $debug && print "state: beginning blanks -> aus header\n";
        } elsif ($line =~ /^$/) {
            # more blanks
        } else {
            die "unexpected stuff after beginning blanks: $raw";
        }
    } elsif ($state eq 'header include') {
        if ($line =~ /^$/) {
            # blanks after header include
        } else {
            $state = 'aus header';
            $debug && print "state: header include -> aus header\n";
        }
    } elsif ($state eq 'aus header') {
        # Either of these two lines marks the end of the Australian header.
        if ($line =~ /^To contact Project Gutenberg of Australia go to http:\/\/gutenberg\.net\.au$/) {
            $state = 'end of aus header';
            $debug && print "state: aus header -> end of aus header\n";
        } elsif ($line =~ /^A Project Gutenberg of Australia eBook$/) {
            $state = 'end of aus header';
            $debug && print "state: aus header -> end of aus header\n";
        }
    } elsif ($state eq 'end of aus header') {
        if ($line =~ /^((Title|Author): .*)?$/) {
            # title, author, or blank line
        } else {
            $state = 'etext body';
            $debug && print "state: end of aus header -> etext body\n";
            $print = 1;
        }
    } elsif ($state eq 'etext body') {
        # here's the stuff
        if ($line =~ /^<!-- #INCLUDE virtual="\/include\/ga-books-textf\.html" -->$/) {
            $state = 'footer';
            $debug && print "state: etext body -> footer\n";
            $print = 0;
        } elsif ($line =~ /^(\*\*\* ?)?end of (the )?project/i) {
            $state = 'footer';
            $debug && print "state: etext body -> footer\n";
            $print = 0;
        }
    } elsif ($state eq 'footer') {
        # nothing more of interest
    } else {
        die "unknown state '$state'";
    }

    if ($print) {
        print $raw;
        ++$printed;
    } else {
        $debug && print "## $raw";
    }
}

# Nothing printed means the state machine never reached 'etext body' (or the
# input was empty); report it on STDERR instead of silently producing an
# empty file.
warn "stripgutenberg: no etext body found, nothing printed\n" unless $printed;
	#! /bin/sh
	#
	# convert.sh
	#
	# Copyleft (ↄ) 2015 jkirchartz <jkirchartz@gmail.com>
	#
	# Distributed under terms of the NPL (Necessary Public License) license.
	#

	# Strip the Project Gutenberg boilerplate from every text in ./sources and
	# write the result to ./corpora/<Title>.txt, where <Title> is taken from the
	# first "Title:" header line of the source file.
	mkdir -p ./corpora
	for file in ./sources/*.txt; do
	  # An unmatched glob is left as the literal pattern; skip it.
	  [ -f "$file" ] || continue
	  # Keep only the first "Title:" header, drop the CR left over from CRLF line
	  # endings, and replace spaces and slashes so the title is a single path
	  # component.  The pipes must be unescaped: a backslash-escaped pipe is just
	  # another argument to grep.
	  title=$(grep '^Title:' "$file" | head -n 1 | cut -d' ' -f2- | tr -d '\r' | tr ' /' '__')
	  # No usable Title: header: fall back to the source file's own name.
	  [ -n "$title" ] || title=$(basename "$file" .txt)
	  stripgutenberg.pl < "$file" > "./corpora/$title.txt"
	done
	#! /bin/sh
	#
	# download.sh
	# download everything in ./doyle.list
	# Copyleft (ↄ) 2015 jkirchartz <jkirchartz@gmail.com>
	#
	# Distributed under terms of the NPL (Necessary Public License) license.
	#
	# Project Gutenberg doesn't really like this, so you only have 1 shot before you get 403'd
	#

	# Fetch every URL listed in doyle.list and store the files under ./sources.
	wget --input-file=doyle.list --directory-prefix=./sources
	http://www.gutenberg.org/cache/epub/1661/pg1661.txt
	http://www.gutenberg.org/cache/epub/2852/pg2852.txt
	http://www.gutenberg.org/cache/epub/244/pg244.txt
	http://www.gutenberg.org/cache/epub/2097/pg2097.txt
	http://www.gutenberg.org/cache/epub/108/pg108.txt
	http://www.gutenberg.org/cache/epub/834/pg834.txt
	http://www.gutenberg.org/cache/epub/3289/pg3289.txt
	http://www.gutenberg.org/cache/epub/2350/pg2350.txt
	http://www.gutenberg.org/cache/epub/2344/pg2344.txt
	http://www.gutenberg.org/cache/epub/2347/pg2347.txt
	http://www.gutenberg.org/cache/epub/2343/pg2343.txt
	http://www.gutenberg.org/cache/epub/2346/pg2346.txt
	http://www.gutenberg.org/cache/epub/2349/pg2349.txt
	http://www.gutenberg.org/cache/epub/2345/pg2345.txt
	http://www.gutenberg.org/cache/epub/903/pg903.txt
	http://www.gutenberg.org/cache/epub/126/pg126.txt
	http://www.gutenberg.org/cache/epub/139/pg139.txt
	#!/usr/bin/perl

	# stripgutenberg.pl < in.txt > out.txt
	#
	# designed for piping
	# Written by Andrew Dunbar (hippietrail), released into the public domain, Dec 2010

	use strict;
	use warnings;

	# Copies a Project Gutenberg (or Project Gutenberg of Australia) etext from
	# STDIN / the files named in @ARGV to STDOUT, leaving out the header and
	# footer boilerplate.  The loop below is a line-driven state machine: $state
	# names the part of the file currently being read and $print says whether the
	# current line is echoed.  Unrecognised input at the top of the file is fatal.

	my $debug = 0;    # set to 1 to trace state changes and skipped lines on STDOUT

	my $state   = 'beginning';
	my $print   = 0;    # true while inside the etext body
	my $printed = 0;    # number of lines written so far

	# Test for definedness, not truth: a final line consisting of "0" with no
	# newline is false in Perl and must not be mistaken for end of input.
	while (defined(my $raw = <>)) {

	    # strip UTF-8 BOM
	    if ($. == 1 && index($raw, "\xef\xbb\xbf") == 0) {
	        $raw = substr($raw, 3);
	    }

	    # Classify a copy with the line terminator removed so that /^$/ and the
	    # other $-anchored patterns also match on CRLF input ("\r\n" does not
	    # match /^$/).  The untouched $raw is what gets printed.
	    (my $line = $raw) =~ s/[\r\n]+\z//;

	    if ($state eq 'beginning') {
	        if ($line =~ /^(The Project Gutenberg [Ee]Book( of|,)|Project Gutenberg's )/) {
	            $state = 'normal pg header';
	            $debug && print "state: beginning -> normal pg header\n";
	            $print = 0;
	        } elsif ($line =~ /^$/) {
	            $state = 'beginning blanks';
	            $debug && print "state: beginning -> beginning blanks\n";
	        } else {
	            die "unrecognized beginning: $raw";
	        }
	    } elsif ($state eq 'normal pg header') {
	        if ($line =~ /^\*\*\*\ ?START OF TH(IS|E) PROJECT GUTENBERG EBOOK,? /) {
	            $state = 'end of normal header';
	            $debug && print "state: normal pg header -> end of normal pg header\n";
	        } else {
	            # body of normal pg header
	        }
	    } elsif ($state eq 'end of normal header') {
	        if ($line =~ /^(Produced by|Transcribed from)/) {
	            $state = 'post header';
	            $debug && print "state: end of normal pg header -> post header\n";
	        } elsif ($line =~ /^$/) {
	            # blank lines
	        } else {
	            $state = 'etext body';
	            $debug && print "state: end of normal header -> etext body\n";
	            $print = 1;
	        }
	    } elsif ($state eq 'post header') {
	        if ($line =~ /^$/) {
	            $state = 'blanks after post header';
	            $debug && print "state: post header -> blanks after post header\n";
	        } else {
	            # multiline Produced / Transcribed
	        }
	    } elsif ($state eq 'blanks after post header') {
	        if ($line =~ /^$/) {
	            # more blank lines
	        } else {
	            $state = 'etext body';
	            $debug && print "state: blanks after post header -> etext body\n";
	            $print = 1;
	        }
	    } elsif ($state eq 'beginning blanks') {
	        if ($line =~ /<!-- #INCLUDE virtual=\"\/include\/ga-books-texth\.html\" -->/) {
	            $state = 'header include';
	            $debug && print "state: beginning blanks -> header include\n";
	        } elsif ($line =~ /^Title: /) {
	            $state = 'aus header';
	            $debug && print "state: beginning blanks -> aus header\n";
	        } elsif ($line =~ /^$/) {
	            # more blanks
	        } else {
	            die "unexpected stuff after beginning blanks: $raw";
	        }
	    } elsif ($state eq 'header include') {
	        if ($line =~ /^$/) {
	            # blanks after header include
	        } else {
	            $state = 'aus header';
	            $debug && print "state: header include -> aus header\n";
	        }
	    } elsif ($state eq 'aus header') {
	        # Either of these two lines marks the end of the Australian header.
	        if ($line =~ /^To contact Project Gutenberg of Australia go to http:\/\/gutenberg\.net\.au$/) {
	            $state = 'end of aus header';
	            $debug && print "state: aus header -> end of aus header\n";
	        } elsif ($line =~ /^A Project Gutenberg of Australia eBook$/) {
	            $state = 'end of aus header';
	            $debug && print "state: aus header -> end of aus header\n";
	        }
	    } elsif ($state eq 'end of aus header') {
	        if ($line =~ /^((Title|Author): .*)?$/) {
	            # title, author, or blank line
	        } else {
	            $state = 'etext body';
	            $debug && print "state: end of aus header -> etext body\n";
	            $print = 1;
	        }
	    } elsif ($state eq 'etext body') {
	        # here's the stuff
	        if ($line =~ /^<!-- #INCLUDE virtual="\/include\/ga-books-textf\.html" -->$/) {
	            $state = 'footer';
	            $debug && print "state: etext body -> footer\n";
	            $print = 0;
	        } elsif ($line =~ /^(\*\*\* ?)?end of (the )?project/i) {
	            $state = 'footer';
	            $debug && print "state: etext body -> footer\n";
	            $print = 0;
	        }
	    } elsif ($state eq 'footer') {
	        # nothing more of interest
	    } else {
	        die "unknown state '$state'";
	    }

	    if ($print) {
	        print $raw;
	        ++$printed;
	    } else {
	        $debug && print "## $raw";
	    }
	}

	# Nothing printed means the state machine never reached 'etext body' (or the
	# input was empty); report it on STDERR instead of silently producing an
	# empty file.
	warn "stripgutenberg: no etext body found, nothing printed\n" unless $printed;