SydLambert/README.md

## README.md

      
    Raw
  

              README.md
            
          
    typeracer-text-scraper.pl

A Perl program to scrape all texts from Typeracer. It relies on quick regular expressions, so there is no guarantee it will keep working if Typeracer changes its page markup.
Installation

This program requires the following non-core Perl modules:

LWP::Simple
HTML::Entities
Mozilla::CA
JSON

Install them with:
cpan LWP::Simple HTML::Entities Mozilla::CA JSON
Usage

chmod +x typeracer-text-scraper.pl
./typeracer-text-scraper.pl
Output

Typeracer data will be written to the file data.json in JSON format.
Output Example

[
    {
        "id": 4180447,
        "text": "Don't it make you sad to know that life is more than who you are?",
        "amazonQuery": "B000002MUE",
        "source": "Name",
        "type": "song",
        "author": "The Goo Goo Dolls",
        "avgSpeed": 154,
        "avgAccuracy": 98.7
    },
    {
        "id": 4180545,
        "text": "You must take life the way it comes at you and make the best of it.",
        "amazonQuery": "0156027321",
        "source": "Life of Pi",
        "type": "book",
        "author": "Yann Martel",
        "avgSpeed": 130,
        "avgAccuracy": 98.1
    },
    {
        "id": 4180144,
        "text": "I'm sure there are things you know that you don't even know you know.",
        "amazonQuery": "1442423692",
        "source": "UnSouled",
        "type": "book",
        "author": "Neal Shusterman",
        "avgSpeed": 144,
        "avgAccuracy": 98
    },
    {
        "id": 3810446,
        "text": "They don't know that we know they know we know.",
        "amazonQuery": "B000H6SXMY",
        "source": "Friends",
        "type": "other",
        "author": "David Crane and Marta Kauffman",
        "avgSpeed": 158,
        "avgAccuracy": 98.1
    },
    {
        "id": 3550533,
        "text": "Perhaps if you know you are insane then you are not insane.",
        "amazonQuery": "0547572484",
        "source": "The Man in the High Castle",
        "type": "book",
        "author": "Philip K. Dick",
        "avgSpeed": 145,
        "avgAccuracy": 98.2
    },

    ...

]

  
## typeracer-text-scraper.pl
#!/usr/bin/env perl

use warnings;
use strict;
use LWP::Simple;
use HTML::Entities;
use JSON;

# Scrape every Typeracer text listed on typeracerdata.com and write the
# records to data.json as a single JSON array.

# Unbuffer STDOUT so the "\r" progress line is repainted immediately.
$| = 1;

my $output_file = "data.json";
my $list_url    = "http://typeracerdata.com/texts";

print "Downloading texts list...\n";

# LWP::Simple::get returns undef on failure. Check before touching the
# output file so a failed download does not wipe an existing data.json.
my $list_page = get($list_url);
defined $list_page
    or die "Could not download texts list from $list_url\n";

# Every link of the form text?id=NNN" on the index page names one text.
my @texts = $list_page =~ /text\?id=(\d+)"/g;

# '>' creates the file or truncates it to zero length in one step.
open(my $out, '>', $output_file) or die "Cannot open $output_file: $!";
print {$out} "[";

# Set once the first record has been written. The separator is emitted
# *before* each later record, so skipped texts (including the last one)
# can never leave a dangling comma in the JSON array.
my $wrote_record = 0;

for my $i (0 .. $#texts) {
    print "\rScraping ".($i)." / $#texts...";

    my $info = get("https://data.typeracer.com/pit/text_info?id=".$texts[$i]);

    unless (defined $info) {
        print "\nCould not download $texts[$i]\n";
        next;
    }

    my (
        $text,
        $amazonQuery,
        $source,
        $type,
        $author,
        $avgSpeed,
        $avgAccuracy
    ) = $info =~ /
        .*?fullTextStr">(.+?)<\/div> # Full text
        .*?keywords=(.+?)&           # Amazon query string
        .*?">(.+?)<\/a>              # Source of text
        .*?\/>\((.+?)\)              # Type of source
        \s*?by\s(.+?)[^\w .]         # Author of source
        .*?%;">(\d+)\s               # Average WPM
        .*?<td>([\d.]+)              # Average accuracy
    /msx;

    # A failed match returns the empty list, leaving every capture undef.
    if (defined $text) {
        print {$out} "," if $wrote_record;

        # Multiplying by 1 numifies the captured strings so that they are
        # serialised as JSON numbers rather than quoted strings.
        print {$out} encode_json({
            id          => $texts[$i]*1,
            text        => decode_entities($text),
            amazonQuery => $amazonQuery,
            source      => decode_entities($source),
            type        => $type,
            author      => decode_entities($author),
            avgSpeed    => $avgSpeed*1,
            avgAccuracy => $avgAccuracy*1
        });

        $wrote_record = 1;
    } else {
        print "\nCould not parse $texts[$i]\n";
    }
}

print {$out} "]";

# Buffered write errors only surface at close, so check it.
close($out) or die "Cannot close $output_file: $!";

print "\nDone.\n";
	#!/usr/bin/env perl

	use warnings;
	use strict;
	use LWP::Simple;
	use HTML::Entities;
	use JSON;

	# Scrape every Typeracer text listed on typeracerdata.com and write the
	# records to data.json as a single JSON array.

	# Unbuffer STDOUT so the "\r" progress line is repainted immediately.
	$| = 1;

	my $output_file = "data.json";
	my $list_url    = "http://typeracerdata.com/texts";

	print "Downloading texts list...\n";

	# LWP::Simple::get returns undef on failure. Check before touching the
	# output file so a failed download does not wipe an existing data.json.
	my $list_page = get($list_url);
	defined $list_page
	    or die "Could not download texts list from $list_url\n";

	# Every link of the form text?id=NNN" on the index page names one text.
	my @texts = $list_page =~ /text\?id=(\d+)"/g;

	# '>' creates the file or truncates it to zero length in one step.
	open(my $out, '>', $output_file) or die "Cannot open $output_file: $!";
	print {$out} "[";

	# Set once the first record has been written. The separator is emitted
	# *before* each later record, so skipped texts (including the last one)
	# can never leave a dangling comma in the JSON array.
	my $wrote_record = 0;

	for my $i (0 .. $#texts) {
	    print "\rScraping ".($i)." / $#texts...";

	    my $info = get("https://data.typeracer.com/pit/text_info?id=".$texts[$i]);

	    unless (defined $info) {
	        print "\nCould not download $texts[$i]\n";
	        next;
	    }

	    my (
	        $text,
	        $amazonQuery,
	        $source,
	        $type,
	        $author,
	        $avgSpeed,
	        $avgAccuracy
	    ) = $info =~ /
	        .*?fullTextStr">(.+?)<\/div> # Full text
	        .*?keywords=(.+?)& # Amazon query string
	        .*?">(.+?)<\/a> # Source of text
	        .*?\/>\((.+?)\) # Type of source
	        \s*?by\s(.+?)[^\w .] # Author of source
	        .*?%;">(\d+)\s # Average WPM
	        .*?<td>([\d.]+) # Average accuracy
	    /msx;

	    # A failed match returns the empty list, leaving every capture undef.
	    if (defined $text) {
	        print {$out} "," if $wrote_record;

	        # Multiplying by 1 numifies the captured strings so that they are
	        # serialised as JSON numbers rather than quoted strings.
	        print {$out} encode_json({
	            id => $texts[$i]*1,
	            text => decode_entities($text),
	            amazonQuery => $amazonQuery,
	            source => decode_entities($source),
	            type => $type,
	            author => decode_entities($author),
	            avgSpeed => $avgSpeed*1,
	            avgAccuracy => $avgAccuracy*1
	        });

	        $wrote_record = 1;
	    } else {
	        print "\nCould not parse $texts[$i]\n";
	    }
	}

	print {$out} "]";

	# Buffered write errors only surface at close, so check it.
	close($out) or die "Cannot close $output_file: $!";

	print "\nDone.\n";