Created
May 30, 2020 02:01
-
-
Save acharles7/feb2c7253fec7275340580a222d7fa1f to your computer and use it in GitHub Desktop.
PG-19 dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"citation": "@article{raecompressive2019,\nauthor = {Rae, Jack W and Potapenko, Anna and Jayakumar, Siddhant M and\n Hillier, Chloe and Lillicrap, Timothy P},\ntitle = {Compressive Transformers for Long-Range Sequence Modelling},\njournal = {arXiv preprint},\nurl = {https://arxiv.org/abs/1911.05507},\nyear = {2019},\n}", | |
"description": "This dataset contains the PG-19 language modeling benchmark. It includes a set\nof books extracted from the Project Gutenberg books project\n(https://www.gutenberg.org), that were published before 1919. It also contains\nmetadata of book titles and publication dates.\nPG-19 is over double the size of the Billion Word benchmark and contains\ndocuments that are 20X longer, on average, than the WikiText long-range\nlanguage modelling benchmark.\n\nBooks are partitioned into a train, validation, and test set. Books metadata is\nstored in metadata.csv which contains\n(book_id, short_book_title, publication_date, book_link).", | |
"location": { | |
"urls": [ | |
"https://github.com/deepmind/pg19" | |
] | |
}, | |
"name": "pg19", | |
"splits": [ | |
{ | |
"name": "test", | |
"numBytes": "41308494", | |
"shardLengths": [ | |
"100" | |
] | |
}, | |
{ | |
"name": "validation", | |
"numBytes": "17742608", | |
"shardLengths": [ | |
"50" | |
] | |
} | |
], | |
"version": "0.1.0" | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment