Skip to content

Instantly share code, notes, and snippets.

@mrpapercut
Last active April 6, 2023 08:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrpapercut/6c11e215b4b5b528ac0ff678dae1e297 to your computer and use it in GitHub Desktop.
Save mrpapercut/6c11e215b4b5b528ac0ff678dae1e297 to your computer and use it in GitHub Desktop.
T5K primelist parser
<?php
/**
 * Downloads the t5k.org "all primes" text list and parses it into two public
 * arrays: $primes (one record per list entry) and $proofcodes (one record per
 * line of the "KEY TO PROOF-CODES" section).
 */
class T5KParser {
    /** Number of leading lines (intro header) skipped before parsing starts. */
    private const HEADER_LINE_COUNT = 31;

    /** Prefix of the line that opens the proof-code section. */
    private const PROOFCODE_SECTION_TITLE = 'KEY TO PROOF-CODES';

    private $primelistURL = 'https://t5k.org/primes/lists/all.txt';
    private $primelistRaw;
    public $primes = [];
    public $proofcodes = [];

    /**
     * Downloads the list and fills $primes and $proofcodes.
     *
     * @throws Exception when the download fails or the downloaded body is empty
     */
    public function parse() {
        $this->getList();
        $this->parseList();
    }

    /**
     * Fetches the raw list over HTTP(S) and stores it in $primelistRaw.
     *
     * @throws Exception when curl reports a transfer failure
     */
    private function getList() {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $this->primelistURL,
            CURLOPT_RETURNTRANSFER => true,
        ]);
        $rawlist = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);

        // curl_exec() signals failure with false, not null.
        if ($rawlist === false) {
            throw new Exception('Could not download prime list: ' . $error);
        }

        $this->primelistRaw = $rawlist;
    }

    /**
     * Parses the previously stored raw list.
     *
     * @throws Exception when no usable raw list is available
     */
    private function parseList() {
        if (!is_string($this->primelistRaw) || $this->primelistRaw === '') {
            throw new Exception('Raw file is empty');
        }

        $this->parsePrimes();
        $this->parseProofcodes();
    }

    /**
     * Splits the raw list into lines and drops the intro header.
     *
     * @return string[] the lines following the header, unmodified
     */
    private function getBodyLines() {
        // \R matches any line-break sequence, so the result does not depend
        // on the PHP_EOL of the machine running the parser.
        $lines = preg_split('/\R/', $this->primelistRaw);

        return array_slice($lines, self::HEADER_LINE_COUNT);
    }

    /**
     * Collects one string per prime entry, joining continuation lines onto
     * the entry they belong to.
     *
     * @return string[] entry lines keyed by their running index (1-based)
     */
    private function getPrimesFromRawList() {
        $foundlines = [];
        $prime_idx = 1;

        foreach ($this->getBodyLines() as $rawline) {
            $line = trim($rawline);

            // The separator line ends the list of primes; what follows is
            // not prime data and must not be appended to the last entry.
            if (str_starts_with($line, '-----')) {
                break;
            }
            if ($line === '') {
                continue;
            }

            // A line that starts with the next expected index opens a new entry.
            if (str_starts_with($line, (string) $prime_idx)) {
                $foundlines[$prime_idx] = $line;
                $prime_idx++;
                continue;
            }

            // Anything else continues the previous entry, if there is one.
            $previous = $prime_idx - 1;
            if (!isset($foundlines[$previous])) {
                continue;
            }
            if (str_ends_with($foundlines[$previous], '\\')) {
                // A trailing backslash marks a value split across lines:
                // drop only that marker and join without a space.
                $foundlines[$previous] = substr($foundlines[$previous], 0, -1) . $line;
            } else {
                $foundlines[$previous] .= ' ' . $line;
            }
        }

        return $foundlines;
    }

    /**
     * Parses every entry line into a record and appends it to $primes.
     * Lines that match none of the patterns are reported and skipped.
     */
    private function parsePrimes() {
        $regexes = [
            // 99% of cases
            '/^(\d{1,4})([a-z])?\s+(\S*)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)/',
            // Cases with sum like "10000000000000000000...(34053 other digits)...00000000000000532669"
            '/^(\d{1,4})([a-z])?\s+(\d+\.\.\.[\(\)\w\s]+\.\.\.\d+)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)?/',
            // Case "5163e Ramanujan tau function at 199^4518 ECPP 57125 E3 2022 ECPP"
            '/^(\d{1,4})([a-z])?\s+([\w\s]+[\d\^]+[\sA-Za-z]+)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)?/',
        ];

        foreach ($this->getPrimesFromRawList() as $line) {
            $matches = [];
            foreach ($regexes as $regex) {
                if (preg_match($regex, $line, $matches) === 1) {
                    break;
                }
            }

            if (count($matches) <= 1) {
                error_log("Could not parse line '" . $line . "'");
                continue;
            }

            $sum = trim($matches[3]);

            $this->primes[] = [
                'rank' => intval($matches[1]),
                'ranknote' => $matches[2],
                'sum' => $sum,
                'digitlength' => intval($matches[4]),
                'proofcode' => $matches[5],
                'year' => intval($matches[6]),
                'comment' => trim($matches[7] ?? ''),
                'checksum' => dechex(crc32($sum)),
            ];
        }
    }

    /**
     * Parses the lines after the "KEY TO PROOF-CODES" title into
     * ['proofcode' => ..., 'provers' => ...] records appended to $proofcodes.
     */
    private function parseProofcodes() {
        $proofcodeRegex = '/^([A-Za-z0-9]+)\s+(.*)$/';
        $sectionStarted = false;

        foreach ($this->getBodyLines() as $line) {
            if ($sectionStarted) {
                if (preg_match($proofcodeRegex, $line, $matches) === 1 && $matches[1] !== 'KEY') {
                    $this->proofcodes[] = [
                        'proofcode' => $matches[1],
                        'provers' => $matches[2],
                    ];
                }
                continue;
            }

            // Haystack first, needle second: the line must start with the title.
            if (str_starts_with($line, self::PROOFCODE_SECTION_TITLE)) {
                $sectionStarted = true;
            }
        }
    }
}
// Build the parser and immediately download + parse the list.
$t5kparser = new T5KParser;
$t5kparser->parse();
// After parse() returns, the results are available as public properties:
//   Primes:     $t5kparser->primes
//   Proofcodes: $t5kparser->proofcodes
// npm install --save axios crc-32
const axios = require('axios');
const crc32 = require('crc-32');
/**
 * Downloads the t5k.org "all primes" text list and parses it into prime
 * records (`primes`) and proof-code records (`proofcodes`).
 */
class T5KParser {
  /**
   * @param {object} [options]
   * @param {string} [options.primelistURL] - URL of the raw text list.
   * @param {number} [options.headerLineCount=31] - Number of leading lines
   *   (intro header) skipped before parsing starts.
   */
  constructor({
    primelistURL = 'https://t5k.org/primes/lists/all.txt',
    headerLineCount = 31,
  } = {}) {
    this.primelistURL = primelistURL;
    this.headerLineCount = headerLineCount;
    this.proofcodes = [];
    this.primes = [];
  }

  /**
   * Computes the checksum of a prime description.
   *
   * @param {string} inputStr - Text to checksum (encoded as 'binary' bytes).
   * @returns {string} Eight lowercase hex digits.
   */
  getChecksum(inputStr) {
    // NOTE(review): this seed value equals the standard CRC-32 polynomial
    // constant; confirm that a non-default seed is really intended.
    const seed = 0x04C11DB7;
    const checksum = crc32.buf(Buffer.from(inputStr, 'binary'), seed);

    // `>>> 0` reinterprets the signed 32-bit result as unsigned, so the hex
    // string is the big-endian byte sequence of the checksum.
    return (checksum >>> 0).toString(16).padStart(8, '0');
  }

  /**
   * Extracts the proof-code table that follows the "KEY TO PROOF-CODES" line.
   *
   * @param {string} filecontents - Full text of the list.
   * @returns {{proofcode: string, provers: string}[]}
   */
  getProofcodesFromList(filecontents) {
    const proofcodeLineRegex = /^([A-Za-z0-9]+)\s+(.*)$/;
    const lines = filecontents.split('\n').map((l) => l.trim());
    const proofcodes = [];
    let sectionStarted = false;

    for (const line of lines.slice(this.headerLineCount)) {
      if (line.startsWith('KEY TO PROOF-CODES')) {
        sectionStarted = true;
        continue;
      }
      if (!sectionStarted) {
        continue;
      }

      const match = proofcodeLineRegex.exec(line);
      if (match) {
        proofcodes.push({ proofcode: match[1], provers: match[2] });
      }
    }

    return proofcodes;
  }

  /**
   * Collects one string per prime entry, joining continuation lines onto the
   * entry they belong to. Reading stops at the first line starting with
   * "-----" after the header.
   *
   * @param {string} filecontents - Full text of the list.
   * @returns {string[]} One (joined) line per entry, in file order.
   */
  getPrimesFromList(filecontents) {
    const lines = filecontents.split('\n').map((l) => l.trim());
    const entries = [];
    let primeIdx = 1;

    for (const line of lines.slice(this.headerLineCount)) {
      if (line.startsWith('-----')) {
        break;
      }
      if (line === '') {
        continue;
      }

      // A line that starts with the next expected index opens a new entry.
      if (line.startsWith(String(primeIdx))) {
        entries.push(line);
        primeIdx++;
        continue;
      }

      // Anything else continues the previous entry; text that appears before
      // the first entry has nothing to attach to and is ignored.
      if (entries.length === 0) {
        continue;
      }
      const last = entries.length - 1;
      entries[last] = entries[last].endsWith('\\')
        ? entries[last].slice(0, -1) + line // value split mid-token: no space
        : `${entries[last]} ${line}`;
    }

    return entries;
  }

  /**
   * Downloads the list and populates `this.proofcodes` and `this.primes`.
   *
   * @returns {Promise<void>}
   */
  async parse() {
    const primelist = await axios.get(this.primelistURL);

    // Proof-codes first: parsePrimes() resolves each prime's provers from them.
    this.proofcodes = this.parseProofcodes(primelist.data);
    this.primes = this.parsePrimes(primelist.data);
  }

  /**
   * @param {string} filecontents - Full text of the list.
   * @returns {{proofcode: string, provers: string}[]}
   */
  parseProofcodes(filecontents) {
    return this.getProofcodesFromList(filecontents);
  }

  /**
   * Parses every prime entry into a record.
   *
   * Provers are looked up in `this.proofcodes`; when that is still empty
   * (parsePrimes called without parse()), the proof-codes are extracted from
   * `filecontents` instead. An unknown proof-code yields `provers: null`.
   * Entries matching none of the patterns are reported and skipped.
   *
   * @param {string} filecontents - Full text of the list.
   * @returns {object[]} Records with rank, rankNote, sum, digitlength,
   *   proofcode, year, comment, provers and checksum.
   */
  parsePrimes(filecontents) {
    const primeRegexes = [
      // 99% of cases
      /^(\d{1,4})([a-z])?\s+(\S*)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)/,
      // Cases with sum like "10000000000000000000...(34053 other digits)...00000000000000532669"
      /^(\d{1,4})([a-z])?\s+(\d+\.\.\.[\(\)\w\s]+\.\.\.\d+)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)?/,
      // Case "5163e Ramanujan tau function at 199^4518 ECPP 57125 E3 2022 ECPP"
      /^(\d{1,4})([a-z])?\s+([\w\s]+[\d\^]+[\sA-Za-z]+)\s+(\d+)\s([A-Za-z0-9]+)\s+(\d+)(.*)?/,
    ];

    const knownProofcodes =
      Array.isArray(this.proofcodes) && this.proofcodes.length > 0
        ? this.proofcodes
        : this.getProofcodesFromList(filecontents);

    // Index once instead of scanning the array for every prime; the first
    // occurrence of a code wins.
    const proversByCode = new Map();
    for (const { proofcode, provers } of knownProofcodes) {
      if (!proversByCode.has(proofcode)) {
        proversByCode.set(proofcode, provers);
      }
    }

    const parsedList = [];
    for (const line of this.getPrimesFromList(filecontents)) {
      let match = null;
      for (const regex of primeRegexes) {
        match = regex.exec(line);
        if (match) {
          break;
        }
      }

      if (!match) {
        console.warn(`Could not parse line '${line}'`);
        continue;
      }

      const [, rank, rankNote, rawSum, digitlength, proofcode, year, comment] = match;
      const sum = rawSum.trim();

      parsedList.push({
        rank: Number.parseInt(rank, 10),
        rankNote,
        sum,
        digitlength: Number.parseInt(digitlength, 10),
        proofcode,
        year: Number.parseInt(year, 10),
        comment: comment ? comment.trim() : '',
        provers: proversByCode.has(proofcode) ? proversByCode.get(proofcode) : null,
        checksum: this.getChecksum(sum),
      });
    }

    return parsedList;
  }
}
const t5kparser = new T5KParser();
// Top-level await is unavailable in a CommonJS script, hence the async IIFE.
(async () => {
  try {
    await t5kparser.parse();
    /*
    Primes: t5kparser.primes
    Proofcodes: t5kparser.proofcodes
    */
  } catch (err) {
    // Surface download/parse failures instead of leaving an unhandled rejection.
    console.error('Failed to fetch or parse the T5K prime list:', err);
    process.exitCode = 1;
  }
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment