Skip to content

Instantly share code, notes, and snippets.

@anjackson
Created October 2, 2018 23:24
Show Gist options
  • Save anjackson/6f89e18e17765930b30bf33b742209f3 to your computer and use it in GitHub Desktop.
Save anjackson/6f89e18e17765930b30bf33b742209f3 to your computer and use it in GitHub Desktop.
Checking CC WARC
Processing record for url null from test file @0 Avail 503 length 503
WARC-Payload-Digest: null
WARC-Block-Digest: null
Length was: 503
SHA1 was: NRRSOBRU4N7OJZRFLQUJF7GFSJKV6GB4
Processing record for url http://003sh.ou-net.com/blog/?p=7548 from test file @482 Avail 278 length 278
WARC-Payload-Digest: null
WARC-Block-Digest: null
Length was: 278
SHA1 was: DVBEEOF4QRZRUSQADENDSU3AG7U7Y2TZ
Processing record for url http://003sh.ou-net.com/blog/?p=7548 from test file @922 Avail 106253 length 106253
WARC-Payload-Digest: sha1:RXO5ZZVAQRV37P6ZZUHD2WRBVPS4NRSP
WARC-Block-Digest: sha1:DAA7ZUHEZGVJAOWZFYHO6FQVAJV25E2T
Length was: 106253
SHA1 was: DAA7ZUHEZGVJAOWZFYHO6FQVAJV25E2T
Processing record for url http://003sh.ou-net.com/blog/?p=7548 from test file @17669 Avail 292 length 292
WARC-Payload-Digest: null
WARC-Block-Digest: null
Length was: 292
SHA1 was: 3ZOKODHDJ27TF7CAYGASXVVQSCNY34YI
/**
*
*/
package uk.bl.wa.analyser;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Iterator;
import org.apache.poi.util.IOUtils;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.util.Base32;
import uk.bl.wa.util.Normalisation;
/**
* @author Andrew Jackson <Andrew.Jackson@bl.uk>
*
*/
public class SimpleWARCAnalyser {

    /** Archive that is opened when no path is given on the command line. */
    private static final String DEFAULT_WARC_PATH =
            "/Users/andy/Documents/workspace/warc-discovery/CC-MAIN-20180814062251-20180814082251-00000.warc.gz";

    /** Number of records to report on before stopping. */
    private static final int MAX_RECORDS = 4;

    /**
     * Opens an archive file, walks its first {@value #MAX_RECORDS} records
     * and, for each one, prints the (sanitised) URL, offset, available bytes,
     * declared content length and the WARC-Payload-Digest / WARC-Block-Digest
     * header values to stdout. The bytes readable from the record are then
     * copied into memory, and their count and Base32-encoded SHA-1 digest are
     * printed to stderr.
     *
     * @param args
     *            optional; {@code args[0]} is the path of the archive to
     *            read. When absent, {@link #DEFAULT_WARC_PATH} is used.
     * @throws IOException
     *             if the archive cannot be opened, read or closed
     * @throws MalformedURLException
     *             declared by the reader factory
     * @throws NoSuchAlgorithmException
     *             if no SHA1 {@link MessageDigest} provider is available
     */
    public static void main(String[] args)
            throws MalformedURLException, IOException,
            NoSuchAlgorithmException {
        final String warcPath = (args != null && args.length > 0)
                ? args[0]
                : DEFAULT_WARC_PATH;

        ArchiveReader reader = ArchiveReaderFactory.get(warcPath);
        try {
            Iterator<ArchiveRecord> ir = reader.iterator();
            int recordCount = 0;
            // -1 (rather than 0) so that a failure on the very first record
            // is granted the same single retry as any later record.
            int lastFailedRecord = -1;

            // Iterate through each record in the WARC file
            while (ir.hasNext() && recordCount < MAX_RECORDS) {
                ArchiveRecord rec = null;
                try {
                    rec = ir.next();
                } catch (RuntimeException e) {
                    System.err.println("Exception on record after rec " + recordCount + " from test file. " + e);
                    // First failure at this position: remember it and try
                    // to advance again.
                    if (lastFailedRecord != recordCount) {
                        lastFailedRecord = recordCount;
                        continue;
                    }
                    // Second consecutive failure at the same position:
                    // give up on the remainder of the file.
                    System.err.println(
                            "Failed to reach next record, last record already on error - skipping the rest of the records");
                    break;
                }

                final String url = Normalisation.sanitiseWARCHeaderValue(rec.getHeader().getUrl());
                System.out.println("\n\nProcessing record for url " + url
                        + " from test file" + " @"
                        + rec.getHeader().getOffset() + " Avail "
                        + rec.available() + " length "
                        + rec.getHeader().getContentLength()
                        + "\nWARC-Payload-Digest: "
                        + rec.getHeader().getHeaderValue("WARC-Payload-Digest")
                        + "\nWARC-Block-Digest: "
                        + rec.getHeader().getHeaderValue("WARC-Block-Digest"));

                // Buffer everything readable from the record so that both
                // its size and its digest can be reported.
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                IOUtils.copy(rec, bos);
                final byte[] content = bos.toByteArray();

                // Note: these two lines go to stderr, unlike the summary
                // above which goes to stdout.
                System.err.println("Length was: " + content.length);
                System.err.println("SHA1 was: " + Base32.encode(MessageDigest
                        .getInstance("SHA1").digest(content)));
                recordCount += 1;
            }
        } finally {
            // Release the underlying file handle even if processing fails.
            reader.close();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment