Created
June 18, 2018 18:00
-
-
Save N0taN3rd/2507fb8a03f3dffeaf0859f321be5709 to your computer and use it in GitHub Desktop.
Dump warc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Not working 100%""" | |
import re
from collections import Counter, defaultdict
from itertools import zip_longest
from os.path import basename
from pathlib import Path
from typing import DefaultDict, List, Optional, Tuple, Union

from goldfinch import validFileName as vfn
from warcio.archiveiterator import WARCIterator
from warcio.recordloader import ArcWarcRecord
from yarl import URL
def find_warc(start_path: str = ".") -> Optional[Path]:
    """Return the first entry directly inside *start_path* named ``*.warc``.

    Entries are inspected in whatever order ``Path.iterdir`` yields them and
    sub-directories are not descended into.

    Args:
        start_path: Directory to look in; defaults to the current directory.

    Returns:
        The matching path, or ``None`` when no entry name ends in ``.warc``.
    """
    candidates = (
        entry
        for entry in Path(start_path).iterdir()
        if entry.name.endswith(".warc")
    )
    return next(candidates, None)
def filenamify_url(it) -> str:
    """Flatten a URL string into a single file-system-safe name.

    ``://`` and every ``/`` are replaced by ``_`` before the text is handed
    to goldfinch's ``validFileName`` (``initCap=False``); the value it
    returns is decoded as UTF-8.
    """
    flattened = it.replace("://", "_").replace("/", "_")
    safe_name = vfn(flattened, initCap=False)
    return safe_name.decode("utf-8")
def filenamify_content_type(it) -> str:
    """Flatten a Content-Type string into a file-system-safe name.

    ``;`` and ``/`` are replaced by ``_`` before the text is handed to
    goldfinch's ``validFileName`` (``initCap=False``); the value it returns
    is decoded as UTF-8.
    """
    flattened = it.replace(";", "_").replace("/", "_")
    safe_name = vfn(flattened, initCap=False)
    return safe_name.decode("utf-8")
def get_target_uri(record: ArcWarcRecord) -> str:
    """Return the value of the record's ``WARC-Target-URI`` WARC header."""
    warc_headers = record.rec_headers
    return warc_headers.get("WARC-Target-URI")
def get_content_type(record: ArcWarcRecord) -> str:
    """Return the value of the record's ``Content-Type`` HTTP header."""
    http_headers = record.http_headers
    return http_headers.get("Content-Type")
def get_method(record: ArcWarcRecord) -> str:
    """Return the ``protocol`` field of the record's HTTP headers.

    NOTE(review): for request records this is presumably the HTTP verb
    (e.g. ``GET``) -- confirm against warcio's status-line parsing.
    """
    http_headers = record.http_headers
    return http_headers.protocol
def is_get(record: ArcWarcRecord) -> bool:
    """Return True when the ``protocol`` field of the HTTP headers is ``GET``.

    NOTE(review): relies on warcio exposing the request verb through
    ``protocol`` -- confirm.
    """
    verb = record.http_headers.protocol
    return verb == "GET"
def ok(record: ArcWarcRecord) -> bool:
    """Return True when the record's HTTP response status is 200.

    The status code is taken from the HTTP status line via
    ``get_statuscode()``.  A header literally named ``status`` is still
    honoured as a fallback, so records that matched the previous
    header-only check keep matching.

    Args:
        record: A WARC record with parsed HTTP headers.

    Returns:
        ``True`` for a 200 response; ``False`` otherwise, including when the
        record carries no HTTP headers at all.
    """
    http_headers = record.http_headers
    if http_headers is None:
        return False
    if http_headers.get_statuscode() == "200":
        return True
    # Fallback kept for backward compatibility with the old header lookup.
    return http_headers.get("status") == "200"
def get_content(record: ArcWarcRecord) -> str:
    """Read and return whatever remains in the record's ``raw_stream``.

    NOTE(review): the stream read presumably yields ``bytes`` rather than
    ``str`` -- confirm and adjust the annotation if so.
    """
    stream = record.raw_stream
    return stream.read()
def get_warc_record_id(record: ArcWarcRecord) -> str:
    """Return the value of the record's ``WARC-Record-ID`` WARC header."""
    warc_headers = record.rec_headers
    return warc_headers.get("WARC-Record-ID")
def is_html_img_css_js(content_type: Optional[str]) -> bool:
    """Tell whether a Content-Type names HTML, an image, CSS or JavaScript.

    The check is a case-insensitive substring match, so parameters such as
    ``; charset=utf-8`` do not get in the way.

    Args:
        content_type: Raw ``Content-Type`` header value; may be ``None`` or
            empty when the response did not send the header.

    Returns:
        ``True`` when the value mentions ``html``, ``image``, ``css`` or
        ``javascript``; ``False`` otherwise (including for ``None``/empty).
    """
    if not content_type:
        return False
    lowered = content_type.lower()
    return any(
        marker in lowered for marker in ("html", "image", "css", "javascript")
    )
def get_file_ext(content_type: Optional[str]) -> Optional[str]:
    """Map a Content-Type value to the file extension used when dumping.

    Matching is a case-insensitive substring test, checked in this order:
    HTML, images (by subtype), CSS, JavaScript.

    Args:
        content_type: Raw ``Content-Type`` header value; may be ``None``.

    Returns:
        The extension including the leading dot (for example ``".html"``),
        or ``None`` when the type is missing, is an image subtype without a
        known extension, or is not one of the supported families.
    """
    if not content_type:
        return None
    lowered = content_type.lower()
    if "html" in lowered:
        return ".html"
    if "image" in lowered:
        # Ordered so the original png / jpg / gif precedence is preserved.
        image_extensions = (
            ("png", ".png"),
            ("jpg", ".jpg"),
            ("jpeg", ".jpg"),
            ("gif", ".gif"),
            ("svg", ".svg"),
            ("webp", ".webp"),
            ("bmp", ".bmp"),
            ("icon", ".ico"),
        )
        for marker, extension in image_extensions:
            if marker in lowered:
                return extension
        return None
    if "css" in lowered:
        return ".css"
    if "javascript" in lowered:
        return ".js"
    return None
class Dumper(object):
    """Dump HTML, image, CSS and JavaScript response bodies from a WARC file.

    ``init`` reads the WARC once, buffering GET request records and the
    bodies of 200 responses keyed by target URI.  ``go`` then writes the
    selected bodies into a directory named after the target URI of the
    WARC's metadata record.
    """

    def __init__(self, warc_p: Optional[Union[str, Path]]) -> None:
        """Remember the WARC path; nothing is read until ``init``/``go``.

        Args:
            warc_p: Path to the WARC file.  May be ``None`` here as long as
                a path is passed to ``go`` later.
        """
        self.warc_p: Optional[Union[str, Path]] = warc_p
        self._dump_dir: Optional[Path] = None
        self._inited: bool = False
        self._requests: DefaultDict[str, List[ArcWarcRecord]] = defaultdict(list)
        # Each entry pairs a response record with its fully read body.
        self._responses: DefaultDict[
            str, List[Tuple[ArcWarcRecord, bytes]]
        ] = defaultdict(list)
        self._dup_counter = Counter()

    def init(self):
        """Parse the WARC file and buffer the records needed by ``go``.

        Raises:
            Exception: If no WARC path was supplied.
        """
        # Validate before converting: Path(None) would raise a TypeError and
        # hide the intended error message.
        if not self.warc_p:
            raise Exception("You did not supply a path to a warc file")
        if not isinstance(self.warc_p, Path):
            self.warc_p = Path(self.warc_p)
        with self.warc_p.open("rb") as warcin:
            for record in WARCIterator(warcin, ensure_http_headers=True):
                rect = record.rec_type
                if rect == "metadata":
                    url = get_target_uri(record)
                    if url:
                        self._dump_dir = Path(filenamify_url(url))
                        self._dump_dir.mkdir(exist_ok=True)
                elif rect == "request":
                    if is_get(record):
                        self._requests[get_target_uri(record)].append(record)
                elif rect == "response":
                    if ok(record):
                        # The body has to be read while the file is open and
                        # before the iterator moves on to the next record.
                        body = record.content_stream().read()
                        self._responses[get_target_uri(record)].append(
                            (record, body)
                        )
        if self._dump_dir is None:
            # No usable metadata record: fall back to a directory named after
            # the WARC file so that dumping still has a destination.
            self._dump_dir = Path(self.warc_p.stem)
            self._dump_dir.mkdir(exist_ok=True)
        # Only mark as initialised once parsing succeeded, so a failed
        # attempt can be retried.
        self._inited = True

    def unique_filename_url(self, url: URL, ct: str) -> Optional[Path]:
        """Build a not-yet-existing path inside the dump directory for *url*.

        Args:
            url: URL of the response being dumped.
            ct: The response's ``Content-Type`` value, used to pick the
                file extension.

        Returns:
            The path to write to, or ``None`` when no extension is known for
            *ct* or the flattened URL is longer than 100 characters.
        """
        ext = get_file_ext(ct)
        if ext is None:
            return None
        fn = filenamify_url(str(url))
        if len(fn) > 100:
            return None
        file = fn if fn.endswith(ext) else "%s%s" % (fn, ext)
        sp = self._dump_dir / file
        # Prefix a counter until the name is free; the extension is never
        # appended a second time.
        while sp.exists():
            self._dup_counter[file] += 1
            sp = self._dump_dir / ("%d__%s" % (self._dup_counter[file], file))
        return sp

    def go(self, warc_p: Optional[Union[str, Path]] = None) -> None:
        """Write the buffered response bodies to the dump directory.

        Args:
            warc_p: Optional WARC path that replaces the one given to the
                constructor.  ``init`` is run first if it has not been yet.
        """
        if warc_p is not None:
            self.warc_p = warc_p
        if not self._inited:
            self.init()
        for resz in self._responses.values():
            for res, body in resz:
                rct = get_content_type(res)
                if not rct or not is_html_img_css_js(rct):
                    continue
                url = URL(get_target_uri(res))
                # NOTE(review): only URLs whose path has no final component
                # (i.e. ends in "/") are written; handling of URLs with a
                # basename was left unfinished -- confirm the intent.
                if basename(url.path):
                    continue
                p = self.unique_filename_url(url, rct)
                if p is None:
                    continue
                # Use the body buffered in init(); the record's own streams
                # were consumed while the WARC was being iterated.
                with p.open("wb") as out:
                    out.write(body)
if __name__ == "__main__":
    # Dump the first *.warc file found in the current directory.
    warc_path = find_warc()
    if warc_path is None:
        # Exit with a readable message instead of failing later on a None path.
        raise SystemExit("No .warc file found in the current directory")
    Dumper(warc_path).go()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const Parser = require('node-warc') | |
const filenamifyURL = require('filenamify-url') | |
const fs = require('fs-extra') | |
const path = require('path') | |
// WARC file to dump: the first command-line argument when one is given,
// otherwise the placeholder below, which must be replaced by hand.
const parser = new Parser(process.argv[2] || '<path-to-warcfile>')
/**
 * Groups WARC records by target URI: one request per URI (the latest one
 * added wins), a list of responses per URI, and a single metadata record.
 */
class WARCMap {
  constructor () {
    this._requests = new Map()
    this._responses = new Map()
    this.metadata = null
  }

  /** Target URI of the stored metadata record. */
  get site () {
    const meta = this.metadata
    return meta.targetURI
  }

  /** Store `record` as the request for its target URI. */
  addRequest (record) {
    const uri = record.targetURI
    this._requests.set(uri, record)
  }

  /** Append `record` to the list of responses for its target URI. */
  addResponse (record) {
    const uri = record.targetURI
    const known = this._responses.get(uri)
    if (known) {
      known.push(record)
    } else {
      this._responses.set(uri, [record])
    }
  }

  /**
   * Yields `{url, req, resz}` for every request URI that also has at least
   * one stored response; requests without responses are skipped.
   */
  * [Symbol.iterator] () {
    for (const [url, req] of this._requests.entries()) {
      if (!this._responses.has(url)) {
        continue
      }
      const resz = this._responses.get(url)
      yield { url, req, resz }
    }
  }
}
/**
 * Map a Content-Type value to the file extension used when dumping.
 *
 * Matching is a case-insensitive substring test, checked in this order:
 * HTML, images (png / jpg / gif), CSS, JavaScript.
 *
 * @param {string} [contentType] raw Content-Type header value
 * @returns {?string} the extension with its leading dot, or null when the
 *   value is not a string or names an unsupported type
 */
function getFileExt (contentType) {
  // A response without a Content-Type header hands us undefined here.
  if (typeof contentType !== 'string') {
    return null
  }
  const type = contentType.toLowerCase()
  if (type.includes('html')) {
    return '.html'
  }
  if (type.includes('image')) {
    if (type.includes('png')) {
      return '.png'
    }
    if (type.includes('jpg') || type.includes('jpeg')) {
      return '.jpg'
    }
    if (type.includes('gif')) {
      return '.gif'
    }
    return null
  }
  if (type.includes('css')) {
    return '.css'
  }
  if (type.includes('javascript')) {
    return '.js'
  }
  return null
}
// Single shared store filled while the WARC is being parsed.
const requestMap = new WARCMap()

/**
 * File a parsed record into `requestMap`: responses with status code 200,
 * requests whose method is GET (compared case-insensitively), and the
 * metadata record. Every other record is ignored.
 */
function addRecord (record) {
  switch (record.type) {
    case 'response':
      if (record.statusCode === 200) {
        requestMap.addResponse(record)
      }
      break
    case 'request':
      if (record.method.toLowerCase() === 'get') {
        requestMap.addRequest(record)
      }
      break
    case 'metadata':
      requestMap.metadata = record
      break
    default:
      break
  }
}
// Collect every record as the parser emits it.
parser.on('record', record => {
  addRecord(record)
})

// When parsing finishes, file the final record and write the collected
// response bodies into a directory named after the crawled site.
parser.on('done', async finalRecord => {
  try {
    if (finalRecord) {
      addRecord(finalRecord)
    }
    if (!requestMap.metadata) {
      console.error('No metadata record found; cannot name the dump directory')
      return
    }
    const dumpDir = filenamifyURL(requestMap.site)
    await fs.ensureDir(dumpDir)
    // Paths written during this run, so that several responses for the same
    // URL do not overwrite one another.
    const written = new Set()
    for (const { url, resz } of requestMap) {
      let fn
      try {
        fn = filenamifyURL(url)
      } catch (e) {
        // URLs that cannot be turned into a file name are skipped.
        continue
      }
      for (const res of resz) {
        const contentType = res.httpHeaders && res.httpHeaders['Content-Type']
        const ext = contentType ? getFileExt(contentType) : null
        if (!ext) {
          continue
        }
        let fp = path.join(dumpDir, `${fn}${ext}`)
        for (let n = 1; written.has(fp); n++) {
          fp = path.join(dumpDir, `${n}__${fn}${ext}`)
        }
        written.add(fp)
        // bodyBuffer is written as-is; no text encoding is applied.
        await fs.writeFile(fp, res.bodyBuffer)
      }
    }
  } catch (error) {
    // Without this, a failure in the async handler would surface only as an
    // unhandled promise rejection.
    console.error(error)
  }
})

parser.on('error', error => {
  console.error(error)
})

parser.start()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "dependencies": {
    "filenamify-url": "^1.0.0",
    "fs-extra": "^6.0.1",
    "node-warc": "^2.0.0"
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
warcio | |
goldfinch | |
yarl |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment