Skip to content

Instantly share code, notes, and snippets.

@N0taN3rd
Created June 18, 2018 18:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save N0taN3rd/2507fb8a03f3dffeaf0859f321be5709 to your computer and use it in GitHub Desktop.
Save N0taN3rd/2507fb8a03f3dffeaf0859f321be5709 to your computer and use it in GitHub Desktop.
Dump warc
"""Not working 100%"""
import re
from collections import Counter, defaultdict
from itertools import zip_longest
from os.path import basename
from pathlib import Path
from typing import DefaultDict, List, Optional, Tuple, Union

from goldfinch import validFileName as vfn
from warcio.archiveiterator import WARCIterator
from warcio.recordloader import ArcWarcRecord
from yarl import URL
def find_warc(start_path: str = ".") -> Optional[Path]:
    """Return the first WARC file found directly inside *start_path*.

    Only regular files whose name ends in ``.warc`` or ``.warc.gz`` are
    considered; sub-directories are not searched.  Candidates are examined
    in sorted order so repeated runs over the same directory pick the same
    file.

    Args:
        start_path: Directory to look in (defaults to the current directory).

    Returns:
        The path of the first matching file, or ``None`` when there is none.
    """
    for candidate in sorted(Path(start_path).iterdir()):
        # A directory that merely happens to be named "*.warc" cannot be
        # opened as an archive, so it is skipped.
        if candidate.is_file() and candidate.name.endswith((".warc", ".warc.gz")):
            return candidate
    return None
def filenamify_url(it) -> str:
    """Turn a URL string into a flat name usable as a file name.

    The scheme separator (``://``) and every ``/`` are replaced with ``_``;
    the result is passed through goldfinch's ``validFileName`` (with
    ``initCap=False``) and the returned value is decoded as UTF-8.
    """
    flattened = it.replace("://", "_").replace("/", "_")
    safe_name = vfn(flattened, initCap=False)
    return safe_name.decode("utf-8")
def filenamify_content_type(it) -> str:
    """Turn a Content-Type value into a name usable as a file name.

    Every ``;`` and ``/`` is replaced with ``_``; the result is passed
    through goldfinch's ``validFileName`` (with ``initCap=False``) and the
    returned value is decoded as UTF-8.
    """
    flattened = it.replace(";", "_").replace("/", "_")
    safe_name = vfn(flattened, initCap=False)
    return safe_name.decode("utf-8")
def get_target_uri(record: ArcWarcRecord) -> str:
    """Look up the ``WARC-Target-URI`` entry in the record's WARC headers."""
    warc_headers = record.rec_headers
    return warc_headers.get("WARC-Target-URI")
def get_content_type(record: ArcWarcRecord) -> str:
    """Look up the ``Content-Type`` entry in the record's HTTP headers."""
    http_headers = record.http_headers
    return http_headers.get("Content-Type")
def get_method(record: ArcWarcRecord) -> str:
    """Return the ``protocol`` attribute of the record's HTTP headers.

    NOTE(review): for request records this presumably holds the HTTP verb
    (``is_get`` compares the same attribute to ``"GET"``) -- confirm against
    the warcio header parser.
    """
    http_headers = record.http_headers
    return http_headers.protocol
def is_get(record: ArcWarcRecord) -> bool:
    """Tell whether the ``protocol`` of the record's HTTP headers is ``"GET"``."""
    method = record.http_headers.protocol
    return method == "GET"
def ok(record: ArcWarcRecord) -> bool:
    """Tell whether *record* carries an HTTP ``200`` response.

    The status code is part of the HTTP status line, not a header named
    ``status``, so it is read through ``get_statuscode()`` rather than a
    header lookup (which practically never matched ``"200"``).

    Returns:
        ``True`` only when the record has HTTP headers and their status code
        is the string ``"200"``.
    """
    http_headers = record.http_headers
    if http_headers is None:
        # Records without parsed HTTP headers cannot be a 200 response.
        return False
    return http_headers.get_statuscode() == "200"
def get_content(record: ArcWarcRecord) -> str:
    """Read and return everything left in the record's ``raw_stream``.

    NOTE(review): the return annotation says ``str``, but the WARC file is
    opened in binary mode by ``Dumper.init``, so this presumably yields
    ``bytes`` -- confirm.
    """
    stream = record.raw_stream
    return stream.read()
def get_warc_record_id(record: ArcWarcRecord) -> str:
    """Look up the ``WARC-Record-ID`` entry in the record's WARC headers."""
    warc_headers = record.rec_headers
    return warc_headers.get("WARC-Record-ID")
def is_html_img_css_js(content_type: Optional[str]) -> bool:
    """Tell whether a Content-Type value denotes HTML, an image, CSS or JS.

    The check is a case-insensitive substring match, so parameters such as
    ``; charset=utf-8`` do not interfere.

    Args:
        content_type: The Content-Type header value; may be ``None`` or
            empty when the response carried no such header.

    Returns:
        ``True`` when the value mentions ``html``, ``image``, ``css`` or
        ``javascript``; ``False`` otherwise (including for missing values).
    """
    if not content_type:
        return False
    lowered = content_type.lower()
    return any(kind in lowered for kind in ("html", "image", "css", "javascript"))
def get_file_ext(content_type: Optional[str]) -> Optional[str]:
    """Map a Content-Type value to the file extension used when dumping.

    Matching is a case-insensitive substring test, checked in this order:
    HTML, images (PNG, JPEG, GIF), CSS, JavaScript.

    Args:
        content_type: The Content-Type header value; may be ``None`` or
            empty when the response carried no such header.

    Returns:
        The extension including the leading dot (``".html"``, ``".png"``,
        ``".jpg"``, ``".gif"``, ``".css"`` or ``".js"``), or ``None`` when
        the type is missing or not one of the supported kinds.
    """
    if not content_type:
        return None
    lowered = content_type.lower()
    if "html" in lowered:
        return ".html"
    if "image" in lowered:
        if "png" in lowered:
            return ".png"
        if "jpg" in lowered or "jpeg" in lowered:
            return ".jpg"
        if "gif" in lowered:
            return ".gif"
        # An image of an unsupported sub-type: do not fall through to the
        # CSS / JavaScript checks below.
        return None
    if "css" in lowered:
        return ".css"
    if "javascript" in lowered:
        return ".js"
    return None
class Dumper(object):
    """Dump the HTML, image, CSS and JavaScript responses of a WARC file.

    Usage: ``Dumper(path).go()``.  ``init`` reads the archive and collects
    GET requests and ``ok`` responses keyed by target URI; ``go`` writes the
    body of every supported response into the dump directory.
    """

    # File names (before the extension is appended) longer than this are
    # skipped rather than written.
    _MAX_NAME_LEN = 100

    def __init__(self, warc_p: Optional[Union[str, Path]]) -> None:
        """Remember the WARC path; nothing is read until ``init``/``go``."""
        self.warc_p: Optional[Union[str, Path]] = warc_p
        self._dump_dir: Optional[Path] = None
        self._inited: bool = False
        self._requests: DefaultDict[str, List[ArcWarcRecord]] = defaultdict(list)
        # Each entry pairs a response record with its already-read body; the
        # body has to be read while the archive is still being iterated.
        self._responses: DefaultDict[
            str, List[Tuple[ArcWarcRecord, bytes]]
        ] = defaultdict(list)
        self._dup_counter: Counter = Counter()

    def init(self) -> None:
        """Read the WARC file and collect its requests and responses.

        The dump directory is named after the target URI of the (last)
        metadata record; when the archive has no such record the directory
        is named after the WARC file instead.  The directory is created if
        it does not exist.

        Raises:
            ValueError: If no WARC path was supplied.
        """
        # Validate before converting: Path(None) would raise a TypeError and
        # hide the intended message.
        if not self.warc_p:
            raise ValueError("You did not supply a path to a warc file")
        if not isinstance(self.warc_p, Path):
            self.warc_p = Path(self.warc_p)

        # Start from a clean slate so a repeated init does not accumulate
        # records from an earlier pass.
        self._dump_dir = None
        self._requests.clear()
        self._responses.clear()
        self._dup_counter.clear()

        with self.warc_p.open("rb") as warcin:
            for record in WARCIterator(warcin, ensure_http_headers=True):
                rect = record.rec_type
                if rect == "metadata":
                    url = get_target_uri(record)
                    if url:
                        self._dump_dir = Path(filenamify_url(url))
                elif rect == "request":
                    if is_get(record):
                        self._requests[get_target_uri(record)].append(record)
                elif rect == "response":
                    if ok(record):
                        body = record.content_stream().read()
                        self._responses[get_target_uri(record)].append(
                            (record, body)
                        )

        if self._dump_dir is None:
            # No usable metadata record: fall back to a name derived from
            # the archive so dumping can still proceed.
            self._dump_dir = Path(f"{self.warc_p.stem}_dump")
        self._dump_dir.mkdir(exist_ok=True)
        # Only mark as initialised once the archive was read successfully.
        self._inited = True

    def unique_filename_url(self, url: URL, ct: str) -> Optional[Path]:
        """Build a not-yet-existing path in the dump directory for *url*.

        Args:
            url: The URL the content was fetched from.
            ct: The Content-Type of the content, used to pick the extension.

        Returns:
            The path to write to, or ``None`` when the content type has no
            known extension or the generated name is too long.
        """
        ext = get_file_ext(ct)
        if ext is None:
            return None
        fn = filenamify_url(str(url))
        if len(fn) > self._MAX_NAME_LEN:
            return None
        file = fn if fn.endswith(ext) else f"{fn}{ext}"
        sp = self._dump_dir / file
        # Prefix a running number until the name is free, so neither earlier
        # responses for the same URL nor files from a previous run are
        # overwritten.
        while sp.exists():
            self._dup_counter[file] += 1
            sp = self._dump_dir / f"{self._dup_counter[file]}__{file}"
        return sp

    def go(self, warc_p: Optional[Union[str, Path]] = None) -> None:
        """Dump every supported response body to the dump directory.

        Args:
            warc_p: Optional WARC path overriding the one given to the
                constructor; a path different from the one already read
                triggers a fresh ``init``.

        Raises:
            ValueError: If no WARC path is available.
        """
        if warc_p is not None:
            if self.warc_p is None or Path(warc_p) != Path(self.warc_p):
                self._inited = False
            self.warc_p = warc_p
        if not self._inited:
            self.init()
        for resz in self._responses.values():
            for res, body in resz:
                rct = get_content_type(res)
                # Responses without a Content-Type cannot be classified.
                if not rct or not is_html_img_css_js(rct):
                    continue
                target = self.unique_filename_url(URL(get_target_uri(res)), rct)
                if target is None:
                    continue
                with target.open("wb") as out:
                    out.write(body)
if __name__ == "__main__":
    # Dump the first WARC file found in the current directory; stop with a
    # readable message instead of a traceback when there is none.
    warc_path = find_warc()
    if warc_path is None:
        raise SystemExit("No .warc file found in the current directory")
    dumper = Dumper(warc_path)
    dumper.go()
const Parser = require('node-warc')
const filenamifyURL = require('filenamify-url')
const fs = require('fs-extra')
const path = require('path')
// WARC file to dump: the first command-line argument when one is given,
// otherwise the original placeholder (replace it or pass a path on the CLI).
const parser = new Parser(process.argv[2] || '<path-to-warcfile>')
/**
 * Collects WARC records keyed by their target URI.
 *
 * One request record is kept per URI (a later one replaces an earlier one),
 * while response records are accumulated in a list per URI.  Iterating the
 * map yields `{url, req, resz}` for every request URI that also has
 * responses.
 */
class WARCMap {
  constructor () {
    this._requests = new Map()
    this._responses = new Map()
    this.metadata = null
  }

  /** Target URI of the stored metadata record. */
  get site () {
    const meta = this.metadata
    return meta.targetURI
  }

  /** Store `record` as the request for its target URI. */
  addRequest (record) {
    const key = record.targetURI
    this._requests.set(key, record)
  }

  /** Append `record` to the responses of its target URI. */
  addResponse (record) {
    const key = record.targetURI
    if (this._responses.has(key)) {
      this._responses.get(key).push(record)
    } else {
      this._responses.set(key, [record])
    }
  }

  /** Yield each request together with the responses recorded for its URI. */
  * [Symbol.iterator] () {
    for (const entry of this._requests.entries()) {
      const url = entry[0]
      const req = entry[1]
      if (!this._responses.has(url)) {
        continue
      }
      const resz = this._responses.get(url)
      yield {url, req, resz}
    }
  }
}
/**
 * Map a Content-Type value to the file extension used when dumping.
 *
 * Matching is a case-insensitive substring test, checked in this order:
 * HTML, images (PNG, JPEG, GIF), CSS, JavaScript.
 *
 * @param {?string} contentType - Content-Type header value; may be missing.
 * @returns {?string} The extension including the leading dot, or `null`
 *   when the value is missing or not one of the supported kinds.
 */
function getFileExt (contentType) {
  // A response without a Content-Type header yields undefined here.
  if (typeof contentType !== 'string' || contentType.length === 0) {
    return null
  }
  const type = contentType.toLowerCase()
  if (type.includes('html')) {
    return '.html'
  }
  if (type.includes('image')) {
    if (type.includes('png')) {
      return '.png'
    }
    if (type.includes('jpg') || type.includes('jpeg')) {
      return '.jpg'
    }
    if (type.includes('gif')) {
      return '.gif'
    }
    // Unsupported image sub-type: do not fall through to the CSS/JS checks.
    return null
  }
  if (type.includes('css')) {
    return '.css'
  }
  if (type.includes('javascript')) {
    return '.js'
  }
  return null
}
// Shared store that the parser callbacks fill and the 'done' handler reads.
const requestMap = new WARCMap()

/**
 * File a parsed record into `requestMap`: responses with status code 200,
 * requests whose method is GET (compared case-insensitively), and the
 * metadata record.  Every other record is ignored.
 */
function addRecord (record) {
  switch (record.type) {
    case 'response':
      if (record.statusCode === 200) {
        requestMap.addResponse(record)
      }
      break
    case 'request':
      if (record.method.toLowerCase() === 'get') {
        requestMap.addRequest(record)
      }
      break
    case 'metadata':
      requestMap.metadata = record
      break
    default:
      break
  }
}
// File every record the parser emits into the shared request map.
parser.on('record', function onRecord (record) {
  addRecord(record)
})
// Once parsing is finished, write every dumpable response body into a
// directory named after the archived site.
parser.on('done', async finalRecord => {
  // The emitter does not await this async listener, so failures are caught
  // and reported here instead of surfacing as unhandled rejections.
  try {
    addRecord(finalRecord)
    if (!requestMap.metadata) {
      throw new Error('WARC has no metadata record; cannot name the dump directory')
    }
    const dumpDir = filenamifyURL(requestMap.site)
    // ensureDir creates the directory only when it is missing.
    await fs.ensureDir(dumpDir)
    for (const {url, resz} of requestMap) {
      let fn
      try {
        fn = filenamifyURL(url)
      } catch (e) {
        // URLs that cannot be turned into a file name are skipped.
        continue
      }
      let written = 0
      for (const res of resz) {
        const headers = res.httpHeaders || {}
        const contentType = headers['Content-Type'] || headers['content-type']
        const ext = contentType ? getFileExt(contentType) : null
        if (!ext) {
          continue
        }
        // Later responses for the same URL get a numeric prefix so they do
        // not overwrite the first one.
        const name = written === 0 ? `${fn}${ext}` : `${written}__${fn}${ext}`
        // No encoding argument: the body is written as-is.
        await fs.writeFile(path.join(dumpDir, name), res.bodyBuffer)
        written += 1
      }
    }
  } catch (error) {
    console.error(error)
  }
})
// Report parser failures on stderr.
parser.on('error', function onError (error) {
  console.error(error)
})

// Handlers are registered above; begin parsing.
parser.start()
{
"dependencies": {
"filenamify-url": "^1.0.0",
"fs-extra": "^6.0.1",
"node-warc": "^2.0.0"
}
}
warcio
goldfinch
yarl
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment