N0taN3rd/dumpWarc.js

## dump_warc.py
"""Not working 100%"""
import re
from os.path import basename
from collections import defaultdict, Counter
from itertools import zip_longest
from pathlib import Path
from typing import DefaultDict, Optional, Union, List

from goldfinch import validFileName as vfn
from warcio.archiveiterator import WARCIterator
from warcio.recordloader import ArcWarcRecord
from yarl import URL


def find_warc(start_path: str = ".") -> Optional[Path]:
    """Return the first entry of ``start_path`` whose name ends with ``.warc``.

    Entries are examined in the order ``Path.iterdir`` yields them.

    :param start_path: directory to look in (defaults to the current one)
    :return: the matching path, or ``None`` when no entry matches
    """
    warc_entries = (
        entry for entry in Path(start_path).iterdir() if entry.name.endswith(".warc")
    )
    return next(warc_entries, None)


def filenamify_url(it) -> str:
    """Flatten the URL string ``it`` into a single file-name component.

    ``://`` and every ``/`` are replaced with ``_``; the result is passed to
    goldfinch's ``validFileName`` (``initCap=False``) and the value it returns
    is decoded as UTF-8.
    """
    flattened = it.replace("://", "_").replace("/", "_")
    safe_name = vfn(flattened, initCap=False)
    return safe_name.decode("utf-8")


def filenamify_content_type(it) -> str:
    """Flatten the content-type string ``it`` into a file-name component.

    Every ``;`` and ``/`` is replaced with ``_``; the result is passed to
    goldfinch's ``validFileName`` (``initCap=False``) and the value it returns
    is decoded as UTF-8.
    """
    flattened = it.replace(";", "_").replace("/", "_")
    safe_name = vfn(flattened, initCap=False)
    return safe_name.decode("utf-8")


def get_target_uri(record: ArcWarcRecord) -> str:
    """Look up the ``WARC-Target-URI`` entry in the record's ``rec_headers``."""
    warc_headers = record.rec_headers
    return warc_headers.get("WARC-Target-URI")


def get_content_type(record: ArcWarcRecord) -> str:
    """Look up the ``Content-Type`` entry in the record's ``http_headers``."""
    http_headers = record.http_headers
    return http_headers.get("Content-Type")


def get_method(record: ArcWarcRecord) -> str:
    """Return the ``protocol`` attribute of the record's ``http_headers``.

    NOTE(review): for request records this attribute presumably holds the
    HTTP verb (``is_get`` compares it with ``"GET"``) -- confirm against warcio.
    """
    http_headers = record.http_headers
    return http_headers.protocol


def is_get(record: ArcWarcRecord) -> bool:
    """Tell whether the ``protocol`` of the record's ``http_headers`` is exactly ``"GET"``."""
    verb = record.http_headers.protocol
    return verb == "GET"


def ok(record: ArcWarcRecord) -> bool:
    """Return ``True`` when the record's HTTP status code is 200.

    The status code is part of the HTTP status line rather than a header
    named ``status``, so it is read through ``get_statuscode()`` instead of a
    header lookup (which practically never matched).  This mirrors the
    ``statusCode === 200`` check in dumpWarc.js.

    :param record: a response record with parsed HTTP headers
    :return: ``False`` when the record has no HTTP headers or any other status
    """
    http_headers = record.http_headers
    if http_headers is None:
        return False
    return http_headers.get_statuscode() == "200"


def get_content(record: ArcWarcRecord) -> str:
    """Return the result of calling ``read()`` on the record's ``raw_stream``.

    NOTE(review): the annotation says ``str`` but a binary stream would yield
    ``bytes`` -- confirm which one callers expect.
    """
    stream = record.raw_stream
    return stream.read()


def get_warc_record_id(record: ArcWarcRecord) -> str:
    """Look up the ``WARC-Record-ID`` entry in the record's ``rec_headers``."""
    warc_headers = record.rec_headers
    return warc_headers.get("WARC-Record-ID")


def is_html_img_css_js(content_type: str) -> bool:
    """Tell whether ``content_type`` names HTML, an image, CSS or JavaScript.

    Matching is by substring and ignores case, because media types are
    case-insensitive.  A missing content type (``None`` or an empty string,
    e.g. when a response carries no ``Content-Type`` header) yields ``False``
    instead of raising ``TypeError``.
    """
    if not content_type:
        return False
    lowered = content_type.lower()
    return any(kind in lowered for kind in ("html", "image", "css", "javascript"))


def get_file_ext(content_type: str) -> Optional[str]:
    """Map a content type to the file extension used when dumping it.

    Matching is by substring and ignores case.  HTML is checked first, then
    images (only PNG, JPEG and GIF are recognised), then CSS, then JavaScript.

    :param content_type: value of a ``Content-Type`` header; may be ``None``
    :return: the extension including the leading dot, or ``None`` when the
        content type is missing or not one of the recognised kinds
    """
    if not content_type:
        return None
    lowered = content_type.lower()
    if "html" in lowered:
        return ".html"
    if "image" in lowered:
        if "png" in lowered:
            return ".png"
        if "jpg" in lowered or "jpeg" in lowered:
            return ".jpg"
        if "gif" in lowered:
            return ".gif"
        # Any other image sub-type has no extension mapping.
        return None
    if "css" in lowered:
        return ".css"
    if "javascript" in lowered:
        return ".js"
    return None


class Dumper(object):
    """Write HTML, image, CSS and JavaScript response bodies of a WARC to disk.

    ``init`` reads the WARC file once, keeping the GET request records and the
    bodies of the responses accepted by ``ok``.  ``go`` then writes the
    collected bodies into a dump directory named after the target URI of the
    WARC's metadata record.
    """

    def __init__(self, warc_p: Optional[Union[str, Path]]) -> None:
        """Remember the WARC path; nothing is read until ``init``/``go``."""
        self.warc_p: Optional[Union[str, Path]] = warc_p
        self._dump_dir: Optional[Path] = None
        self._inited: bool = False
        # Request records grouped by target URI (collected, not used by ``go``).
        self._requests: DefaultDict[str, List[ArcWarcRecord]] = defaultdict(list)
        # ``(response record, body)`` pairs grouped by target URI.
        self._responses: DefaultDict[str, List[tuple]] = defaultdict(list)
        # How many times each output file name has already been taken.
        self._dup_counter = Counter()

    def init(self):
        """Read the WARC file and collect its requests and responses.

        :raises ValueError: when no WARC path was supplied
        """
        # Checked before ``Path(...)`` so a missing path produces this message
        # rather than a ``TypeError`` from ``Path(None)``.
        if not self.warc_p:
            raise ValueError("You did not supply a path to a warc file")
        warc_path = Path(self.warc_p)
        self.warc_p = warc_path

        # Start from a clean slate so that re-reading does not duplicate data.
        self._dump_dir = None
        self._requests.clear()
        self._responses.clear()
        self._dup_counter.clear()

        with warc_path.open("rb") as warcin:
            for record in WARCIterator(warcin, ensure_http_headers=True):
                rect = record.rec_type
                if rect == "metadata":
                    url = get_target_uri(record)
                    if url:
                        self._dump_dir = Path(filenamify_url(url))
                elif rect == "request":
                    if is_get(record):
                        self._requests[get_target_uri(record)].append(record)
                elif rect == "response":
                    if ok(record):
                        # The body has to be read while the file is still open.
                        body = record.content_stream().read()
                        self._responses[get_target_uri(record)].append((record, body))

        if self._dump_dir is None:
            # No metadata record named a target URI: fall back to a directory
            # derived from the WARC file name instead of failing later on.
            self._dump_dir = Path("%s_dump" % warc_path.stem)
        self._dump_dir.mkdir(parents=True, exist_ok=True)
        # Only flag success once everything above has worked.
        self._inited = True

    def unique_filename_url(self, url: URL, ct: str) -> Optional[Path]:
        """Build a not-yet-existing path in the dump directory for ``url``.

        :param url: URL the response was fetched from
        :param ct: content type of the response, used to pick the extension
        :return: the path to write to, or ``None`` when the content type has
            no known extension or the flattened URL is longer than 100
            characters
        """
        ext = get_file_ext(ct)
        if ext is None:
            return None
        fn = filenamify_url(str(url))
        if len(fn) > 100:
            return None
        file = fn if fn.endswith(ext) else "%s%s" % (fn, ext)
        sp = self._dump_dir / file
        while sp.exists():
            # Prefix a running number; ``file`` already carries the extension,
            # so it is not appended a second time.
            self._dup_counter[file] += 1
            sp = self._dump_dir / ("%d__%s" % (self._dup_counter[file], file))
        return sp

    def go(self, warc_p: Optional[Union[str, Path]] = None) -> None:
        """Dump the collected responses, reading the WARC first if needed.

        :param warc_p: optional WARC path replacing the one given to the
            constructor; a path different from the one already read triggers
            a fresh read
        """
        if warc_p is not None:
            if self.warc_p is None or Path(warc_p) != Path(self.warc_p):
                self._inited = False
            self.warc_p = warc_p
        if not self._inited:
            self.init()
        for resz in self._responses.values():
            for res, c in resz:
                rct = get_content_type(res)
                if not rct or not is_html_img_css_js(rct):
                    continue
                url = URL(get_target_uri(res))
                if basename(url.path):
                    # NOTE(review): only URLs whose path has no final segment
                    # (e.g. "/" or "/dir/") are written; everything else is
                    # still skipped, as in the original work in progress.
                    continue
                p = self.unique_filename_url(url, rct)
                if p is None:
                    continue
                with p.open("wb") as out:
                    out.write(c)


if __name__ == "__main__":
    # find_warc() returns None when the current directory holds no *.warc
    # file; stop with a readable message instead of failing inside Dumper.
    warc_path = find_warc()
    if warc_path is None:
        raise SystemExit("No .warc file found in the current directory")
    dumper = Dumper(warc_path)
    dumper.go()

## dumpWarc.js
const Parser = require('node-warc')
const filenamifyURL = require('filenamify-url')
const fs = require('fs-extra')
const path = require('path')

// The WARC to dump: first command-line argument when given, otherwise the
// placeholder path that has to be edited by hand.
const parser = new Parser(process.argv[2] || '<path-to-warcfile>')

/**
 * Collects the records of interest from a WARC, keyed by target URI.
 * Iterating an instance yields `{url, req, resz}` for every stored request
 * that has at least one stored response.
 */
class WARCMap {
  constructor () {
    // targetURI -> the request record most recently added for that URI
    this._requests = new Map()
    // targetURI -> array of response records in the order they were added
    this._responses = new Map()
    // the metadata record, once one has been assigned
    this.metadata = null
  }

  /**
   * Target URI of the metadata record.
   * @return {?string} null while no metadata record has been assigned
   */
  get site () {
    return this.metadata ? this.metadata.targetURI : null
  }

  /**
   * Store a request record, replacing any earlier one for the same URI.
   * @param {Object} record
   */
  addRequest (record) {
    this._requests.set(record.targetURI, record)
  }

  /**
   * Append a response record to the list kept for its target URI.
   * @param {Object} record
   */
  addResponse (record) {
    let list = this._responses.get(record.targetURI)
    if (!list) {
      list = []
      this._responses.set(record.targetURI, list)
    }
    list.push(record)
  }

  * [Symbol.iterator] () {
    for (const [url, req] of this._requests) {
      const resz = this._responses.get(url)
      if (resz) {
        yield {url, req, resz}
      }
    }
  }
}

/**
 * Map a Content-Type value to the file extension used when dumping it.
 * Matching is by substring and ignores case; HTML is checked first, then
 * images (PNG, JPEG and GIF only), then CSS, then JavaScript.
 * @param {?string} contentType
 * @return {?string} extension including the dot, or null when the content
 *   type is missing, not a string, or not one of the recognised kinds
 */
function getFileExt (contentType) {
  if (typeof contentType !== 'string') {
    return null
  }
  const type = contentType.toLowerCase()
  if (type.includes('html')) {
    return '.html'
  }
  if (type.includes('image')) {
    if (type.includes('png')) {
      return '.png'
    }
    if (type.includes('jpg') || type.includes('jpeg')) {
      return '.jpg'
    }
    if (type.includes('gif')) {
      return '.gif'
    }
    // any other image sub-type has no extension mapping
    return null
  }
  if (type.includes('css')) {
    return '.css'
  }
  if (type.includes('javascript')) {
    return '.js'
  }
  return null
}

// Every record the parser emits is sorted into this map.
const requestMap = new WARCMap()

/**
 * File a parsed record into `requestMap`: responses with status 200, GET
 * requests, and the metadata record are kept; everything else is ignored.
 * A missing record (e.g. no final record on 'done') is ignored as well.
 * @param {?Object} record
 */
function addRecord (record) {
  if (!record) {
    return
  }
  const isGet = typeof record.method === 'string' &&
    record.method.toLowerCase() === 'get'
  if (record.type === 'response' && record.statusCode === 200) {
    requestMap.addResponse(record)
  } else if (record.type === 'request' && isGet) {
    requestMap.addRequest(record)
  } else if (record.type === 'metadata') {
    requestMap.metadata = record
  }
}

parser.on('record', record => {
  addRecord(record)
})

// Once parsing has finished, write every collected response of a supported
// content type into a directory named after the archived site.
parser.on('done', async finalRecord => {
  // The emitter does not observe the promise returned by an async listener,
  // so failures are caught and reported here.
  try {
    addRecord(finalRecord)
    const site = requestMap.site
    if (!site) {
      throw new Error('No metadata record with a target URI was found')
    }
    const dumpDir = filenamifyURL(site)
    await fs.ensureDir(dumpDir)
    for (const {url, resz} of requestMap) {
      let fn
      try {
        fn = filenamifyURL(url)
      } catch (e) {
        // URLs that cannot be turned into a file name are skipped
        continue
      }
      // NOTE: several responses for one URL with the same extension are
      // written to the same path, so the last one wins.
      for (const res of resz) {
        const headers = res.httpHeaders || {}
        const contentType = headers['Content-Type'] || headers['content-type']
        const ext = contentType ? getFileExt(contentType) : null
        if (ext && res.bodyBuffer) {
          const fp = path.join(dumpDir, `${fn}${ext}`)
          await fs.writeFile(fp, res.bodyBuffer)
        }
      }
    }
  } catch (error) {
    console.error(error)
  }
})

parser.on('error', error => {
  console.error(error)
})

parser.start()

## package.json
{
  "dependencies": {
    "filenamify-url": "^1.0.0",
    "fs-extra": "^6.0.1",
    "node-warc": "^2.0.0"
  }
}

## requirements.txt
warcio
goldfinch
yarl
	"""Not working 100%"""
	import re
	from os.path import basename
	from collections import defaultdict, Counter
	from itertools import zip_longest
	from pathlib import Path
	from typing import DefaultDict, Optional, Union, List

	from goldfinch import validFileName as vfn
	from warcio.archiveiterator import WARCIterator
	from warcio.recordloader import ArcWarcRecord
	from yarl import URL


	def find_warc(start_path: str = ".") -> Optional[Path]:
	    """Return the first entry of ``start_path`` whose name ends with ``.warc``.

	    Entries are examined in the order ``Path.iterdir`` yields them.

	    :param start_path: directory to look in (defaults to the current one)
	    :return: the matching path, or ``None`` when no entry matches
	    """
	    for file in Path(start_path).iterdir():
	        if file.name.endswith(".warc"):
	            return file
	    return None


	def filenamify_url(it) -> str:
	    """Flatten the URL string ``it`` into a single file-name component.

	    ``://`` and every ``/`` are replaced with ``_``; the result is passed to
	    goldfinch's ``validFileName`` (``initCap=False``) and the value it
	    returns is decoded as UTF-8.
	    """
	    return vfn(it.replace("://", "_").replace("/", "_"), initCap=False).decode("utf-8")


	def filenamify_content_type(it) -> str:
	    """Flatten the content-type string ``it`` into a file-name component.

	    Every ``;`` and ``/`` is replaced with ``_``; the result is passed to
	    goldfinch's ``validFileName`` (``initCap=False``) and the value it
	    returns is decoded as UTF-8.
	    """
	    return vfn(it.replace(";", "_").replace("/", "_"), initCap=False).decode("utf-8")


	def get_target_uri(record: ArcWarcRecord) -> str:
	    """Look up the ``WARC-Target-URI`` entry in the record's ``rec_headers``."""
	    return record.rec_headers.get("WARC-Target-URI")


	def get_content_type(record: ArcWarcRecord) -> str:
	    """Look up the ``Content-Type`` entry in the record's ``http_headers``."""
	    return record.http_headers.get("Content-Type")


	def get_method(record: ArcWarcRecord) -> str:
	    """Return the ``protocol`` attribute of the record's ``http_headers``.

	    NOTE(review): for request records this attribute presumably holds the
	    HTTP verb (``is_get`` compares it with ``"GET"``) -- confirm against
	    warcio.
	    """
	    return record.http_headers.protocol


	def is_get(record: ArcWarcRecord) -> bool:
	    """Tell whether the ``protocol`` of the record's ``http_headers`` is exactly ``"GET"``."""
	    return record.http_headers.protocol == "GET"


	def ok(record: ArcWarcRecord) -> bool:
	    """Return ``True`` when the record's HTTP status code is 200.

	    The status code is part of the HTTP status line rather than a header
	    named ``status``, so it is read through ``get_statuscode()`` instead of
	    a header lookup (which practically never matched).  Records without
	    HTTP headers are treated as not OK.
	    """
	    http_headers = record.http_headers
	    if http_headers is None:
	        return False
	    return http_headers.get_statuscode() == "200"


	def get_content(record: ArcWarcRecord) -> str:
	    """Return the result of calling ``read()`` on the record's ``raw_stream``.

	    NOTE(review): the annotation says ``str`` but a binary stream would
	    yield ``bytes`` -- confirm which one callers expect.
	    """
	    return record.raw_stream.read()


	def get_warc_record_id(record: ArcWarcRecord) -> str:
	    """Look up the ``WARC-Record-ID`` entry in the record's ``rec_headers``."""
	    return record.rec_headers.get("WARC-Record-ID")


	def is_html_img_css_js(content_type: str) -> bool:
	    """Tell whether ``content_type`` names HTML, an image, CSS or JavaScript.

	    Matching is by substring and ignores case, because media types are
	    case-insensitive.  A missing content type (``None`` or an empty string)
	    yields ``False`` instead of raising ``TypeError``.
	    """
	    if not content_type:
	        return False
	    lowered = content_type.lower()
	    return any(kind in lowered for kind in ("html", "image", "css", "javascript"))


	def get_file_ext(content_type: str) -> Optional[str]:
	    """Map a content type to the file extension used when dumping it.

	    Matching is by substring and ignores case.  HTML is checked first, then
	    images (only PNG, JPEG and GIF are recognised), then CSS, then
	    JavaScript.

	    :param content_type: value of a ``Content-Type`` header; may be ``None``
	    :return: the extension including the leading dot, or ``None`` when the
	        content type is missing or not one of the recognised kinds
	    """
	    if not content_type:
	        return None
	    lowered = content_type.lower()
	    if "html" in lowered:
	        return ".html"
	    if "image" in lowered:
	        if "png" in lowered:
	            return ".png"
	        if "jpg" in lowered or "jpeg" in lowered:
	            return ".jpg"
	        if "gif" in lowered:
	            return ".gif"
	        # Any other image sub-type has no extension mapping.
	        return None
	    if "css" in lowered:
	        return ".css"
	    if "javascript" in lowered:
	        return ".js"
	    return None


	class Dumper(object):
	    """Write HTML, image, CSS and JavaScript response bodies of a WARC to disk.

	    ``init`` reads the WARC file once, keeping the GET request records and
	    the bodies of the responses accepted by ``ok``.  ``go`` then writes the
	    collected bodies into a dump directory named after the target URI of
	    the WARC's metadata record.
	    """

	    def __init__(self, warc_p: Optional[Union[str, Path]]) -> None:
	        """Remember the WARC path; nothing is read until ``init``/``go``."""
	        self.warc_p: Optional[Union[str, Path]] = warc_p
	        self._dump_dir: Optional[Path] = None
	        self._inited: bool = False
	        # Request records grouped by target URI (collected, not used by ``go``).
	        self._requests: DefaultDict[str, List[ArcWarcRecord]] = defaultdict(list)
	        # ``(response record, body)`` pairs grouped by target URI.
	        self._responses: DefaultDict[str, List[tuple]] = defaultdict(list)
	        # How many times each output file name has already been taken.
	        self._dup_counter = Counter()

	    def init(self):
	        """Read the WARC file and collect its requests and responses.

	        :raises ValueError: when no WARC path was supplied
	        """
	        # Checked before ``Path(...)`` so a missing path produces this
	        # message rather than a ``TypeError`` from ``Path(None)``.
	        if not self.warc_p:
	            raise ValueError("You did not supply a path to a warc file")
	        warc_path = Path(self.warc_p)
	        self.warc_p = warc_path

	        # Start from a clean slate so that re-reading does not duplicate data.
	        self._dump_dir = None
	        self._requests.clear()
	        self._responses.clear()
	        self._dup_counter.clear()

	        with warc_path.open("rb") as warcin:
	            for record in WARCIterator(warcin, ensure_http_headers=True):
	                rect = record.rec_type
	                if rect == "metadata":
	                    url = get_target_uri(record)
	                    if url:
	                        self._dump_dir = Path(filenamify_url(url))
	                elif rect == "request":
	                    if is_get(record):
	                        self._requests[get_target_uri(record)].append(record)
	                elif rect == "response":
	                    if ok(record):
	                        # The body has to be read while the file is still open.
	                        body = record.content_stream().read()
	                        self._responses[get_target_uri(record)].append((record, body))

	        if self._dump_dir is None:
	            # No metadata record named a target URI: fall back to a
	            # directory derived from the WARC file name.
	            self._dump_dir = Path("%s_dump" % warc_path.stem)
	        self._dump_dir.mkdir(parents=True, exist_ok=True)
	        # Only flag success once everything above has worked.
	        self._inited = True

	    def unique_filename_url(self, url: URL, ct: str) -> Optional[Path]:
	        """Build a not-yet-existing path in the dump directory for ``url``.

	        :param url: URL the response was fetched from
	        :param ct: content type of the response, used to pick the extension
	        :return: the path to write to, or ``None`` when the content type has
	            no known extension or the flattened URL is longer than 100
	            characters
	        """
	        ext = get_file_ext(ct)
	        if ext is None:
	            return None
	        fn = filenamify_url(str(url))
	        if len(fn) > 100:
	            return None
	        file = fn if fn.endswith(ext) else "%s%s" % (fn, ext)
	        sp = self._dump_dir / file
	        while sp.exists():
	            # Prefix a running number; ``file`` already carries the
	            # extension, so it is not appended a second time.
	            self._dup_counter[file] += 1
	            sp = self._dump_dir / ("%d__%s" % (self._dup_counter[file], file))
	        return sp

	    def go(self, warc_p: Optional[Union[str, Path]] = None) -> None:
	        """Dump the collected responses, reading the WARC first if needed.

	        :param warc_p: optional WARC path replacing the one given to the
	            constructor; a path different from the one already read
	            triggers a fresh read
	        """
	        if warc_p is not None:
	            if self.warc_p is None or Path(warc_p) != Path(self.warc_p):
	                self._inited = False
	            self.warc_p = warc_p
	        if not self._inited:
	            self.init()
	        for resz in self._responses.values():
	            for res, c in resz:
	                rct = get_content_type(res)
	                if not rct or not is_html_img_css_js(rct):
	                    continue
	                url = URL(get_target_uri(res))
	                if basename(url.path):
	                    # NOTE(review): only URLs whose path has no final
	                    # segment (e.g. "/" or "/dir/") are written; everything
	                    # else is still skipped, as in the original.
	                    continue
	                p = self.unique_filename_url(url, rct)
	                if p is None:
	                    continue
	                with p.open("wb") as out:
	                    out.write(c)


	if __name__ == "__main__":
	    # find_warc() returns None when the current directory holds no *.warc
	    # file; stop with a readable message instead of failing inside Dumper.
	    warc_path = find_warc()
	    if warc_path is None:
	        raise SystemExit("No .warc file found in the current directory")
	    dumper = Dumper(warc_path)
	    dumper.go()
	const Parser = require('node-warc')
	const filenamifyURL = require('filenamify-url')
	const fs = require('fs-extra')
	const path = require('path')

	// The WARC to dump: first command-line argument when given, otherwise the
	// placeholder path that has to be edited by hand.
	const parser = new Parser(process.argv[2] || '<path-to-warcfile>')

	/**
	 * Collects the records of interest from a WARC, keyed by target URI.
	 * Iterating an instance yields `{url, req, resz}` for every stored request
	 * that has at least one stored response.
	 */
	class WARCMap {
	  constructor () {
	    // targetURI -> the request record most recently added for that URI
	    this._requests = new Map()
	    // targetURI -> array of response records in the order they were added
	    this._responses = new Map()
	    // the metadata record, once one has been assigned
	    this.metadata = null
	  }

	  /**
	   * Target URI of the metadata record.
	   * @return {?string} null while no metadata record has been assigned
	   */
	  get site () {
	    return this.metadata ? this.metadata.targetURI : null
	  }

	  /**
	   * Store a request record, replacing any earlier one for the same URI.
	   * @param {Object} record
	   */
	  addRequest (record) {
	    this._requests.set(record.targetURI, record)
	  }

	  /**
	   * Append a response record to the list kept for its target URI.
	   * @param {Object} record
	   */
	  addResponse (record) {
	    let list = this._responses.get(record.targetURI)
	    if (!list) {
	      list = []
	      this._responses.set(record.targetURI, list)
	    }
	    list.push(record)
	  }

	  * [Symbol.iterator] () {
	    for (const [url, req] of this._requests) {
	      if (this._responses.has(url)) {
	        yield {url, req, resz: this._responses.get(url)}
	      }
	    }
	  }
	}

	/**
	 * Map a Content-Type value to the file extension used when dumping it.
	 * Matching is by substring and ignores case; HTML is checked first, then
	 * images (PNG, JPEG and GIF only), then CSS, then JavaScript.
	 * @param {?string} contentType
	 * @return {?string} extension including the dot, or null when the content
	 *   type is missing, not a string, or not one of the recognised kinds
	 */
	function getFileExt (contentType) {
	  if (typeof contentType !== 'string') {
	    return null
	  }
	  const type = contentType.toLowerCase()
	  if (type.includes('html')) {
	    return '.html'
	  }
	  if (type.includes('image')) {
	    if (type.includes('png')) {
	      return '.png'
	    }
	    if (type.includes('jpg') || type.includes('jpeg')) {
	      return '.jpg'
	    }
	    if (type.includes('gif')) {
	      return '.gif'
	    }
	    // any other image sub-type has no extension mapping
	    return null
	  }
	  if (type.includes('css')) {
	    return '.css'
	  }
	  if (type.includes('javascript')) {
	    return '.js'
	  }
	  return null
	}

	// Every record the parser emits is sorted into this map.
	const requestMap = new WARCMap()

	/**
	 * File a parsed record into `requestMap`: responses with status 200, GET
	 * requests, and the metadata record are kept; everything else is ignored.
	 * A missing record (e.g. no final record on 'done') is ignored as well.
	 * @param {?Object} record
	 */
	function addRecord (record) {
	  if (!record) {
	    return
	  }
	  const isGet = typeof record.method === 'string' &&
	    record.method.toLowerCase() === 'get'
	  if (record.type === 'response' && record.statusCode === 200) {
	    requestMap.addResponse(record)
	  } else if (record.type === 'request' && isGet) {
	    requestMap.addRequest(record)
	  } else if (record.type === 'metadata') {
	    requestMap.metadata = record
	  }
	}

	parser.on('record', record => {
	  addRecord(record)
	})

	// Once parsing has finished, write every collected response of a supported
	// content type into a directory named after the archived site.
	parser.on('done', async finalRecord => {
	  // The emitter does not observe the promise returned by an async listener,
	  // so failures are caught and reported here.
	  try {
	    addRecord(finalRecord)
	    const site = requestMap.site
	    if (!site) {
	      throw new Error('No metadata record with a target URI was found')
	    }
	    const dumpDir = filenamifyURL(site)
	    await fs.ensureDir(dumpDir)
	    for (const {url, resz} of requestMap) {
	      let fn
	      try {
	        fn = filenamifyURL(url)
	      } catch (e) {
	        // URLs that cannot be turned into a file name are skipped
	        continue
	      }
	      // NOTE: several responses for one URL with the same extension are
	      // written to the same path, so the last one wins.
	      for (const res of resz) {
	        const headers = res.httpHeaders || {}
	        const contentType = headers['Content-Type'] || headers['content-type']
	        const ext = contentType ? getFileExt(contentType) : null
	        if (ext && res.bodyBuffer) {
	          const fp = path.join(dumpDir, `${fn}${ext}`)
	          await fs.writeFile(fp, res.bodyBuffer)
	        }
	      }
	    }
	  } catch (error) {
	    console.error(error)
	  }
	})

	parser.on('error', error => {
	  console.error(error)
	})

	parser.start()
	{
	  "dependencies": {
	    "filenamify-url": "^1.0.0",
	    "fs-extra": "^6.0.1",
	    "node-warc": "^2.0.0"
	  }
	}