Skip to content

Instantly share code, notes, and snippets.

@N0taN3rd
N0taN3rd / chromeCrawling.js
Created May 5, 2017 16:36
Use Chrome to crawl
const CDP = require('chrome-remote-interface')
const Promise = require('bluebird')
const util = require('util')
const path = require('path')
const fs = require('fs-extra')
const filenameify = require('filenamify-url')
const JsDetector = require('./lib/jsDetector')
const detectLib = require('./lib/detectlibs')
const {apps, categories} = fs.readJsonSync('./lib/apps.json')
@N0taN3rd
N0taN3rd / html_rewriter.py
Created April 22, 2017 03:32
pywb proxy mods
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
from six.moves.html_parser import HTMLParser
from six.moves.urllib.parse import urljoin, urlsplit, urlunsplit
@N0taN3rd
N0taN3rd / typescript-snippets.ts
Created June 10, 2021 18:01
Just some TypeScript snippets I have come up with that I would like to use elsewhere
/**
* Extracts the type of the items in type T which is an array/set of type U - we want the type U.
* Aka un-boxes the type U in typescripts generic array T = Array<U>.
* If the supplied type is not an array then that type is returned as is
* See the example for more details
*
* @example
* type ArrayItemType = Unboxed<string[]>
* // ArrayItemType = string
*
@N0taN3rd
N0taN3rd / yt.IA.20170727020348.html
Last active April 13, 2020 17:20
YouTube switched over to Polymer (web components), and Heritrix has trouble recognizing HTML5 elements and web components
<!DOCTYPE html><html lang="en" data-cast-api-enabled="true"><head><script type="text/javascript" src="/static/js/analytics.js?v=1500596387.0" charset="utf-8"></script>
<script type="text/javascript">archive_analytics.values.service='wb';archive_analytics.values.server_name='wwwb-app16.us.archive.org';archive_analytics.values.server_ms=1692;</script><script type="text/javascript" src="/static/js/wbhack.js?v=1500596387.0" charset="utf-8"></script>
<script type="text/javascript">
__wbhack.init('https://web.archive.org/web');
</script>
<link rel="stylesheet" type="text/css" href="/static/css/banner-styles.css?v=1500596387.0" />
<link rel="stylesheet" type="text/css" href="/static/css/iconochive.css?v=1500596387.0" />
@N0taN3rd
N0taN3rd / index.html
Created October 23, 2018 00:13
something
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Still Drinking Gin And Juice</title>
<style>
html, body {
height: 100%;
margin: 0;
padding: 0;
border: 0;
var url = `${location.protocol}/${wbinfo.proxy_magic}/proxy-fetch/https://www.google.com`
var res = await fetch(url)
console.log(await res.text())
@N0taN3rd
N0taN3rd / README.md
Last active August 19, 2018 13:37
Demo Puppeteer And Node.js Chrome Headless Control For Alexander Nwala

Requires

Node.js download

Usage

  1. npm install
  2. npm run go

Other Info

See the comments in the JS file, and have fun!

@N0taN3rd
N0taN3rd / gevent-import.txt
Created July 9, 2018 22:49
PYTHONPATH=<path to pywb dir>pywb/ python -v -u pywb/apps/cli.py --live &> out.txt
# /home/john/PycharmProjects/pywb/.venv/lib/python3.6/site-packages/gevent/__pycache__/__init__.cpython-36.pyc matches /home/john/PycharmProjects/pywb/.venv/lib/python3.6/site-packages/gevent/__init__.py
# code object from '/home/john/PycharmProjects/pywb/.venv/lib/python3.6/site-packages/gevent/__pycache__/__init__.cpython-36.pyc'
# /home/john/PycharmProjects/pywb/.venv/lib/python3.6/site-packages/gevent/__pycache__/_config.cpython-36.pyc matches /home/john/PycharmProjects/pywb/.venv/lib/python3.6/site-packages/gevent/_config.py
# code object from '/home/john/PycharmProjects/pywb/.venv/lib/python3.6/site-packages/gevent/__pycache__/_config.cpython-36.pyc'
# /home/john/PycharmProjects/pywb/.venv/lib/python3.6/site-packages/gevent/__pycache__/_compat.cpython-36.pyc matches /home/john/PycharmProjects/pywb/.venv/lib/python3.6/site-packages/gevent/_compat.py
# code object from '/home/john/PycharmProjects/pywb/.venv/lib/python3.6/site-packages/gevent/__pycache__/_compat.cpython-36.pyc'
import 'gevent._compat' # <_
@N0taN3rd
N0taN3rd / dumpWarc.js
Created June 18, 2018 18:00
Dump warc
const Parser = require('node-warc')
const filenamifyURL = require('filenamify-url')
const fs = require('fs-extra')
const path = require('path')
const parser = new Parser('<path-to-warcfile>')
class WARCMap {
constructor () {
this._requests = new Map()
from pywb.warcserver.index.cdxobject import CDXObject
def read_cdxj(path):
with open(path, 'rb') as cdxjin:
for line in cdxjin:
cdx = CDXObject(line)
if 'html' in cdx.get('mime') and "200" == cdx.get('status'):
print(cdx.get('url'))