Dane Macaulay (danemacaulay), public gists
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import scipy as sp

posts = pd.read_csv('posts.csv')

# Bag-of-words vectorizer; binary=False keeps raw term counts
vectorizer = CountVectorizer(binary=False)
# Regression target
y = posts['score'].values.astype(np.float32)
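The preview cuts off before the vectorizer is fit; a minimal sketch of that next step, assuming a hypothetical text column named 'body':

X = vectorizer.fit_transform(posts['body'])  # 'body' is a guess at the text column
print(X.shape, y.shape)  # X is a scipy sparse count matrix; y is float32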
danemacaulay / element-selector.js
Last active March 15, 2020 10:33
get unique css selector by element
'use strict';

// Concatenate the selector fragments that extractorFunction derives
// from each of the node's attributes
function attributeIterator(node, extractorFunction) {
  var selector = '';
  var attributes = node.attributes;
  for (var i = 0; i < attributes.length; i++) {
    var attr = attributes[i];
    var attrSelector = extractorFunction(attr);
    if (attrSelector) {
      selector += attrSelector;
    }
  }
  return selector;
}
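The extractorFunction decides what each attribute contributes: for example, a callback that returns '#' + attr.value for id attributes and nothing otherwise would reduce the node to an id selector, and the concatenated fragments form the element's unique selector.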
from functools import wraps
import time
import inspect

def timing(f):
    # Decorator that logs how long each call to f takes
    @wraps(f)
    def wrapper(*args, **kwargs):
        module = inspect.getmodule(f).__name__
        start = time.time()
        result = f(*args, **kwargs)
        print('{}.{} took {:.3f}s'.format(module, f.__name__, time.time() - start))
        return result
    return wrapper
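A usage sketch for the decorator (the function and sleep are illustrative):

@timing
def slow_add(a, b):
    time.sleep(0.2)
    return a + b

slow_add(1, 2)  # prints something like: __main__.slow_add took 0.200s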
danemacaulay / proxy.js
Created December 17, 2018 03:48
node proxy
// Usage: node proxy.js [port] [targetPort] [staticDir]
const port = process.argv[2] || 7979
const targetPort = process.argv[3] || 5000
const path = process.argv[4] || 'client'

const express = require('express')
const proxy = require('http-proxy-middleware')
const app = express()

// Serve the static front end and forward /services/* to the backend
app.use(express.static(path))
app.use('/services', proxy({target: `http://0.0.0.0:${targetPort}`, changeOrigin: true}))
app.listen(port, () => console.log(`Listening at http://localhost:${port}`))
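Run as, for example, node proxy.js 8080 5000 client. Note this uses the pre-1.0 http-proxy-middleware API, which exported the middleware factory directly; from version 1.0 the equivalent is the named createProxyMiddleware export.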
// Return the running totals of a numeric list, e.g. [1, 2, 3] -> [1, 3, 6]
function getAccumulatedList(list) {
  return list.reduce((accumulator, item) => {
    accumulator.count += item
    accumulator.acclist.push(accumulator.count)
    return accumulator
  }, {count: 0, acclist: []}).acclist
}
danemacaulay / scheduler.py
Last active November 30, 2018 22:03
python background scheduler
import time
import threading
import datetime

class Scheduler(object):
    def __init__(self, hours, job):
        # total_seconds() rather than .seconds, which wraps at 24 hours
        self.interval = datetime.timedelta(hours=hours).total_seconds()
        self.job = job
        thread = threading.Thread(target=self.run, args=())
        thread.daemon = True  # do not keep the process alive for this thread
        thread.start()

    def run(self):
        while True:
            self.job()
            time.sleep(self.interval)
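A usage sketch for the class above (the job is illustrative); since the worker is a daemon thread, the caller has to keep the main thread alive:

def heartbeat():
    print('tick', datetime.datetime.now())

Scheduler(hours=1, job=heartbeat)
while True:
    time.sleep(60)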
import ast
from Crypto.Cipher import PKCS1_OAEP
from Crypto.PublicKey import RSA

with open('cert/referral', 'rb') as f:
    key_text = f.read()
privkey = RSA.importKey(key_text)
publickey = privkey.publickey()
encryptor = PKCS1_OAEP.new(publickey)  # public key encrypts
decryptor = PKCS1_OAEP.new(privkey)    # private key decrypts

def encrypt(msg):
    return encryptor.encrypt(msg)

def decrypt(ciphertext):
    # ast.literal_eval recovers ciphertext that was stored as a bytes repr
    if isinstance(ciphertext, str):
        ciphertext = ast.literal_eval(ciphertext)
    return decryptor.decrypt(ciphertext)
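A round-trip sketch with the helpers above (the payload is illustrative; OAEP caps the message length well below the RSA modulus size):

token = encrypt(b'referral-12345')
assert decrypt(token) == b'referral-12345'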
danemacaulay / indexer.py
Last active January 4, 2018 19:11
Stream through remote common crawl index file to search for WARC entries by URL
import sys
import requests
import zlib
import json
from urllib.parse import urlparse
from collections import Counter

# Index file path inside the Common Crawl S3 bucket, passed on the command line
path = sys.argv[1]
url = 'https://commoncrawl.s3.amazonaws.com/{}'.format(path)

# Target: Google Maps place pages
google_netloc = 'www.google.com'
google_path = '/maps/place'
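The preview ends before the download loop; a minimal sketch of the streaming search, assuming the standard "urlkey timestamp {json}" line format and that the .gz index is a chain of concatenated gzip members (the Counter import suggests the original also tallied matches, which is not reconstructed here):

decomp = zlib.decompressobj(zlib.MAX_WBITS | 32)  # tolerate the gzip framing
buffer = b''
for chunk in requests.get(url, stream=True).iter_content(chunk_size=2 ** 16):
    while chunk:
        buffer += decomp.decompress(chunk)
        chunk = decomp.unused_data  # a new gzip member starts here
        if chunk:
            decomp = zlib.decompressobj(zlib.MAX_WBITS | 32)
    lines = buffer.split(b'\n')
    buffer = lines.pop()  # keep a trailing partial line for the next chunk
    for line in lines:
        if not line:
            continue
        record = json.loads(line.split(b' ', 2)[2])
        parsed = urlparse(record['url'])
        if parsed.netloc == google_netloc and parsed.path.startswith(google_path):
            print(json.dumps(record))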
danemacaulay / warc_fetcher.py
Created January 4, 2018 15:19
Direct to STDOUT all warc data on a particular domain using index.commoncrawl.org
import gzip
import json
import requests
from io import BytesIO  # io.BytesIO replaces the Python 2 StringIO module

def get_page_count(searchString):
    # Ask the index server how many result pages a query will produce
    url = 'http://index.commoncrawl.org/CC-MAIN-2017-51-index?url={}&output=json&showNumPages=true'.format(searchString)
    resp = requests.get(url)
    return json.loads(resp.content)['pages']
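The preview stops at the page count; a sketch of the remaining steps, assuming the CDX records carry the usual offset/length/filename fields and using a hypothetical search pattern:

def fetch_warc_slice(record):
    # Range request for one gzipped record inside the large WARC file
    start = int(record['offset'])
    end = start + int(record['length']) - 1
    resp = requests.get(
        'https://commoncrawl.s3.amazonaws.com/' + record['filename'],
        headers={'Range': 'bytes={}-{}'.format(start, end)})
    return gzip.GzipFile(fileobj=BytesIO(resp.content)).read()

search = 'example.com/*'  # hypothetical domain pattern
for page in range(get_page_count(search)):
    page_url = ('http://index.commoncrawl.org/CC-MAIN-2017-51-index'
                '?url={}&output=json&page={}'.format(search, page))
    for line in requests.get(page_url).iter_lines():
        print(fetch_warc_slice(json.loads(line)).decode('utf-8', 'replace'))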
// Collect the distinct phone numbers that libphonenumber finds in `source`
Set<String> phones = new HashSet<>();
PhoneNumberUtil util = PhoneNumberUtil.getInstance();
// Passing null as the default region means only numbers written in
// international (+country code) format are matched
Iterator<PhoneNumberMatch> iterator = util.findNumbers(source, null).iterator();
while (iterator.hasNext()) {
    phones.add(iterator.next().rawString());
}
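findNumbers returns an Iterable<PhoneNumberMatch>, so the explicit iterator could also be written as a for-each loop; rawString() keeps each number exactly as it appeared in the source text.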