Skip to content

Instantly share code, notes, and snippets.

#!/usr/bin/env python3
from argparse import ArgumentParser
from urllib.parse import urlparse
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
parser = ArgumentParser()
parser.add_argument('-d', '--domain')
@bytearchive
bytearchive / setenv.sh
Created May 14, 2017 09:36 — forked from terrancesnyder/setenv.sh
./setenv.sh - example setenv.sh with defaults set for minimal time spent in garbage collection
#! /bin/sh
# ==================================================================
# ______ __ _____
# /_ __/___ ____ ___ _________ _/ /_ /__ /
# / / / __ \/ __ `__ \/ ___/ __ `/ __/ / /
# / / / /_/ / / / / / / /__/ /_/ / /_ / /
#/_/ \____/_/ /_/ /_/\___/\__,_/\__/ /_/
# Multi-instance Apache Tomcat installation with a focus
# on best-practices as defined by Apache, SpringSource, and MuleSoft
import json
import datetime
import pytz
from random import randint
import logging
import time
import redis
main_prefix = "bqueues:"
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""This module's docstring summary line.
This is a multi-line docstring. Paragraphs are separated with blank lines.
Lines conform to 79-column limit.
Module and packages names should be short, lower_case_with_underscores.
Notice that this in not PEP8-cheatsheet.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""This module's docstring summary line.
This is a multi-line docstring. Paragraphs are separated with blank lines.
Lines conform to 79-column limit.
Module and packages names should be short, lower_case_with_underscores.
Notice that this in not PEP8-cheatsheet.py
@bytearchive
bytearchive / chunk_upload.py
Created May 6, 2017 22:05 — forked from nbari/chunk_upload.py
python chunk upload files
#!/usr/bin/env python
import os
import requests
import uuid
from random import randint
from uuid import uuid4
def read_in_chunks(file_object, chunk_size=65536):
while True:
@bytearchive
bytearchive / es-attach-full.py
Created May 5, 2017 23:44 — forked from stevehanson/es-attach-full.py
Interactive Python script to recursively index files in directory tree to elasticSearch using the elasticsearch-mapper-attachments (https://github.com/elasticsearch/elasticsearch-mapper-attachments) plugin to index files (pdf, docx, html, etc).
import os
import sys
# constants, configure to match your environment
HOST = 'http://localhost:9200'
INDEX = 'test'
TYPE = 'attachment'
TMP_FILE_NAME = 'tmp.json'
# for supported formats, see apache tika - http://tika.apache.org/1.4/formats.html
INDEX_FILE_TYPES = ['html','pdf', 'doc', 'docx', 'xls', 'xlsx', 'xml']
@bytearchive
bytearchive / cmd.sh
Created March 25, 2017 03:32 — forked from kelvinn/cmd.sh
Example of using Apache Bench (ab) to POST JSON to an API
# post_loc.txt contains the json you want to post
# -p means to POST it
# -H adds an Auth header (could be Basic or Token)
# -T sets the Content-Type
# -c is concurrent clients
# -n is the number of requests to run in the test
ab -p post_loc.txt -T application/json -H 'Authorization: Token abcd1234' -c 10 -n 2000 http://example.com/api/v1/locations/
@bytearchive
bytearchive / tmux-cheatsheet.markdown
Created January 2, 2017 19:17 — forked from MohamedAlaa/tmux-cheatsheet.markdown
tmux shortcuts & cheatsheet

tmux shortcuts & cheatsheet

start new:

tmux

start new with session name:

tmux new -s myname
@bytearchive
bytearchive / .gitignore
Created December 26, 2016 07:11 — forked from adamgit/.gitignore
.gitignore file for Xcode4 / OS X Source projects
#########################
# .gitignore file for Xcode4 and Xcode5 Source projects
#
# Apple bugs, waiting for Apple to fix/respond:
#
# 15564624 - what does the xccheckout file in Xcode5 do? Where's the documentation?
#
# Version 2.6
# For latest version, see: http://stackoverflow.com/questions/49478/git-ignore-file-for-xcode-projects
#