This is a list of shell-command usages I can't live without on UNIX-based systems.
Using Homebrew (yes, I am opinionated) you can install the tools these snippets depend on:
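For instance, wget and GNU awk are the external tools most of the snippets below lean on, so a minimal setup would be something like:

brew install wget gawk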
#!/bin/bash
# Rename the output html file from redditPostArchiver with the reddit thread title.
# https://github.com/sJohnsonStoever/redditPostArchiver

for f in *.html;
do
  title=$( awk 'BEGIN{IGNORECASE=1;FS="<title>|</title>";RS=EOF} {print $2}' "$f" )
  mv -i "$f" "${title//[^a-zA-Z0-9\._\- ]}_$f"
done
require 'forwardable'
module RtmpMeta
  class Parser
    PATTERN = /duration\s+(?<duration>\d+\.?\d+)$/
    attr_reader :raw_data
    def initialize raw_data
      @raw_data = raw_data
    end
    # e.g. expose the duration (in seconds) captured by PATTERN from the raw rtmp metadata
    def duration
      raw_data[PATTERN, 'duration'].to_f
    end
  end
end
#!/usr/bin/env python
#
# requires: https://github.com/richardasaurus/mega.py
#
import os
import sys
from mega import Mega

mega = Mega({'verbose': True})
m = mega.login('megauseremail', 'megapass')
# e.g. upload the file passed on the command line (assumes mega.py's upload() call)
m.upload(sys.argv[1])
from scrapy import log
from scrapy.item import Item
from scrapy.http import Request
from scrapy.contrib.spiders import XMLFeedSpider

def NextURL():
    """
    Generate a list of URLs to crawl. You can query a database or come up with some other means.
    Note that if you generate URLs to crawl from a scraped URL then you're better off using a
    spider callback instead.
    """
    # placeholder body: yield each URL to crawl, e.g. from a database query
    for url in ['http://example.com/feed-1.xml']:
        yield url
#!/bin/bash
# simple function to check the HTTP response code before downloading a remote file
# example usage:
# if validate_url "$url" >/dev/null; then dosomething; else echo "does not exist"; fi

function validate_url(){
  if [[ $(wget -S --spider "$1" 2>&1 | grep 'HTTP/1.1 200 OK') ]]; then
    echo "true"
  else
    return 1
  fi
}
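A usage sketch built on that function; the urls.txt input file and downloads/ directory are just assumptions for illustration:

# check each URL listed in urls.txt (one per line) before downloading it
while read -r url; do
  if validate_url "$url" >/dev/null; then
    wget -P downloads/ "$url"
  else
    echo "skipping $url: not reachable"
  fi
done < urls.txt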
#!/bin/bash
HOME="http://www.yourdomain.com/some/page"
DOMAINS="yourdomain.com"
DEPTH=2
OUTPUT="./urls.csv"

wget -r --spider --delete-after --force-html -D "$DOMAINS" -l $DEPTH "$HOME" 2>&1 \
  | grep '^--' | awk '{ print $3 }' | grep -v '\.\(css\|js\|png\|gif\|jpg\)$' | sort | uniq > "$OUTPUT"
# crawl a site without saving anything, logging every request and response header to wget.log
# (robots.txt ignored, 3 seconds between requests)
wget --spider -o wget.log -e robots=off --wait 3 -r -p -S http://

# build a flat list of the crawled URLs from that log, skipping static assets
grep -ri 'http://' wget.log | grep -E -v '(files/|\.jpg|\.jpeg|\.gif|\.css|\.js|\.pdf|\.png|\.xls)' | awk '{print $3}' | sort | uniq > site_map.txt

# count unique client IPs behind yandsearch (Yandex search) requests in an access log ($1),
# URL-decoding each line and excluding static assets, bots, 404s and redirects
cat "$1" | grep -i -E -v '(\.jpg|\.jpeg|\.gif|\.css|\.js|\.pdf|\.png|\.xls|\.ico|\.txt|\.doc|yandexbot|googlebot|YandexDirect|\/upload\/|" 404 |" 301 |" 302 )' | perl -MURI::Escape -lne 'print uri_unescape($_)' | grep yandsearch | awk '{print $1}' | sort | uniq | wc -l
From the wget FAQ: http://addictivecode.org/FrequentlyAskedQuestions

To spider a site as a logged-in user:

1. post the form data (_every_ input with a name in the form, even if it doesn't have a value) required to log in (--post-data).
2. save the cookies that get generated (--save-cookies), including session cookies (--keep-session-cookies), which are not saved when --save-cookies alone is specified.
3. load the cookies (--load-cookies), continue saving the session cookies, and recursively (-r) spider (--spider) the site, ignoring (-R) /logout (sketched after the login command below).

# log in and save the cookies
wget --post-data='username=my_username&password=my_password&next=' --save-cookies=cookies.txt --keep-session-cookies https://foobar.com/login
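A sketch of the follow-up command from step 3; the foobar.com site and the logout path are assumptions carried over from the login example:

# reuse the saved cookies, keep updating them, and spider the whole site while rejecting the logout link
wget --load-cookies=cookies.txt --save-cookies=cookies.txt --keep-session-cookies -r --spider -R logout https://foobar.com/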
DOMAIN="m.bbc.co.uk"
SERVICE="hindi"
HTTP_USER_AGENT="Mozilla/5.0 (iPhone; Mobile; AppleWebKit; Safari)"
EXCLUDE_EXTENSIONS="\.\(txt\|css\|js\|png\|gif\|jpg\)$"
MAX_DEPTH="3"
# assumption for the truncated rest of this command: crawl http://$DOMAIN/$SERVICE with the given
# user agent and list the crawled URLs minus static assets, as in the urls.csv example above
wget --spider --no-directories --no-parent --force-html --recursive \
  --level=$MAX_DEPTH --no-clobber \
  --user-agent="$HTTP_USER_AGENT" --domains="$DOMAIN" \
  "http://$DOMAIN/$SERVICE" 2>&1 \
  | grep '^--' | awk '{ print $3 }' | grep -v "$EXCLUDE_EXTENSIONS" | sort | uniq