This is a list of shell-command usages I can't live without on UNIX-based systems.
Using Homebrew (yes, I am opinionated) you can install the tools these snippets depend on:
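For instance, wget and GNU awk are the external tools most of the snippets below lean on, so a minimal setup would be something like:

brew install wget gawk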
#!/bin/bash
# Rename the output html file from redditPostArchiver with the reddit thread title.
# https://github.com/sJohnsonStoever/redditPostArchiver

for f in *.html;
do
  title=$( awk 'BEGIN{IGNORECASE=1;FS="<title>|</title>";RS=EOF} {print $2}' "$f" )
  mv -i "$f" "${title//[^a-zA-Z0-9\._\- ]}_$f"
done
require 'forwardable'
module RtmpMeta
  class Parser
    PATTERN = /duration\s+(?<duration>\d+\.?\d+)$/
    attr_reader :raw_data
    def initialize raw_data
      @raw_data = raw_data
    end
    # e.g. expose the duration (in seconds) captured by PATTERN from the raw rtmp metadata
    def duration
      raw_data[PATTERN, 'duration'].to_f
    end
  end
end
#!/usr/bin/env python
#
# requires: https://github.com/richardasaurus/mega.py
#
import os
import sys
from mega import Mega

mega = Mega({'verbose': True})
m = mega.login('megauseremail', 'megapass')
# e.g. upload the file passed on the command line (assumes mega.py's upload() call)
m.upload(sys.argv[1])
from scrapy import log
from scrapy.item import Item
from scrapy.http import Request
from scrapy.contrib.spiders import XMLFeedSpider

def NextURL():
    """
    Generate a list of URLs to crawl. You can query a database or come up with some other means.
    Note that if you generate URLs to crawl from a scraped URL then you're better off using a
    spider callback instead.
    """
    # placeholder body: yield each URL to crawl, e.g. from a database query
    for url in ['http://example.com/feed-1.xml']:
        yield url
#!/bin/bash
# simple function to check the HTTP response code before downloading a remote file
# example usage:
# if validate_url "$url" >/dev/null; then dosomething; else echo "does not exist"; fi

function validate_url(){
  if [[ $(wget -S --spider "$1" 2>&1 | grep 'HTTP/1.1 200 OK') ]]; then
    echo "true"
  else
    return 1
  fi
}
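A usage sketch built on that function; the urls.txt input file and downloads/ directory are just assumptions for illustration:

# check each URL listed in urls.txt (one per line) before downloading it
while read -r url; do
  if validate_url "$url" >/dev/null; then
    wget -P downloads/ "$url"
  else
    echo "skipping $url: not reachable"
  fi
done < urls.txt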
#!/bin/bash
HOME="http://www.yourdomain.com/some/page"
DOMAINS="yourdomain.com"
DEPTH=2
OUTPUT="./urls.csv"

wget -r --spider --delete-after --force-html -D "$DOMAINS" -l $DEPTH "$HOME" 2>&1 \
  | grep '^--' | awk '{ print $3 }' | grep -v '\.\(css\|js\|png\|gif\|jpg\)$' | sort | uniq > "$OUTPUT"
# crawl a site without saving anything, logging every request and response header to wget.log
# (robots.txt ignored, 3 seconds between requests)
wget --spider -o wget.log -e robots=off --wait 3 -r -p -S http://

# build a flat list of the crawled URLs from that log, skipping static assets
grep -ri 'http://' wget.log | grep -E -v '(files/|\.jpg|\.jpeg|\.gif|\.css|\.js|\.pdf|\.png|\.xls)' | awk '{print $3}' | sort | uniq > site_map.txt

# count unique client IPs behind yandsearch (Yandex search) requests in an access log ($1),
# URL-decoding each line and excluding static assets, bots, 404s and redirects
cat "$1" | grep -i -E -v '(\.jpg|\.jpeg|\.gif|\.css|\.js|\.pdf|\.png|\.xls|\.ico|\.txt|\.doc|yandexbot|googlebot|YandexDirect|\/upload\/|" 404 |" 301 |" 302 )' | perl -MURI::Escape -lne 'print uri_unescape($_)' | grep yandsearch | awk '{print $1}' | sort | uniq | wc -l
From the wget FAQ: http://addictivecode.org/FrequentlyAskedQuestions

To spider a site as a logged-in user:

1. post the form data (_every_ input with a name in the form, even if it doesn't have a value) required to log in (--post-data).
2. save the cookies that get generated (--save-cookies), including session cookies (--keep-session-cookies), which are not saved when --save-cookies alone is specified.
3. load the cookies (--load-cookies), continue saving the session cookies, and recursively (-r) spider (--spider) the site, ignoring (-R) /logout (sketched after the login command below).

# log in and save the cookies
wget --post-data='username=my_username&password=my_password&next=' --save-cookies=cookies.txt --keep-session-cookies https://foobar.com/login
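A sketch of the follow-up command from step 3; the foobar.com site and the logout path are assumptions carried over from the login example:

# reuse the saved cookies, keep updating them, and spider the whole site while rejecting the logout link
wget --load-cookies=cookies.txt --save-cookies=cookies.txt --keep-session-cookies -r --spider -R logout https://foobar.com/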
DOMAIN="m.bbc.co.uk"
SERVICE="hindi"
HTTP_USER_AGENT="Mozilla/5.0 (iPhone; Mobile; AppleWebKit; Safari)"
EXCLUDE_EXTENSIONS="\.\(txt\|css\|js\|png\|gif\|jpg\)$"
MAX_DEPTH="3"
# assumption for the truncated rest of this command: crawl http://$DOMAIN/$SERVICE with the given
# user agent and list the crawled URLs minus static assets, as in the urls.csv example above
wget --spider --no-directories --no-parent --force-html --recursive \
  --level=$MAX_DEPTH --no-clobber \
  --user-agent="$HTTP_USER_AGENT" --domains="$DOMAIN" \
  "http://$DOMAIN/$SERVICE" 2>&1 \
  | grep '^--' | awk '{ print $3 }' | grep -v "$EXCLUDE_EXTENSIONS" | sort | uniq