Skip to content

Instantly share code, notes, and snippets.

@ankitagarwal248
Last active August 29, 2015 14:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ankitagarwal248/25473bcd7f58cdac8007 to your computer and use it in GitHub Desktop.
Save ankitagarwal248/25473bcd7f58cdac8007 to your computer and use it in GitHub Desktop.
import socks
import urllib2
import requests
from Queue import Queue
from threading import Thread
import httplib
import time
# Article URLs downloaded by this script; each entry is placed on the work
# queue and fetched by one of the worker threads.
links = ["http://timesofindia.indiatimes.com/india/India-to-fortify-defence-along-China-border-54-new-ITBP-posts-being-planned-in-Arunachal/articleshow/36353494.cms",
"http://timesofindia.indiatimes.com/entertainment/hindi/music/music-reviews/Breaking-Bad/articleshow/39370864.cms",
"http://www.thehindu.com/news/national/other-states/at-least-2-russian-tourists-killed-as-bus-falls-into-river-in-uttarakhand/article6101106.ece",
"http://profit.ndtv.com/news/corporates/article-bank-of-india-seeks-rs-2-000-crore-fund-infusion-from-government-547596",
"http://doctor.ndtv.com/topicdetails/ndtv/tid/540/Body_Mass_Index_(BMI).html",
"http://health.sify.com/types-of-diabetes-in-india/",
"http://healthyliving.msn.com/nutrition/make-exercise-fun-eat-less-afterwards",
"https://in.lifestyle.yahoo.com/chitrangada-shares-her-secrets.html",
"http://www.ndtv.com/article/india/sgpc-row-punjab-chief-minister-says-will-obey-akal-takht-diktat-565781?pfrom=home-otherstories",
"http://elle.in/fashion/news/jodi-launches-today/",
"http://www.businessworld.in/news/after-hours/fitness/fusion-fitness/1457319/page-1.html",
"http://economictimes.indiatimes.com/industry/banking/finance/finance/piramal-enterprises-gets-a-partner-in-hollands-apg-for-1-billion-infrastructure-joint-venture/articleshow/39258788.cms",
"http://indiatoday.intoday.in/story/natwar-singh-interview-rajiv-gandhi-sri-lanka-ltte-prabhakaran/1/374978.html",
"http://www.cseblog.com/2014/08/caterers-problem.html",
"http://www.espncricinfo.com/england-v-india-2014/content/current/story/769823.html",
"http://www.deccanherald.com/content/428195/george-clooney-most-gracefully-ageing.html",
"http://www.indiaparenting.com/article-display.php?id=2947&cid=121&c=pregnancy&a=complaints-by-trimester"
]
class SocksiPyConnection(httplib.HTTPConnection):
    """HTTPConnection whose underlying socket is a SocksiPy ``socksocket``.

    The six proxy settings (``proxytype``, ``proxyaddr``, ``proxyport``,
    ``rdns``, ``username``, ``password``) are stored and handed unchanged to
    ``socksocket.setproxy()`` every time ``connect()`` runs.  All remaining
    positional and keyword arguments are forwarded to
    ``httplib.HTTPConnection.__init__``.
    """

    def __init__(self, proxytype, proxyaddr, proxyport = None, rdns = True, username = None, password = None, *args, **kwargs):
        self.proxyargs = (proxytype, proxyaddr, proxyport, rdns, username, password)
        httplib.HTTPConnection.__init__(self, *args, **kwargs)

    def connect(self):
        """Open the proxied connection to ``(self.host, self.port)``.

        Any exception raised while configuring or connecting the socket is
        re-raised after the socket has been closed.
        """
        sock = socks.socksocket()
        try:
            sock.setproxy(*self.proxyargs)
            # Integer timeouts (e.g. ``timeout=10``) must be honoured as well;
            # checking for ``float`` alone silently dropped them.  Non-numeric
            # values (None, or httplib's "use the global default" marker)
            # leave the socket's timeout untouched.
            if isinstance(self.timeout, (int, float)):
                sock.settimeout(self.timeout)
            sock.connect((self.host, self.port))
        except Exception:
            # Do not leave a half-configured socket behind on failure.
            sock.close()
            raise
        # Publish the socket only once it is actually connected.
        self.sock = sock
class SocksiPyHandler(urllib2.HTTPHandler):
    """urllib2 handler that opens HTTP requests via ``SocksiPyConnection``.

    The constructor arguments are remembered and replayed as the leading
    arguments of every ``SocksiPyConnection`` this handler creates.

    NOTE(review): only ``http_open`` is overridden here, so ``https://``
    requests presumably do not go through this handler -- confirm.
    """

    def __init__(self, *args, **kwargs):
        # Keep the proxy settings for later; the base class takes none of them.
        self.args = args
        self.kw = kwargs
        urllib2.HTTPHandler.__init__(self)

    def http_open(self, req):
        """Open ``req`` using a connection factory bound to the proxy settings."""

        def make_connection(host, port=None, strict=None, timeout=0):
            # Called by do_open() in place of httplib.HTTPConnection.
            return SocksiPyConnection(*self.args, host=host, port=port, strict=strict, timeout=timeout, **self.kw)

        return self.do_open(make_connection, req)
def fetch_links(q, port):
    """Worker: download queued URLs through the local SOCKS4 proxy on ``port``.

    URLs are taken from ``q`` until it is empty.  Each page body is written to
    a ``.txt`` file whose name is the URL with ``/``, ``:`` and ``.`` replaced
    by ``-``.  Successful URLs are appended to the module-level
    ``completed_urls`` list, failed ones to ``error_urls``.

    Args:
        q: Queue of URL strings shared by all workers.
        port: Port of the SOCKS4 proxy listening on 127.0.0.1.

    Returns:
        None, once the queue has been drained.
    """
    # Function-scope import so this block does not depend on the file header.
    from Queue import Empty

    # Built outside the per-URL error handling: if the opener cannot be
    # created nothing has been dequeued yet, so the URLs stay available for
    # the other workers.
    opener = urllib2.build_opener(
        SocksiPyHandler(socks.PROXY_TYPE_SOCKS4, "127.0.0.1", port, True))

    while True:
        # Non-blocking get: checking q.empty() and then calling the blocking
        # q.get() is racy -- another worker can take the last item in
        # between, leaving this thread (and the caller's join()) stuck.
        try:
            url = q.get_nowait()
        except Empty:
            return

        try:
            response = opener.open(url)
            try:
                data = response.read()
            finally:
                response.close()

            print(port)
            print(url[0:20])
            print("----------")

            file_name = url.replace('/', '-').replace(':', '-').replace('.', '-')
            filepath = '/home/ankit/python/tor/urldata/' + file_name + '.txt'
            # The context manager closes the file even if write() fails.
            with open(filepath, "w") as data_file:
                data_file.write(data)
            completed_urls.append(url)
        except Exception:
            # Best effort: record the failure and move on to the next URL.
            print("error")
            error_urls.append(url)
        finally:
            # Balance every successful get, on success and failure alike.
            q.task_done()

        # Pause between consecutive requests made through this proxy.
        time.sleep(1.5)
# --- Driver: queue every link and fan the downloads out over worker threads ---
num_workers = 30     # number of worker threads
base_port = 9050     # worker i is given proxy port base_port + i
num_urls = 1000      # NOTE(review): not referenced anywhere in the visible code

workers = []
completed_urls = []  # appended to by fetch_links on success
error_urls = []      # appended to by fetch_links on failure

q = Queue(maxsize=0)
for x in links:
    q.put(x)

for i in range(num_workers):
    new_port = base_port + i
    worker = Thread(target=fetch_links, args=(q, new_port))
    # Set the daemon flag itself.  Assigning to ``worker.setDaemon`` merely
    # replaced that method with a bool and left the thread non-daemonic.
    worker.daemon = True
    workers.append(worker)

# Start every worker first, then wait for all of them to finish.
for w in workers:
    w.start()
for w in workers:
    w.join()

print("complited urls")
print(completed_urls)
print("error_urls")
print(error_urls)
#!/bin/bash
# Launch a pool of Tor daemons.  Instance i gets SOCKS port
# base_socks_port+i, control port base_control_port+i, PID file tor<i>.pid
# and data directory data/tor<i>.
base_socks_port=9050
base_control_port=8118

# Create data directory if it doesn't exist
mkdir -p "data"

# Instances 0..80; lower the upper bound to start a smaller pool.
for i in {0..80}
do
    socks_port=$((base_socks_port+i))
    control_port=$((base_control_port+i))
    data_dir="data/tor$i"

    if [ ! -d "$data_dir" ]; then
        echo "Creating directory $data_dir"
        mkdir -p "$data_dir"
    fi

    # Take into account that authentication for the control port is disabled. Must be used in secure and controlled environments
    echo "Running: tor --RunAsDaemon 1 --CookieAuthentication 0 --HashedControlPassword \"\" --ControlPort $control_port --PidFile tor$i.pid --SocksPort $socks_port --DataDirectory $data_dir"
    # Report a failed launch instead of silently moving on to the next one.
    tor --RunAsDaemon 1 --CookieAuthentication 0 --HashedControlPassword "" --ControlPort "$control_port" --PidFile "tor$i.pid" --SocksPort "$socks_port" --DataDirectory "$data_dir" \
        || echo "Failed to start tor instance $i" >&2
done
@ankitagarwal248
Copy link
Author

Install Tor, then start it by running: "sudo /etc/init.d/tor start"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment