
@johnconroy
Created November 13, 2010 13:44
Crawling & Scraping LOL!!
#goddam I love crawling and scraping.
#I used this script to scrape a particular Twitter user directory to query & acquire a list of Twitter users from Ireland. Great fun.
# It was a small crawl (~1200 pages?) so I don't think they'd get het up about it.
# Their search results in this case came via POST. If they hadn't, I could have used Python's urllib2 library instead, which lets
# you pass GET parameters to a search query.
# ... Looking back at this, it was a ridiculously simple crawl... but I can't seem to find anything slightly tougher :(
# if readlines()[n] contains <div class="result_thumbnail">:
#     scrape readlines()[n+1]
#     dismiss first y chars
#     dismiss everything after the "
#     == twitter screen_name
import urllib, time

filename = "F:\\somedir\\_IREUSERS.txt"

for x in range(1000):  # 1000 pages of results
    if x == 0:
        thispagestr = "http://SPOILER.com/twitter/spoiler/"
    else:
        thispagestr = "http://SPOILER.com/twitter/spoiler/" + str(x + 1)
    sock = urllib.urlopen(thispagestr)
    htmlsource = sock.readlines()
    file1 = open(filename, 'a')
    for n in range(len(htmlsource)):
        if htmlsource[n].rfind('<div class="result_thumbnail">') != -1:
            # print htmlsource[n+1]
            # the screen_name sits on the next line, after 31 chars of markup
            line = htmlsource[n + 1]
            line1 = line[31:]            # dismiss the leading markup
            lastchar = line1.find('"')   # dismiss everything after the closing quote
            name = line1[:lastchar]
            file1.write(name + '\n')     # newline so names don't run together
    file1.close()
    time.sleep(30)  # be polite: pause between pages
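For what it's worth, here is a sketch of the GET-parameter approach the comments mention. In Python 3 the old urllib/urllib2 split is gone: `urllib.request` does the fetching and `urllib.parse.urlencode` builds the query string. The host and parameter names below are placeholders for illustration, not the real directory's API:

```python
# A hedged sketch of passing GET parameters to a paginated search,
# as an alternative to the POST-based results the script scraped.
# "country" and "page" are hypothetical parameter names.
from urllib.parse import urlencode

def page_url(base, page):
    """Build a paginated search URL with GET parameters."""
    params = {"country": "ireland", "page": page}
    return base + "?" + urlencode(params)

url = page_url("http://example.com/twitter/search", 3)
# -> http://example.com/twitter/search?country=ireland&page=3

# Fetching would then just be:
#   import urllib.request
#   html = urllib.request.urlopen(url).read().decode("utf-8", "replace")
```

With GET, each results page is a plain URL, so the whole crawl reduces to looping `page_url` over the page numbers.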