johnconroy / tfidf java
Created September 27, 2011 14:59
Term Frequency-Inverse Document Frequency (TF-IDF) in Java
package tfidf;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
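The listing cuts off after the imports. As a rough sketch of what a TF-IDF computation looks like (in Python for brevity; the toy corpus, tokenisation, and log base are assumptions, not taken from the Java source):
import math

def tf_idf(docs):
    #docs: list of token lists; returns {(doc_index, term): weight}
    df = {}
    for doc in docs:
        for term in set(doc):
            df[term] = df.get(term, 0) + 1  #document frequency
    weights = {}
    for i, doc in enumerate(docs):
        for term in set(doc):
            tf = doc.count(term) / float(len(doc))       #term frequency
            idf = math.log(len(docs) / float(df[term]))  #inverse document frequency
            weights[(i, term)] = tf * idf
    return weights

print(tf_idf([["the", "cat", "sat"], ["the", "dog", "sat", "sat"]]))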
johnconroy / pagerank_twitter_users.py
Created November 13, 2010 13:56
Apply the PageRank link-authority algorithm to a list of Twitter users, as a first-pass measure of user authority
#pagerank calculation for a list of Twitter users
#implementing this was a bit of a head-scratcher...
f1="C:\\somedir\\list_of_edges_between_usrs.txt" #our social graph, as an edge-list
file1=open(f1,'r')
edges=file1.readlines()
l_nodes,l_outd,nodes2=[],[],[] #node list, out-degree per node, working copy of the node list
l_score, l_score_new, l_out_score=[],[],[] #current scores, next-iteration scores, score shared along out-links
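The snippet stops at the setup; the iteration itself is the PageRank power method. A minimal sketch over the same kind of edge list (the damping factor 0.85 and the dangling-node handling are my assumptions, not read from the gist):
def pagerank(edges, damping=0.85, iters=50):
    #edges: list of (src, dst) pairs, e.g. follower -> followee
    nodes = set()
    out_links = {}
    for src, dst in edges:
        nodes.update((src, dst))
        out_links.setdefault(src, []).append(dst)
    n = len(nodes)
    score = dict.fromkeys(nodes, 1.0 / n)
    for _ in range(iters):
        new = dict.fromkeys(nodes, (1.0 - damping) / n)
        for src in nodes:
            targets = out_links.get(src)
            if targets:
                for dst in targets:
                    new[dst] += damping * score[src] / len(targets)
            else:
                for dst in nodes:  #dangling node: spread its score evenly
                    new[dst] += damping * score[src] / n
        score = new
    return score

print(pagerank([("alice", "bob"), ("carol", "bob"), ("bob", "alice")]))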
johnconroy / resolve_bitly_urls.py
Created November 13, 2010 13:50
Resolving bit.ly URLs to their originals using the bit.ly API
# resolving urls shortened with Bit.ly back to their original form
# Bit.ly has a beautiful, simple API for this. And the Bitly API wrapper is simple to use.
# I did this for analysing which domains in a large list were the most popular.
import bitly #bitly API wrapper
import time
import random
fr="C:\\somedir\\bitlyurls.txt"
johnconroy / crawl_n_scrape_search_results.py
Created November 13, 2010 13:44
Crawling & Scraping LOL!!
#goddam I love crawling and scraping.
#I used this script to crawl a particular Twitter user directory and scrape out a list of Twitter users from Ireland. Great fun.
# It was a small crawl (1200 pages?) so I don't think they'd get het up about it.
# Their search results in this case came back via POST. If they hadn't, I could've used Python's urllib2 library instead, which allows
# you to pass GET parameters to a search query.
# ... Looking back, this was a ridiculously simple crawl... but I can't seem to find anything slightly tougher :(
#if readlines()[n] contains <div class="result_thumbnail">:
#scrape readlines()[n+1]
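Fleshing out that pseudocode roughly (the endpoint URL, the form fields, and the line-after-the-marker heuristic are all assumptions for illustration):
import urllib.parse
import urllib.request

url = "http://example.com/user_directory/search"  #hypothetical endpoint
form = urllib.parse.urlencode({"location": "Ireland", "page": "1"}).encode()
resp = urllib.request.urlopen(url, form)  #supplying a body makes this a POST
lines = resp.read().decode("utf-8", "replace").splitlines()

users = []
for n in range(len(lines) - 1):
    if '<div class="result_thumbnail">' in lines[n]:
        users.append(lines[n + 1].strip())  #scrape the line after the marker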
johnconroy / scrape_twitter_timeline.py
Created November 13, 2010 13:34
Query the Twitter Public Timeline ~every couple minutes. For each user listed, save their status and location.
#Query the Twitter Public Timeline ~every couple minutes. For each user listed, save their status and location.
#I used this to examine what % of Twitter users provide resolvable locations, and potentially to analyse
#those locations (I had an idea I might try passing them through the Yahoo Geolocation API to get a
#country-by-country breakdown, but in the end I didn't)
import twitter
import time
#set up files
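The snippet stops at the file setup; continuing from the imports above, the polling loop with the python-twitter wrapper of that era would have looked roughly like this (Twitter has since retired the public-timeline endpoint, so treat this strictly as a period sketch):
api = twitter.Api()  #the public timeline needed no credentials at the time
out = open("statuses.txt", "a")
for _ in range(10):  #poll ten times, a couple of minutes apart
    for status in api.GetPublicTimeline():
        out.write("%s\t%s\t%s\n" %
                  (status.user.screen_name, status.user.location, status.text))
    time.sleep(120)
out.close()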
johnconroy / check_if_twitter_user_posted_in_last_7_days.py
Created November 13, 2010 13:24
Script to check if a Twitter user posted in the previous week.
#script to see whether a user is 'active' during the previous 7 days
#I had a large list of Twitter users. I wanted to see if they remained 'active' from one week to the next
# active means they posted a message at least once during the previous 7 days
#Requires DeWitt Clinton's "Python-Twitter" API wrapper
# some nice date-handling stuff in here
# In a separate script, I iterated through a list of Twitter users, passing each in turn into this script
import twitter
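The listing cuts off after the import; the check reduces to comparing the user's most recent status timestamp against a week ago. A sketch continuing from the import above, assuming the era's wrapper (its Status objects exposed created_at_in_seconds, though the GetUserTimeline signature varied across versions):
import time  #for the current-time comparison

WEEK = 7 * 24 * 60 * 60  #seconds in a week

def active_in_last_week(api, screen_name):
    #api is a twitter.Api instance; fetch the most recent post only
    statuses = api.GetUserTimeline(screen_name, count=1)
    if not statuses:
        return False  #never posted
    return time.time() - statuses[0].created_at_in_seconds < WEEK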
johnconroy / iterate_thru_dic_csv_extract.py
Created November 8, 2010 12:07
Iterate through a directory of CSVs in Python, find unique elements
#assume we have a bunch of files in a directory
#each file is csv... comma separated, e.g. each has a bunch of lines like this: "henry_1, 9939393, 02/11/1991, ARTS2bc1, ..."
#assume the first comma-separated value on each line in each file is the name
#Assume we want a list of unique names in all the files in the directory
import os
basedir="C:\\somedir\\somesubdir\\" #escaped backslashes, as in the other scripts
for root, dirs, files in os.walk(basedir): #walks the directory tree; 'files' lists the file names in each folder
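The walk is cut off above; collecting the unique first fields might finish roughly like this (a sketch: the set handles uniqueness, and restricting to .csv files is my assumption about the directory):
names = set()
for root, dirs, files in os.walk(basedir):
    for fname in files:
        if not fname.endswith(".csv"):
            continue
        for line in open(os.path.join(root, fname)):
            if line.strip():
                names.add(line.split(",")[0].strip())  #first comma-separated value
print(sorted(names))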