Skip to content

Instantly share code, notes, and snippets.

@mertonium
Created May 5, 2011 16:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mertonium/957397 to your computer and use it in GitHub Desktop.
Save mertonium/957397 to your computer and use it in GitHub Desktop.
This script cycles through the Philly311 Knowledgebase, devouring FAQs.
#!/usr/bin/env ruby
###############################################################################
# Philly Open311 Knowledgebase Scraper
###############################################################################
# Standard library
require 'csv' # the library file is lowercase; `require "CSV"` fails on case-sensitive filesystems
require 'net/http'
require 'uri'

# Gems
require 'nokogiri'
require 'sanitize'
# Performs a single blocking GET request and hands back the raw response.
#
# No redirect following and no status checking happens here; the caller is
# expected to inspect the response code itself.
#
# @param urlStr [String] absolute http:// or https:// URL to fetch
# @return [Net::HTTPResponse] the response object returned by Net::HTTP
def getUrl(urlStr)
  uri = URI.parse(urlStr)
  http = Net::HTTP.new(uri.host, uri.port)
  # Net::HTTP.new never enables TLS on its own; without this an https:// URL
  # would be sent as plain HTTP to port 443.
  http.use_ssl = (uri.scheme == 'https')
  http.request(Net::HTTP::Get.new(uri.request_uri))
end
# Accumulator for the scraped rows; each entry becomes one CSV line.
g = []

# This giant list of IDs represents all of the page ids that actually have pages on the site.
# I basically tried to open every page from 1 to 10,000 and these were the pages that
# returned a 200. Each id is listed once so that no page is scraped twice.
ids = [
  506, 517, 645, 646, 647, 648, 649, 650, 651, 652, 654, 655, 656, 662, 665, 666, 668, 669, 672, 675,
  682, 683, 686, 687, 688, 689, 690, 691, 693, 694, 696, 697, 699, 700, 702, 703, 704, 708, 711, 712,
  715, 717, 719, 720, 721, 722, 723, 726, 727, 728, 729, 731, 732, 733, 734, 735, 736, 738, 739, 740,
  742, 744, 746, 750, 753, 758, 763, 765, 767, 768, 770, 772, 773, 774, 775, 776, 777, 778, 779, 780,
  781, 782, 784, 785, 786, 790, 792, 793, 794, 795, 796, 797, 798, 799, 803, 806, 807, 808, 810, 815,
  817, 824, 827, 830, 832, 834, 836, 838, 839, 846, 848, 849, 851, 852, 857, 865, 866, 869, 870, 871,
  872, 898, 900, 904, 908, 909, 912, 913, 914, 916, 919, 921, 922, 923, 928, 929, 930, 932, 935, 936,
  937, 939, 940, 941, 943, 944, 945, 946, 947, 948, 950, 951, 952, 954, 955, 956, 957, 958, 959, 960,
  961, 962, 963, 964, 965, 966, 967, 968, 970, 972, 973, 974, 975, 979, 981, 983, 987, 990, 991, 993,
  994, 995, 996, 997, 1000, 1001, 1002, 1003, 1004, 1005, 1011, 1013, 1014, 1015, 1016, 1018, 1019,
  1020, 1021, 1022, 1023, 1024, 1025, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037,
  1039, 1040, 1041, 1042, 1043, 1044, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1057,
  1058, 1059, 1061, 1068, 1069, 1078, 1079, 1080, 1084, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093,
  1097, 1098, 1099, 1102, 1103, 1104, 1105, 1106, 1108, 1109, 1120, 1143, 1144, 1146, 1147, 1154, 1155,
  1157, 1163, 1164, 1165, 1170, 1173, 1178, 1181, 1185, 1197, 1198, 1199, 1200, 1202, 1203, 1204, 1205,
  1206, 1207, 1243, 1278, 1286, 1299, 1303, 1306, 1311, 1315, 1329, 1330, 1332, 1338, 1339, 1340, 1341,
  1343, 1344, 1345, 1346, 1347, 1348, 1353, 1355, 1375, 1382, 1383, 1384, 1386, 1387, 1390, 1394, 1395,
  1396, 1397, 1398, 1399, 1400, 1401, 1402, 1403, 1404, 1407, 1413, 1414, 1415, 1422, 1423, 1424, 1425,
  1427, 1428, 1431, 1436, 1440, 1441, 1442, 1444, 1445, 1447, 1449, 1450, 1451, 1452, 1453, 1454, 1455,
  1456, 1458, 1461, 1462, 1463, 1464, 1467, 1469, 1470, 1472, 1473, 1474, 1475, 1476, 1477, 1478, 1479,
  1480, 1483, 1484, 1485, 1487, 1488, 1491, 1492, 1494, 1495, 1497, 1498, 1499, 1500, 1508, 1510, 1511,
  1512, 1513, 1515, 1517, 1526, 1538, 1541, 1542, 1543, 1546, 1561, 1576, 1577, 1578, 1579, 1580, 1581,
  1582, 1584, 1585, 1594, 1595, 1596, 1597, 1599, 1600, 1601, 1603, 1604, 1606, 1608, 1610, 1611, 1612,
  1613, 1614, 1616, 1617, 1618, 1619, 1621, 1623, 1641, 1647, 1661, 1662, 1663, 1664, 1665, 1666, 1667,
  1668, 1669, 1670, 1675, 1685, 1696, 1697, 1701, 1703, 1709, 1710, 1732, 1734, 1752, 1774, 1788, 1794,
  1797, 1808, 1815, 1818, 1820, 1821, 1823, 1824, 1825, 1826, 1834, 1844, 1847, 1848, 1849, 1921, 1922,
  1923, 1924, 1926, 1927, 1939, 1940, 1941, 1943, 1944, 1945, 1947, 1949, 1951, 1952, 1953, 1954, 1958,
  1961, 1965, 1988, 2002, 2008, 2017, 2018, 2021, 2022, 2025, 2026, 2027, 2031, 2032, 2033, 2034, 2044,
  2045, 2046, 2047, 2049, 2050, 2052, 2056, 2062, 2063, 2064, 2066, 2067, 2068, 2069, 2070, 2071, 2073,
  2074, 2075, 2076, 2077, 2086, 2103, 2140, 2141, 2143, 2144, 2162, 2164, 2169, 2170, 2172, 2188, 2189,
  2190, 2199, 2200, 2202, 2204, 2206, 2207, 2208, 2210, 2212, 2229, 2239, 2249, 2280, 2289, 2297, 2298,
  2299, 2305, 2314, 2335, 2336, 2343, 2348, 2356, 2366, 2376, 2377, 2378, 2379, 2380, 2381, 2382, 2383,
  2384, 2385, 2388, 2404, 2405, 2406, 2407, 2409, 2410, 2413, 2414, 2421, 2443, 2444, 2447, 2448, 2463,
  2464, 2465, 2467, 2468, 2470, 2474, 2479, 2480, 2487, 2493, 2496, 2497, 2514, 2516, 2517, 2538, 2564,
  2565, 2582, 2593, 2730, 2742, 2754, 2755, 2762, 2832, 2964, 3063, 3064, 3072, 3216, 3416, 3547, 4009,
  4177, 4469, 4472, 4476, 4526, 4540, 4554, 4637, 4640, 4723, 4742, 4782, 4787, 4794, 4835, 4874, 4877,
  4931, 4964, 4986, 5587, 5596, 5625, 5644, 5648, 5671, 5672, 5675, 5680, 5732, 5734, 5829, 5833, 5853,
  5855, 5965, 5976, 5977, 6178, 6216, 6219, 6221, 6222, 6229, 6231, 6243, 6265, 6268, 6270, 6290, 6296,
  6297, 6353, 6393, 6394, 6405, 6415, 6464, 6486, 6489, 6651, 6668, 6693
]
# Trims the text and collapses every run of whitespace into a single space.
def squish_whitespace(text)
  text.strip.gsub(/\s+/, ' ')
end

# Fetch every known knowledge-base page and append one row per FAQ entry to `g`.
# Row layout: [page id, question or page title, answer text,
#              text of the td.articledata cell, page URL].
#
# For a quick trial run, iterate over a small range instead of `ids`,
# e.g. (5976..5980).each do |i|
ids.each do |i|
  puts i
  # retrieve a page
  base_url = "http://philly311.phila.gov/default.asp?SID=&Lang=1&id=#{i}"
  begin
    resp = getUrl(base_url)
  rescue StandardError => e
    # One unreachable page must not abort the run and discard every row
    # collected so far (the CSV is only written after this loop finishes).
    warn "skipping #{i}: #{e.class}: #{e.message}"
    next
  end
  next unless resp.code == '200'

  puts 'hit!'
  doc = Nokogiri::HTML(resp.body)
  # Looked up once per page instead of once per emitted row.
  mod_date = doc.css('td.articledata').inner_text

  # Prefer the short title cell and fall back to the long one. A page with
  # neither gets an empty title rather than raising NoMethodError on nil.
  title_cells = doc.css('td.shortTitle')
  title_cells = doc.css('td.longTitle') if title_cells.empty?
  title_node = title_cells.children.first
  title = title_node ? title_node.inner_text : ''

  # Strip all markup except links, keeping only safe link protocols.
  content = Sanitize.clean(
    doc.css('table#Table5 td.content').inner_html,
    :elements => ['a'],
    :attributes => { 'a' => ['href', 'title'] },
    :protocols => { 'a' => { 'href' => ['http', 'https', 'mailto'] } }
  )

  if content.include?('?')
    # The page holds one or more questions: split it into question/answer
    # pairs. The page title is discarded in favour of each question, and any
    # text before the first question is dropped.
    faqs = content.split("\n")
    puts faqs.size
    title = ''
    faqs.each do |line|
      if line.include?('?') # There is a question here
        # Flush the previous pair before starting a new one. `title` is only
        # non-empty once a question has been seen.
        g << [i, title, squish_whitespace(content), mod_date, base_url] unless title.empty?
        # Everything up to the last '?' is the question; slice! leaves the
        # remainder of the line, which starts the answer.
        title = squish_whitespace(line.slice!(0..line.rindex('?')))
        content = squish_whitespace(line) + ' '
      else # No question, just mushing together the answers
        content << squish_whitespace(line) << ' '
      end
    end
  end

  # Emit the last (or only) entry for this page; answers are normalised the
  # same way as the rows flushed inside the loop above.
  g << [i, title, squish_whitespace(content), mod_date, base_url] unless content.empty?
end
# Dump every collected row to philly_scraper.csv with all fields quoted.
# The option is passed as a trailing keyword rather than a braced hash:
# Ruby 3 no longer converts a positional hash into keyword arguments, so the
# braced form raises ArgumentError there.
CSV.open('philly_scraper.csv', 'w', :force_quotes => true) do |csv|
  # Header row; the second column was previously misspelled "qustion".
  csv << ['id', 'question', 'answer', 'mod_date', 'url']
  g.each { |row| csv << row }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment