Skip to content

Instantly share code, notes, and snippets.

@metasyn
Created October 6, 2014 00:21
Show Gist options
  • Save metasyn/bd29ce37060a76dcae16 to your computer and use it in GitHub Desktop.
Save metasyn/bd29ce37060a76dcae16 to your computer and use it in GitHub Desktop.
splunk_app_scraper.py
# Splunk App Information Scraper
# Note, it also scrapes TAs
# Alexander Johnson
# xander@splunk.com
# @metasyn
# requires lxml
import re
from lxml import html
import requests
import json
def get_app_url_numbers(estimate, limit):
    """
    Probe https://apps.splunk.com/app/<n> pages by brute force and collect
    the numbers that respond OK.

    :params:
    estimate
        put in the number of apps you want returned (int)
    limit
        put the highest url app number you're willing to search through (int)
    :returns:
    a list of numbers that represent the url number in
    https://apps.splunk.com/app/#
    """
    valid_app_numbers = []
    for i in range(limit):
        # Stop as soon as we have gathered enough valid numbers; the
        # original re-ran the whole scan and appended duplicates here.
        if len(valid_app_numbers) > estimate:
            break
        page = requests.get("https://apps.splunk.com/app/" + str(i))
        if page.ok:
            valid_app_numbers.append(i)
            print("APP @ ", i)
        elif i % 100 == 0:
            # Coarse progress indicator while scanning misses.
            print(i)
    return valid_app_numbers
## So I couldn't really think of a better way to do this, since I had no idea how far apart
## they were spaced. So this just queries each page with a GET, and if it's OK, then we
## save the page number. I've attached a text list of the current valid pages up to 5000
def get_information(app_number):
    """
    Scrape one Splunk app page and dump its metadata to
    "<app_number>_data.json".

    :params:
    app_number
        the url number of the app page (int or str)
    :returns:
    dict of scraped fields (empty when required fields were missing)
    """
    # get page
    page = requests.get('https://apps.splunk.com/app/' + str(app_number))
    tree = html.fromstring(page.text)
    # get info -- the regex lookbehinds anchor on the icon/tooltip markup
    # that precedes each value in the page HTML
    name = tree.xpath('//h1[@class="app-title ns-textfill clearfix"]/text()')
    ratings = tree.xpath('//span[@class="review-count"]/text()')
    downloads = re.findall(r'(?<="Downloads"></i>)\d{1,3},?\d{0,3}', page.text)
    compatability = re.findall(r'(?<=title="Splunk Compatibility" data-placement="bottom"></i>).+(?=<)', page.text)
    platform = re.findall(r'(?<=title="Platforms" data-placement="bottom"></i>).+(?=<)', page.text)
    license = re.findall(r'(?<=title="License"></i>).+(?=<)', page.text)
    app_data = {}
    # The try block must cover the parsing below: the original wrapped only
    # the dict literal, so the IndexError from ratings[0] escaped uncaught
    # (and the handler referenced an undefined 'invalid_numbers' global).
    try:
        # de duplicate, unlist, strip whitespace, convert to float
        version_bucket = []
        for version in set(compatability):
            if len(version) > 1:
                version_bucket.extend(version.split(','))
            else:
                version_bucket.append(version)
        compatability = [float(v.strip()) for v in version_bucket]
        # ratings text starts with the review count, e.g. "12 ratings"
        ratings = int(ratings[0][:2].strip())
        app_data = {
            "name": name,
            "ratings": ratings,
            "downloads": downloads,
            "platform": list(set(platform)),
            "license": license,
            "compatability": compatability}
    except (IndexError, ValueError):
        print(str(app_number) + " missing fields!")
    # str() so int app numbers work too; 'with' guarantees the file closes
    with open(str(app_number) + "_data.json", 'w') as out:
        json.dump(app_data, out)
    return app_data
def serialize_to_json(app_list_file):
    """
    Scrape every app listed in app_list_file and serialize each app's data
    to its own json file (done inside get_information).

    :params:
    app_list_file
        path to a text file of whitespace-separated app url numbers (str)
    :returns:
    dict mapping app number -> scraped data dict
    """
    # 'with' guarantees the list file is closed after reading
    with open(app_list_file, 'r') as f:
        app_list = f.read().split()
    app_count = len(app_list)
    app_dict = {}
    for progress, app_num in enumerate(app_list, start=1):
        app_dict[app_num] = get_information(app_num)
        print("Scraping app {} of {}, {:.2%} done".format(
            progress, app_count, float(progress) / app_count))
    return app_dict
if __name__ == "__main__":
    # Guard the scrape so importing this module doesn't kick off network I/O.
    serialize_to_json("valid_app_numbers.txt")
27 29 30 174 201 202 217 226 230 237 240 241 245 263 267 272 273 277 278 279 281 282 285 286 287 288 291 294 295 300 312 314 316 321 324 327 334 335 337 339 340 343 344 345 352 355 357 359 362 366 367 368 370 379 388 389 392 396 407 409 413 421 430 431 436 442 453 455 456 463 466 481 483 485 486 487 489 491 493 494 505 506 507 510 511 512 513 525 527 528 529 533 535 554 557 558 562 568 572 573 574 576 581 585 586 587 588 599 610 612 613 624 626 627 633 635 639 647 656 662 664 668 669 671 679 685 687 689 690 698 700 702 703 705 706 711 715 725 726 729 734 736 738 742 746 748 749 760 767 770 773 777 779 786 796 802 808 812 814 815 816 817 819 822 826 829 830 833 836 837 838 839 842 843 844 847 851 852 858 870 872 874 880 884 891 900 905 907 911 920 925 926 929 932 933 936 949 950 952 953 954 955 956 958 964 968 970 972 976 978 980 981 994 995 1000 1004 1007 1009 1016 1020 1021 1025 1033 1035 1037 1042 1051 1052 1059 1063 1072 1075 1082 1088 1090 1093 1095 1097 1099 1107 1111 1121 1125 1126 1127 1129 1131 1134 1136 1137 1140 1142 1143 1145 1147 1149 1151 1155 1161 1162 1167 1170 1173 1177 1179 1180 1182 1186 1193 1195 1197 1199 1201 1207 1209 1214 1217 1220 1223 1224 1225 1228 1241 1249 1253 1254 1257 1259 1263 1265 1267 1274 1280 1282 1289 1292 1293 1294 1297 1306 1307 1310 1311 1314 1315 1316 1317 1318 1324 1327 1340 1343 1346 1348 1351 1352 1353 1355 1365 1366 1376 1383 1386 1397 1401 1421 1423 1425 1429 1436 1438 1441 1444 1445 1448 1449 1453 1454 1457 1460 1463 1464 1465 1466 1467 1469 1470 1471 1473 1476 1477 1486 1493 1494 1498 1515 1517 1520 1521 1524 1525 1527 1528 1532 1535 1536 1537 1538 1539 1540 1541 1542 1544 1545 1546 1547 1548 1549 1551 1553 1554 1558 1559 1560 1561 1562 1563 1564 1567 1574 1577 1578 1579 1580 1581 1584 1585 1586 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1617 1618 1619 1620 1621 1622 1623 1624 1625 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 
1639 1640 1645 1653 1654 1657 1658 1659 1660 1661 1663 1664 1665 1666 1667 1679 1680 1701 1702 1703 1704 1705 1706 1707 1708 1710 1711 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1741 1742 1743 1744 1747 1749 1750 1751 1752 1753 1755 1757 1758 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1775 1776 1777 1779 1780 1781 1783 1789 1790 1791 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1817 1818 1819 1820 1822 1823 1824 1825 1826 1827 1830 1831 1832 1833 1835 1836 1837 1838 1839 1840 1842 1843 1844 1845 1847 1848 1849 1850 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1866 1867 1868 1869 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880
@metasyn
Copy link
Author

metasyn commented Oct 6, 2014

I realized after the fact that lxml is really unnecessary in this script - it seemed helpful at first but then I gave up and went back to regex. But lxml is still cool!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment