Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A simple Python script, designed to extract all public project details (as JSON) from a GitHub user profile by scraping its web pages. Made with ❤️
#!/usr/bin/python3
from functools import reduce
from itertools import chain
from json import dump
from os.path import dirname, abspath, join
from sys import argv
from typing import Optional
from urllib.parse import urljoin

from dateutil.parser import parse
from requests import get
try:
from bs4 import BeautifulSoup
except ImportError as e:
print('[!]Module Unavailable : {}'.format(str(e)))
exit(1)
def extractIt(dataObject, username: str, targetClass: str = 'col-10 col-lg-9 d-inline-block'):
    '''
    Extracts details of every repository listed on a GitHub profile's
    `repositories` tab, as an iterable of ( name, details ) pairs, which is
    to be converted to Dict[String, Any], before writing to JSON file

    A profile may have so many repositories that the listing is paginated,
    so pagination links are followed ( each page downloaded using `fetchIt` )
    until no further page is advertised

    dataObject : parsed ( BeautifulSoup ) first page of repository listing
    username : GitHub user id being scraped; not needed while parsing, kept
        so that existing callers keep working
    targetClass : `class` attribute value identifying one repository entry,
        applied to every page visited

    Returns lazily evaluated iterable of ( name, details ) tuples, or None
    when a follow-up page couldn't be downloaded
    '''
    baseURL = 'https://github.com'
    titleAttrs = {'class': 'd-inline-block mb-1'}
    metaAttrs = {'class': 'f6 text-gray mt-2'}
    # raised when an optional piece of markup is absent or malformed; each
    # finder then falls back to an empty value instead of aborting the scrape
    recoverable = (AttributeError, IndexError, KeyError,
                   TypeError, ValueError, OverflowError)

    def __findName__(element) -> str:
        # name is used as key of resulting dictionary, so it's mandatory:
        # no fallback here, a missing title still raises ( as it always did )
        return element.find('div', titleAttrs).h3.a.text.strip()

    def __findURL__(element) -> str:
        try:
            return urljoin(baseURL, element.find('div', titleAttrs).h3.a.get('href'))
        except recoverable:
            return ''

    def __findDescription__(element) -> str:
        try:
            return element.find('p', {'class': 'col-9 d-inline-block text-gray mb-2 pr-4'}).text.strip()
        except recoverable:
            return ''

    def __findLanguage__(element) -> str:
        try:
            return element.find('div', metaAttrs).find('span', {'itemprop': 'programmingLanguage'}).text.strip()
        except recoverable:
            return ''

    def __findLanguageColor__(element) -> str:
        try:
            colorHolder = element.find('div', metaAttrs).find(
                'span', {'class': 'ml-0 mr-3'}).find('span', {'class': 'repo-language-color'})
            # inline style looks like `property: value`, value is what we need
            return colorHolder.get('style').split(':')[1].strip()
        except recoverable:
            return ''

    def __sumCounts__(element, hrefSuffix: str) -> int:
        # adds up numbers shown by links whose target ends with given suffix;
        # thousands separators ( `1,234` ) are dropped before conversion,
        # otherwise int() rejects them & count gets reported as 0
        try:
            links = element.find('div', metaAttrs).find_all(
                'a', {'class': 'muted-link mr-3'})
            return sum(int(link.text.strip().replace(',', ''))
                       for link in links if link.get('href').endswith(hrefSuffix))
        except recoverable:
            return 0

    def __findForkCount__(element) -> int:
        return __sumCounts__(element, 'members')

    def __findStargazerCount__(element) -> int:
        return __sumCounts__(element, 'stargazers')

    def __findLastUpdated__(element) -> float:
        # timestamp in seconds from epoch, designating last time this
        # repository was updated; 0.0 when it can't be determined
        try:
            stamp = element.find('div', metaAttrs).find(
                'relative-time').get('datetime')
            return parse(stamp).timestamp()
        except recoverable:
            return 0.0

    def __findLicense(element) -> str:
        # name of License scheme being used for this project,
        # empty string in case of error
        try:
            spans = element.find('div', metaAttrs).find_all(
                'span', {'class': 'mr-3'})
            # only spans whose class is exactly `mr-3` are considered
            return ''.join(span.text.strip() for span in spans
                           if ''.join(span.get('class')) == 'mr-3')
        except recoverable:
            return ''

    def __checkIfForked__(element) -> str:
        # URL of source repository if this one is a fork, else empty string
        try:
            return urljoin(baseURL, element.find('div', titleAttrs).find(
                'span', {'class': 'f6 text-gray mb-1'}).find('a', {'class': 'muted-link'}).get('href'))
        except recoverable:
            return ''

    def __extractFromThisPage__(page):
        # entries are located right away, their details are computed lazily
        return ((__findName__(e), {
            'url': __findURL__(e),
            'description': __findDescription__(e),
            'lang': __findLanguage__(e),
            'langColor': __findLanguageColor__(e),
            'fork': __findForkCount__(e),
            'star': __findStargazerCount__(e),
            'updated': __findLastUpdated__(e),
            'license': __findLicense(e),
            'forkedFrom': __checkIfForked__(e)}) for e in page.find_all('div', {'class': targetClass}))

    def __findNextPage__(page) -> str:
        # absolute URL of next page of listing, empty string on last page
        pager = page.find('div', {'class': 'paginate-container'})
        if pager is None or pager.div is None:
            return ''
        links = pager.div.find_all('a')
        if len(links) == 2:
            # two links present : previous page, followed by next page
            target = links[1]
        elif links and links[0].text.strip().lower() == 'next':
            target = links[0]
        else:
            # no link at all ( single page ) or only a link to previous page
            return ''
        href = target.get('href')
        return urljoin(baseURL, href) if href else ''

    pages = []
    visited = set()
    page = dataObject
    while True:
        pages.append(__extractFromThisPage__(page))
        nextURL = __findNextPage__(page)
        # second condition guards against pagination links forming a cycle
        if not nextURL or nextURL in visited:
            break
        visited.add(nextURL)
        markup = fetchIt(nextURL)
        if markup is None:
            # an incomplete listing is reported as failure, never as success
            return None
        page = BeautifulSoup(markup, features='html.parser')
    return chain.from_iterable(pages)
def fetchIt(url: str, timeout: float = 30.0) -> Optional[bytes]:
    '''
    Fetches content of target webpage, which will be parsed using some `html` parser

    url : address of page to download
    timeout : seconds to wait for the server before giving up, so that an
        unresponsive host can't block the script forever

    Returns raw response body ( bytes ) when server answers with HTTP 200,
    None for any other status code or when the request itself fails
    '''
    try:
        resp = get(url, timeout=timeout)
    except Exception:
        # best-effort by design: any failure while requesting is reported
        # to the caller as `no content`
        return None
    # status codes are compared by value; identity ( `is` ) on an int only
    # works by accident of interpreter's small integer caching
    return resp.content if resp.status_code == 200 else None
def app(username: str, target_file: str = abspath(join(dirname(__file__), 'projects.json'))) -> bool:
    '''
    Main entry point of script, calls required methods in proper order
    to finally write extracted data into target_file as JSON string

    username : GitHub username ( mandatory ), this is the account which our
        crawler is going to scrape through, for extracting projects information
    target_file : path of JSON file to be written, defaults to `projects.json`
        sitting next to this script

    Returns True when target_file was written, False otherwise
    '''
    try:
        markup = fetchIt(
            'https://github.com/{}?tab=repositories'.format(username))
        if markup is None:
            return False
        extracted = extractIt(BeautifulSoup(
            markup, features='html.parser'), username=username)
        if extracted is None:
            return False
        # scraping is completed before target file gets opened, so a failed
        # run never truncates result of a previous successful one
        projects = dict(extracted)
        # explicit encoding, because non-ASCII characters are written as is
        # ( ensure_ascii=False ) & platform default may not be able to encode them
        with open(target_file, 'w', encoding='utf-8') as fd:
            dump(projects, fd, ensure_ascii=False, indent=4)
        return True
    except Exception:
        # top level boundary of script: any failure is reported as False
        return False
def getScriptName() -> str:
    '''
    Gives back name & version of this script, wrapped in ANSI escape
    sequences, to be printed as a banner on terminal
    '''
    banner = '\x1b[1;6;33;48mextractZ v0.1.0\x1b[0m'
    return banner
if __name__ == '__main__':
    # exit status mirrors the outcome, so that shell scripts can rely on it:
    # 0 only when projects were written, 1 for failure, wrong usage or interruption
    status = 1
    try:
        print(getScriptName())
        if len(argv) == 2:
            succeeded = app(argv[1])
            print('\nSuccess' if succeeded else '\nFailure')
            status = 0 if succeeded else 1
        else:
            print('\n[+]Usage : {} GithubUserId\n\n:)'.format(argv[0]))
    except KeyboardInterrupt:
        print('\n[!]Terminated')
    # SystemExit is raised directly, `exit` helper is only injected by the
    # `site` module & isn't guaranteed to be available
    raise SystemExit(status)
{
"indian-railway": {
"url": "https://github.com/itzmeanjan/indian-railway",
"description": "Exploring Indian Railways time table dataset, with ❤️",
"lang": "Python",
"langColor": "#3572A5",
"fork": 0,
"star": 0,
"updated": 1572017180.0,
"license": "MIT License",
"forkedFrom": ""
},
"corporateZ": {
"url": "https://github.com/itzmeanjan/corporateZ",
"description": "Data analysis done on Ministry of Corporate Affairs, Govt. of India's open data to get deeper insight, with ❤️",
"lang": "Python",
"langColor": "#3572A5",
"fork": 0,
"star": 3,
"updated": 1571987670.0,
"license": "MIT License",
"forkedFrom": ""
},
"chanalyze": {
"url": "https://github.com/itzmeanjan/chanalyze",
"description": "A simple WhatsApp Chat Analyzer ( for both Private & Group chats ), made with ❤️",
"lang": "Python",
"langColor": "#3572A5",
"fork": 1,
"star": 10,
"updated": 1571834452.0,
"license": "MIT License",
"forkedFrom": ""
},
"airQ": {
"url": "https://github.com/itzmeanjan/airQ",
"description": "A near real time Air Quality Indicator, written in Julia with ❤️",
"lang": "Julia",
"langColor": "#a270ba",
"fork": 0,
"star": 4,
"updated": 1570598115.0,
"license": "MIT License",
"forkedFrom": ""
},
"itzmeanjan.github.io": {
"url": "https://github.com/itzmeanjan/itzmeanjan.github.io",
"description": "A website",
"lang": "HTML",
"langColor": "#e34c26",
"fork": 0,
"star": 0,
"updated": 1570550803.0,
"license": "MIT License",
"forkedFrom": ""
},
"airQ-insight": {
"url": "https://github.com/itzmeanjan/airQ-insight",
"description": "Works on Air Quality Indication Dataset collected by airQ to give deeper insight, written with ❤️ using Python",
"lang": "Python",
"langColor": "#3572A5",
"fork": 0,
"star": 1,
"updated": 1569291604.0,
"license": "MIT License",
"forkedFrom": ""
},
"pynotif": {
"url": "https://github.com/itzmeanjan/pynotif",
"description": "A very simple to use Python API for accessing native notification service on Linux",
"lang": "C",
"langColor": "#555555",
"fork": 0,
"star": 0,
"updated": 1569249700.0,
"license": "MIT License",
"forkedFrom": ""
},
"streamZ": {
"url": "https://github.com/itzmeanjan/streamZ",
"description": "A simple video streaming application made with Dart, JavaScript, HTML, CSS & ❤️",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 1,
"star": 18,
"updated": 1568991471.0,
"license": "MIT License",
"forkedFrom": ""
},
"itzmeanjan.in": {
"url": "https://github.com/itzmeanjan/itzmeanjan.in",
"description": "A simple to follow guide, for deploying your website on Amazon LightSail",
"lang": "HTML",
"langColor": "#e34c26",
"fork": 0,
"star": 0,
"updated": 1567519427.0,
"license": "MIT License",
"forkedFrom": ""
},
"vb_notice_reader": {
"url": "https://github.com/itzmeanjan/vb_notice_reader",
"description": "A simple App for fetching and displaying parsed notices from Visva-Bharati, Santiniketan's Official Website, made with ❤️ using Flutter",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 0,
"star": 1,
"updated": 1563198079.0,
"license": "MIT License",
"forkedFrom": ""
},
"osm": {
"url": "https://github.com/itzmeanjan/osm",
"description": "A flutter package, helps you to integrate Open Street Map within your flutter app. Built with <3",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 0,
"star": 1,
"updated": 1563197998.0,
"license": "MIT License",
"forkedFrom": ""
},
"weatherz-desktop": {
"url": "https://github.com/itzmeanjan/weatherz-desktop",
"description": "A simple JavaScript Desktop Application for fetching and displaying Weather Data of almost 6.36M places along with Maps 😉",
"lang": "JavaScript",
"langColor": "#f1e05a",
"fork": 0,
"star": 1,
"updated": 1563127928.0,
"license": "MIT License",
"forkedFrom": ""
},
"countryAndWeather": {
"url": "https://github.com/itzmeanjan/countryAndWeather",
"description": "A collection of scripts for grabbing and processing detailed Country/ Place Record & Weather Data of 6.36M places, written in Python with ❤️",
"lang": "Python",
"langColor": "#3572A5",
"fork": 1,
"star": 4,
"updated": 1562823161.0,
"license": "MIT License",
"forkedFrom": ""
},
"locatorz": {
"url": "https://github.com/itzmeanjan/locatorz",
"description": "An Android App, which lets you work on Location Data, built with ❤️ using Flutter",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 7,
"star": 30,
"updated": 1561313295.0,
"license": "MIT License",
"forkedFrom": ""
},
"intent": {
"url": "https://github.com/itzmeanjan/intent",
"description": "A simple Flutter plugin to deal with Android Intents, written with ❤️",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 1,
"star": 43,
"updated": 1561308650.0,
"license": "MIT License",
"forkedFrom": ""
},
"vb_noticeboard": {
"url": "https://github.com/itzmeanjan/vb_noticeboard",
"description": "A simple Dart wrapper for fetching, parsing and extracting notices from Visva-Bharati, Santiniketan's Official website",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 0,
"star": 2,
"updated": 1559807848.0,
"license": "MIT License",
"forkedFrom": ""
},
"fCreate": {
"url": "https://github.com/itzmeanjan/fCreate",
"description": "A Simple EYE Test Game, built using Flutter CustomPainter, with ❤️",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 3,
"star": 6,
"updated": 1559807561.0,
"license": "MIT License",
"forkedFrom": ""
},
"locate": {
"url": "https://github.com/itzmeanjan/locate",
"description": "A Flutter plugin to work with Android Location Services.",
"lang": "Kotlin",
"langColor": "#F18E33",
"fork": 1,
"star": 7,
"updated": 1559402052.0,
"license": "MIT License",
"forkedFrom": ""
},
"translate": {
"url": "https://github.com/itzmeanjan/translate",
"description": "A simple to use Dart package, for detecting & translating text/ html pages using Yandex.Translate API",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 0,
"star": 4,
"updated": 1559401026.0,
"license": "MIT License",
"forkedFrom": ""
},
"countryIO": {
"url": "https://github.com/itzmeanjan/countryIO",
"description": "A simple country.io data parser, written with ❤️ using Dart :)",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 0,
"star": 3,
"updated": 1559141667.0,
"license": "MIT License",
"forkedFrom": ""
},
"transferZ": {
"url": "https://github.com/itzmeanjan/transferZ",
"description": "A simple Android Application built with ❤️ using Flutter, for transferring files between devices.",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 9,
"star": 25,
"updated": 1558809339.0,
"license": "MIT License",
"forkedFrom": ""
},
"weatherz": {
"url": "https://github.com/itzmeanjan/weatherz",
"description": "Another Weather App, built with Flutter ;)",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 1,
"star": 3,
"updated": 1557992172.0,
"license": "MIT License",
"forkedFrom": ""
},
"osm_tile_fetcher": {
"url": "https://github.com/itzmeanjan/osm_tile_fetcher",
"description": "A Dart Implementation of Open Street Map Tile Fetcher.",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 0,
"star": 2,
"updated": 1557742915.0,
"license": "MIT License",
"forkedFrom": ""
},
"trackm3": {
"url": "https://github.com/itzmeanjan/trackm3",
"description": "A Live Location Tracker built with Electron, Express & Flutter",
"lang": "JavaScript",
"langColor": "#f1e05a",
"fork": 2,
"star": 2,
"updated": 1557314031.0,
"license": "MIT License",
"forkedFrom": ""
},
"sensorz": {
"url": "https://github.com/itzmeanjan/sensorz",
"description": "A Simple Flutter Android App, which displays Sensor Data.",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 18,
"star": 49,
"updated": 1557256385.0,
"license": "MIT License",
"forkedFrom": ""
},
"astroZ": {
"url": "https://github.com/itzmeanjan/astroZ",
"description": "An Android App, shows Astronomy Picture of the Day, built with ❤️ using Flutter :)",
"lang": "Dart",
"langColor": "#00B4AB",
"fork": 4,
"star": 20,
"updated": 1556977028.0,
"license": "MIT License",
"forkedFrom": ""
},
"transferZ-desktop": {
"url": "https://github.com/itzmeanjan/transferZ-desktop",
"description": "A Desktop client for transferZ, file transfer app, built with Electron and Node",
"lang": "JavaScript",
"langColor": "#f1e05a",
"fork": 0,
"star": 0,
"updated": 1556475263.0,
"license": "MIT License",
"forkedFrom": ""
},
"mapZ": {
"url": "https://github.com/itzmeanjan/mapZ",
"description": "A Geospatial Application",
"lang": "Python",
"langColor": "#3572A5",
"fork": 4,
"star": 12,
"updated": 1555417489.0,
"license": "MIT License",
"forkedFrom": ""
},
"apod_fetcher": {
"url": "https://github.com/itzmeanjan/apod_fetcher",
"description": "A Pythonic wayout to fetch Astronomy Picture of the Day from NASA and populate Database.",
"lang": "Python",
"langColor": "#3572A5",
"fork": 0,
"star": 0,
"updated": 1554833836.0,
"license": "MIT License",
"forkedFrom": ""
},
"apod_server": {
"url": "https://github.com/itzmeanjan/apod_server",
"description": "A simple Express App to be used as API endpoint to facilitate query of Astronomy Picture of the Day",
"lang": "JavaScript",
"langColor": "#f1e05a",
"fork": 0,
"star": 0,
"updated": 1554833781.0,
"license": "MIT License",
"forkedFrom": ""
},
"pymet": {
"url": "https://github.com/itzmeanjan/pymet",
"description": "A simple to use Python Interface for fetching MET Norway Weather Data.",
"lang": "Python",
"langColor": "#3572A5",
"fork": 0,
"star": 0,
"updated": 1545845751.0,
"license": "MIT License",
"forkedFrom": ""
},
"imd_weather": {
"url": "https://github.com/itzmeanjan/imd_weather",
"description": "A simple python API, helps you to fetch City Weather data from Indian Meteorological Dept.",
"lang": "Python",
"langColor": "#3572A5",
"fork": 1,
"star": 0,
"updated": 1545591535.0,
"license": "MIT License",
"forkedFrom": ""
},
"pyway2sms": {
"url": "https://github.com/itzmeanjan/pyway2sms",
"description": "Pythonic API for accessing Way2SMS, a programmable SMS sending service in India, features",
"lang": "Python",
"langColor": "#3572A5",
"fork": 0,
"star": 0,
"updated": 1545383804.0,
"license": "MIT License",
"forkedFrom": ""
},
"yandex_translate": {
"url": "https://github.com/itzmeanjan/yandex_translate",
"description": "Yandex.Translate Service python API",
"lang": "Python",
"langColor": "#3572A5",
"fork": 0,
"star": 0,
"updated": 1545322725.0,
"license": "MIT License",
"forkedFrom": ""
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment