# iamaziz/gensim-to-docset.py

## gensim-to-docset.py
# generate gensim docset
# http://radimrehurek.com/gensim/

#----------------------------------
# built-in packages
import sqlite3
import os
import urllib
import plistlib

#----------------------------------
# third party packages + httrack
import requests
from bs4 import BeautifulSoup as bs

# prepare docset folder and html
def setup_docset(doc_name, url, download_html=False):
    """Create the docset folder skeleton and optionally mirror the site into it.

    Args:
        doc_name: Base name of the docset; ``.docset`` is appended to it.
        url: Root URL of the documentation site to mirror.
        download_html: When true, run ``httrack`` through the shell to mirror
            ``url`` and move the downloaded files into
            ``<docset>/Contents/Resources/Documents/``.

    Returns:
        The docset directory name, e.g. ``'gensim.docset'``.
    """
    # Same helper under two names: shlex.quote (Python 3), pipes.quote (Python 2).
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # docset settings
    docset_name = '{}.docset'.format(doc_name)
    output = docset_name + '/Contents/Resources/Documents/'

    # docset directory
    if not os.path.exists(output):
        os.makedirs(output)

    if download_html:
        # Both interpolated values are shell-quoted so that spaces or shell
        # metacharacters in the name or URL cannot break or alter the command.
        cmd_command = """
        cd {0} &&
        httrack -%v2 -T60 -R99 --sockets=7 -%c1000 -c10 -A999999999 -%N0 --disable-security-limits -F 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/11.10 Chromium/18.0.1025.168' --mirror --keep-alive --robots=0 {1} -n -* +*.css +*css.php +*.ico +*/fonts/* +*.svg +*.ttf +fonts.googleapis.com* +*.woff +*.eot +*.png +*.jpg +*.gif +*.jpeg +*.js +{1}* -github.com* +raw.github.com* &&
        rm -rf hts-* &&
        mkdir -p Contents/Resources/Documents &&
        mv -f *.* Contents/Resources/Documents/
        """.format(quote(docset_name), quote(url))

        status = os.system(cmd_command)
        if status != 0:
            # Best effort: report the failure but still return the docset name.
            print('mirror command exited with status {}'.format(status))

    return docset_name


def update_db(name, typ, path):
    """Add one entry to ``searchIndex`` unless its name or path already exists.

    Uses the module-level cursor ``cur`` that ``main`` creates.

    Args:
        name: Entry name shown in the index.
        typ: Entry type (the key of the page the link was found on).
        path: Path of the target page inside the docset.
    """
    try:
        # One lookup covers both duplicate checks (same path OR same name).
        cur.execute(
            'SELECT 1 FROM searchIndex WHERE path = ? OR name = ? LIMIT 1',
            (path, name))
        if cur.fetchone() is not None:
            print("record exists")
            return

        cur.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)', (name, typ, path))
        print('DB add >> name: {0} | type: {1} | path: {2}'.format(name, typ, path))
    except sqlite3.Error as err:
        # Stay best-effort for database failures, but make them visible;
        # non-database errors (i.e. bugs) are no longer hidden.
        print('DB error for {0!r}: {1}'.format(name, err))


def add_infoplist(base_page, docset_name):
    """Write ``Contents/Info.plist`` for the docset.

    Args:
        base_page: URL of the index page; everything after the scheme
            separator ``//`` becomes ``dashIndexFilePath``.
        docset_name: Docset directory name; the part before the first ``.``
            is used as the bundle name and identifier.
    """
    # Split only on the first "//" so a later "//" in the URL is kept, and
    # fall back to the whole string when there is no scheme at all.
    index_file = base_page.split("//", 1)[-1]
    name = docset_name.split('.')[0]

    plist_path = os.path.join(docset_name, "Contents", "Info.plist")
    plist_cfg = {
        'CFBundleIdentifier': name,
        'CFBundleName': name,
        'DocSetPlatformFamily': name.lower(),
        'DashDocSetFamily': 'python',
        'isDashDocset': True,
        'dashIndexFilePath': index_file,
    }

    if hasattr(plistlib, 'dump'):
        # plistlib.writePlist was removed in Python 3.9; dump() replaces it.
        with open(plist_path, 'wb') as plist_file:
            plistlib.dump(plist_cfg, plist_file)
    else:
        # Older interpreters (Python 2) only provide writePlist.
        plistlib.writePlist(plist_cfg, plist_path)


def add_urls(pages):
    """Fetch each index page and record its internal links via ``update_db``.

    Args:
        pages: Mapping of entry type to the URL of the page listing entries
            of that type. Every ``<a class="reference internal">`` with an
            ``href`` on that page is stored with that type.
    """
    # For these types the link is appended to the page's directory rather
    # than to the full page path.
    dir_relative_types = ('Guide', 'Library')

    # loop through index pages:
    for p, page_url in pages.items():

        # setup paths
        page_name = page_url.split('/')[-1]
        base_path = page_url.split("//", 1)[-1]
        if p in dir_relative_types:
            # Drop only the trailing file name; page_name is always a suffix
            # of base_path, and slicing cannot touch an earlier occurrence.
            prefix = base_path[:len(base_path) - len(page_name)]
        else:
            prefix = base_path

        # soup each index page
        html = requests.get(page_url).text
        soup = bs(html)

        for a in soup.findAll('a', class_='reference internal'):
            path = a.get('href')
            if path is None:
                # An anchor without href has no target to index.
                continue
            # Collapse runs of whitespace in the link text.
            name = " ".join(a.text.split())
            update_db(name, p, prefix + path)


def main():
    """Build the gensim docset.

    Mirrors the site, downloads the icon, (re)creates the SQLite search
    index, fills it from the index pages, writes Info.plist and prints the
    number of indexed entries.
    """
    # update_db() reads this module-level cursor.
    global cur

    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # index pages
    name = 'gensim'
    website = 'http://radimrehurek.com/gensim/'
    pages = {
        'Section': website,
        'Guide': 'http://radimrehurek.com/gensim/tutorial.html',
        'Library': 'http://radimrehurek.com/gensim/apiref.html',
    }

    docset_name = setup_docset(name, website, download_html=True)

    # docset icon
    icon = 'https://www.python.org/static/apple-touch-icon-precomposed.png'
    urlretrieve(icon, docset_name + "/icon.png")

    # create and connect to SQLite
    db = sqlite3.connect(docset_name + '/Contents/Resources/docSet.dsidx')
    cur = db.cursor()
    try:
        # Always start from an empty table. Previously a successful DROP
        # skipped the CREATE, leaving no table on every second run.
        cur.execute('DROP TABLE IF EXISTS searchIndex;')
        cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
        cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')

        # docset entries
        add_urls(pages)
        add_infoplist(website, docset_name)

        # report num of entries (fetchone() returns a 1-tuple)
        cur.execute('Select count(*) from searchIndex;')
        entry_count = cur.fetchone()[0]
        print("{} entries.".format(entry_count))

        db.commit()
    finally:
        # Close the connection even when indexing fails part-way.
        db.close()

# Build the docset only when this file is executed as a script.
if __name__ == '__main__':
  main()
	# generate gensim docset
	# http://radimrehurek.com/gensim/

	#----------------------------------
	# built-in packages
	import sqlite3
	import os
	import urllib
	import plistlib

	#----------------------------------
	# third party packages + httrack
	import requests
	from bs4 import BeautifulSoup as bs

	# prepare docset folder and html
	def setup_docset(doc_name, url, download_html=False):
	    """Create the docset folder skeleton and optionally mirror the site into it.

	    Args:
	        doc_name: Base name of the docset; ``.docset`` is appended to it.
	        url: Root URL of the documentation site to mirror.
	        download_html: When true, run ``httrack`` through the shell to mirror
	            ``url`` and move the downloaded files into
	            ``<docset>/Contents/Resources/Documents/``.

	    Returns:
	        The docset directory name, e.g. ``'gensim.docset'``.
	    """
	    # Same helper under two names: shlex.quote (Python 3), pipes.quote (Python 2).
	    try:
	        from shlex import quote
	    except ImportError:
	        from pipes import quote

	    # docset settings
	    docset_name = '{}.docset'.format(doc_name)
	    output = docset_name + '/Contents/Resources/Documents/'

	    # docset directory
	    if not os.path.exists(output):
	        os.makedirs(output)

	    if download_html:
	        # The httrack filters and the final mv need their glob asterisks
	        # ("+*.css", "mv -f *.*"); both interpolated values are shell-quoted.
	        cmd_command = """
	        cd {0} &&
	        httrack -%v2 -T60 -R99 --sockets=7 -%c1000 -c10 -A999999999 -%N0 --disable-security-limits -F 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/11.10 Chromium/18.0.1025.168' --mirror --keep-alive --robots=0 {1} -n -* +*.css +*css.php +*.ico +*/fonts/* +*.svg +*.ttf +fonts.googleapis.com* +*.woff +*.eot +*.png +*.jpg +*.gif +*.jpeg +*.js +{1}* -github.com* +raw.github.com* &&
	        rm -rf hts-* &&
	        mkdir -p Contents/Resources/Documents &&
	        mv -f *.* Contents/Resources/Documents/
	        """.format(quote(docset_name), quote(url))

	        status = os.system(cmd_command)
	        if status != 0:
	            # Best effort: report the failure but still return the docset name.
	            print('mirror command exited with status {}'.format(status))

	    return docset_name


	def update_db(name, typ, path):
	    """Add one entry to ``searchIndex`` unless its name or path already exists.

	    Uses the module-level cursor ``cur`` that ``main`` creates.

	    Args:
	        name: Entry name shown in the index.
	        typ: Entry type (the key of the page the link was found on).
	        path: Path of the target page inside the docset.
	    """
	    try:
	        # One lookup covers both duplicate checks (same path OR same name).
	        cur.execute(
	            'SELECT 1 FROM searchIndex WHERE path = ? OR name = ? LIMIT 1',
	            (path, name))
	        if cur.fetchone() is not None:
	            print("record exists")
	            return

	        cur.execute('INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES (?,?,?)', (name, typ, path))
	        # Plain "|" separators; the backslash escapes were markup residue.
	        print('DB add >> name: {0} | type: {1} | path: {2}'.format(name, typ, path))
	    except sqlite3.Error as err:
	        # Stay best-effort for database failures, but make them visible;
	        # non-database errors (i.e. bugs) are no longer hidden.
	        print('DB error for {0!r}: {1}'.format(name, err))


	def add_infoplist(base_page, docset_name):
	    """Write ``Contents/Info.plist`` for the docset.

	    Args:
	        base_page: URL of the index page; everything after the scheme
	            separator ``//`` becomes ``dashIndexFilePath``.
	        docset_name: Docset directory name; the part before the first ``.``
	            is used as the bundle name and identifier.
	    """
	    # Split only on the first "//" so a later "//" in the URL is kept, and
	    # fall back to the whole string when there is no scheme at all.
	    index_file = base_page.split("//", 1)[-1]
	    name = docset_name.split('.')[0]

	    plist_path = os.path.join(docset_name, "Contents", "Info.plist")
	    plist_cfg = {
	        'CFBundleIdentifier': name,
	        'CFBundleName': name,
	        'DocSetPlatformFamily': name.lower(),
	        'DashDocSetFamily': 'python',
	        'isDashDocset': True,
	        'dashIndexFilePath': index_file,
	    }

	    if hasattr(plistlib, 'dump'):
	        # plistlib.writePlist was removed in Python 3.9; dump() replaces it.
	        with open(plist_path, 'wb') as plist_file:
	            plistlib.dump(plist_cfg, plist_file)
	    else:
	        # Older interpreters (Python 2) only provide writePlist.
	        plistlib.writePlist(plist_cfg, plist_path)


	def add_urls(pages):
	    """Fetch each index page and record its internal links via ``update_db``.

	    Args:
	        pages: Mapping of entry type to the URL of the page listing entries
	            of that type. Every ``<a class="reference internal">`` with an
	            ``href`` on that page is stored with that type.
	    """
	    # For these types the link is appended to the page's directory rather
	    # than to the full page path.
	    dir_relative_types = ('Guide', 'Library')

	    # loop through index pages:
	    for p, page_url in pages.items():

	        # setup paths
	        page_name = page_url.split('/')[-1]
	        base_path = page_url.split("//", 1)[-1]
	        if p in dir_relative_types:
	            # Drop only the trailing file name; page_name is always a suffix
	            # of base_path, and slicing cannot touch an earlier occurrence.
	            prefix = base_path[:len(base_path) - len(page_name)]
	        else:
	            prefix = base_path

	        # soup each index page
	        html = requests.get(page_url).text
	        soup = bs(html)

	        for a in soup.findAll('a', class_='reference internal'):
	            path = a.get('href')
	            if path is None:
	                # An anchor without href has no target to index.
	                continue
	            # Collapse runs of whitespace in the link text.
	            name = " ".join(a.text.split())
	            update_db(name, p, prefix + path)



	def main():
	    """Build the gensim docset.

	    Mirrors the site, downloads the icon, (re)creates the SQLite search
	    index, fills it from the index pages, writes Info.plist and prints the
	    number of indexed entries.
	    """
	    # update_db() reads this module-level cursor.
	    global cur

	    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
	    try:
	        from urllib.request import urlretrieve
	    except ImportError:
	        from urllib import urlretrieve

	    # index pages
	    name = 'gensim'
	    website = 'http://radimrehurek.com/gensim/'
	    pages = {
	        'Section': website,
	        'Guide': 'http://radimrehurek.com/gensim/tutorial.html',
	        'Library': 'http://radimrehurek.com/gensim/apiref.html',
	    }

	    docset_name = setup_docset(name, website, download_html=True)

	    # docset icon
	    icon = 'https://www.python.org/static/apple-touch-icon-precomposed.png'
	    urlretrieve(icon, docset_name + "/icon.png")

	    # create and connect to SQLite
	    db = sqlite3.connect(docset_name + '/Contents/Resources/docSet.dsidx')
	    cur = db.cursor()
	    try:
	        # Always start from an empty table. Previously a successful DROP
	        # skipped the CREATE, leaving no table on every second run.
	        cur.execute('DROP TABLE IF EXISTS searchIndex;')
	        cur.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
	        cur.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')

	        # docset entries
	        add_urls(pages)
	        add_infoplist(website, docset_name)

	        # report num of entries (fetchone() returns a 1-tuple)
	        cur.execute('Select count(*) from searchIndex;')
	        entry_count = cur.fetchone()[0]
	        print("{} entries.".format(entry_count))

	        db.commit()
	    finally:
	        # Close the connection even when indexing fails part-way.
	        db.close()

	# Build the docset only when this file is executed as a script.
	if __name__ == '__main__':
	    main()