Skip to content

Instantly share code, notes, and snippets.

@nezuQ
Created June 5, 2014 17:42
Show Gist options
  • Save nezuQ/e32787072a5049cca8b9 to your computer and use it in GitHub Desktop.
Save nezuQ/e32787072a5049cca8b9 to your computer and use it in GitHub Desktop.
PDFマイニング事始め。Rパッケージマニュアルを実行可能ドキュメントにする。 ref: http://qiita.com/nezuq/items/3bbde44cf815ec5c18a6
## Package ‘ggplot2’
##
## May 21, 2014
##
## Type Package
##
## Title An implementation of the Grammar of Graphics
##
## Version 1.0.0
##
(中略)
## Examples
##
# xlim
xlim(15, 20)
xlim(20, 15)
xlim(c(10, 20))
xlim("a", "b", "c")
qplot(mpg, wt, data=mtcars) + xlim(15, 20)
# with automatic lower limit
qplot(mpg, wt, data=mtcars) + xlim(NA, 20)
# ylim
ylim(15, 20)
ylim(c(10, 20))
ylim("a", "b", "c")
qplot(mpg, wt, data=mtcars) + ylim(0, 4)
# with automatic upper limit
qplot(mpg, wt, data=mtcars) + ylim(0, NA)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
reload(sys)
sys.setdefaultencoding('UTF-8')
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
# Convert an R package reference manual (PDF) into an executable R document:
# text inside "Examples" sections is emitted verbatim, every other kept text
# block is emitted as "## "-prefixed comment lines.  Result goes to stdout.
#
# Layout thresholds compared against pdfminer layout-object geometry.
# NOTE(review): values look hand-tuned for CRAN-style manuals -- confirm
# before using on PDFs with a different page layout.

# Only objects whose bbox[1] is below this value are emitted (presumably this
# drops the running page header -- verify against a sample page).
HEADER_Y0 = 709
# An "Index" text object taller than this switches index-skipping back on
# (presumably the large "Index" heading at the end of the manual).
LASTINDEX_H = 32
# A text object whose integer bbox[0] equals this value ends an Examples
# section (presumably the left edge of the next function title).
FUNCTITLE_X0 = 110

argvs = sys.argv
argc = len(argvs)
if argc < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit('usage: %s INPUT_PDF [PASSWORD]' % argvs[0])
# Command-line arguments are decoded from UTF-8; the password is optional.
INPUT_PDF = argvs[1].decode('UTF-8')
PASSWORD = argvs[2].decode('UTF-8') if 2 < argc else ''

text_content = []
# True while text is being skipped: from "R topics documented:" up to the
# next "Index" text, and again from the tall "Index" heading onwards.
isidxsec = False
# True while inside an "Examples" section (text is emitted uncommented).
isexsec = False

# The context manager guarantees the PDF file is closed even when parsing
# or extraction raises.
with open(INPUT_PDF, 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser, PASSWORD)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        for obj in layout:
            # Only text objects carry content; everything else is ignored.
            if not isinstance(obj, (LTTextBox, LTTextLine)):
                continue
            text = obj.get_text()
            stripped = text.strip()
            if stripped == 'R topics documented:':
                isidxsec = True
            if isidxsec:
                # Nothing is emitted while skipping; an "Index" text ends
                # the skipped region (that text itself is not emitted).
                if stripped == 'Index':
                    isidxsec = False
            elif stripped == 'Index' and LASTINDEX_H < obj.height:
                isidxsec = True
            elif obj.bbox[1] < HEADER_Y0:
                # Leave the Examples section before emitting, so the object
                # that ends it is written as a comment, not as code.
                if isexsec and int(obj.bbox[0]) == FUNCTITLE_X0:
                    isexsec = False
                text_content.append(
                    text if isexsec else '## ' + text.replace('\n', '\n## '))
                # Enter the Examples section after emitting, so the
                # "Examples" heading itself is written as a comment.
                if stripped == 'Examples':
                    isexsec = True

# Single-argument parenthesised form prints identically on Python 2.
print('\n'.join(text_content))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
import sys
# Print the title of every entry in the PDF's outline, one per line, except
# the last entry.
argvs = sys.argv
argc = len(argvs)
if argc < 2:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit('usage: %s INPUT_PDF [PASSWORD]' % argvs[0])
# Command-line arguments are decoded from UTF-8; the password is optional.
INPUT_PDF = argvs[1].decode('UTF-8')
PASSWORD = argvs[2].decode('UTF-8') if 2 < argc else ''

# The context manager guarantees the PDF file is closed even when parsing
# raises; the outline is materialised into a list before the file closes.
with open(INPUT_PDF, 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser, PASSWORD)
    # The final outline entry is dropped (presumably the "Index" bookmark --
    # verify against a sample manual).
    outlines = list(document.get_outlines())[0:-1]

# Each outline entry is a 5-tuple; only its second field (the title) is used.
for entry in outlines:
    title = entry[1]
    # Single-argument parenthesised form prints identically on Python 2.
    print(title)
python rpdf2cmd.py ggplot2.pdf > result.txt
python rpdf2fnclist.py ggplot2.pdf > result.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment