Skip to content

Instantly share code, notes, and snippets.

Created June 5, 2014 17:42
Show Gist options
  • Save nezuQ/e32787072a5049cca8b9 to your computer and use it in GitHub Desktop.
Save nezuQ/e32787072a5049cca8b9 to your computer and use it in GitHub Desktop.
PDFマイニング事始め。Rパッケージマニュアルを実行可能ドキュメントにする。 ref:
## Package ‘ggplot2’
## May 21, 2014
## Type Package
## Title An implementation of the Grammar of Graphics
## Version 1.0.0
## Examples
# xlim
xlim(15, 20)
xlim(20, 15)
xlim(c(10, 20))
xlim("a", "b", "c")
qplot(mpg, wt, data=mtcars) + xlim(15, 20)
# with automatic lower limit
qplot(mpg, wt, data=mtcars) + xlim(NA, 20)
# ylim
ylim(15, 20)
ylim(c(10, 20))
ylim("a", "b", "c")
qplot(mpg, wt, data=mtcars) + ylim(0, 4)
# with automatic upper limit
qplot(mpg, wt, data=mtcars) + ylim(0, NA)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
# Convert an R package reference manual (PDF) into an R script: text inside
# each "Examples" section is emitted verbatim, every other kept text block is
# emitted as '## ' comment lines, and the result is printed to stdout.
#
# Usage: python <this script> INPUT_PDF [PASSWORD]

# Only text objects whose bbox y0 is below this value are kept; anything at or
# above it is dropped (presumably the running page header -- confirm against
# the target PDF).
HEADER_Y0 = 709
# NOTE(review): LASTINDEX_H and FUNCTITLE_X0 are read in the loop below but are
# not defined anywhere in the visible source. They must be defined next to
# HEADER_Y0 (values have to be measured from the target PDF); until then the
# script raises NameError the first time either one is evaluated.

argvs = sys.argv
argc = len(argvs)
INPUT_PDF = argvs[1].decode('UTF-8')
PASSWORD = argvs[2].decode('UTF-8') if 2 < argc else ''

text_content = []
isidxsec = False  # True while inside the "R topics documented:" listing
isexsec = False   # True while inside an "Examples" section

# The parser reads from the file lazily, so all page processing has to happen
# while the file is still open; `with` guarantees it is closed afterwards.
with open(INPUT_PDF, 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser, PASSWORD)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        # The aggregator only holds a layout for a page after the interpreter
        # has processed that page; the original skipped this call.
        interpreter.process_page(page)
        layout = device.get_result()
        for obj in layout:
            if not isinstance(obj, (LTTextBox, LTTextLine)):
                continue
            text = obj.get_text()
            stripped = text.strip()
            if stripped == 'R topics documented:':
                isidxsec = True
            if isidxsec:
                # Inside the topic listing nothing is emitted; an 'Index'
                # entry ends it unless the object is taller than LASTINDEX_H.
                # NOTE(review): the original indentation was lost. As nested
                # here the height check only runs while isidxsec is already
                # set -- confirm this matches the author's intent.
                if stripped == 'Index':
                    isidxsec = False
                if stripped == 'Index' and LASTINDEX_H < obj.height:
                    isidxsec = True
            elif obj.bbox[1] < HEADER_Y0:
                # An object whose integer left edge equals FUNCTITLE_X0 closes
                # the current Examples section (presumably the next topic's
                # title). The flag is cleared before appending so that object
                # is itself emitted as a comment.
                if isexsec and int(obj.bbox[0]) == FUNCTITLE_X0:
                    isexsec = False
                text_content.append(text if isexsec else '## ' + text.replace('\n','\n## '))
                # The flag is set after appending, so the 'Examples' heading
                # itself is emitted as a comment and only what follows is code.
                if stripped == 'Examples':
                    isexsec = True

print('\n'.join(text_content))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
import sys
# Print the title of every outline (bookmark) entry of a PDF except the last.
#
# Usage: python <this script> INPUT_PDF [PASSWORD]
argvs = sys.argv
argc = len(argvs)
INPUT_PDF = argvs[1].decode('UTF-8')
PASSWORD = argvs[2].decode('UTF-8') if 2 < argc else ''

# get_outlines() yields entries lazily from the open file, so the list is
# built inside the `with` block; the file is closed as soon as it is complete.
with open(INPUT_PDF, 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser, PASSWORD)
    # The final outline entry is dropped (presumably the trailing "Index"
    # bookmark -- confirm against the target PDF).
    outlines = list(document.get_outlines())[0:-1]

for (level, title, dest, a, se) in outlines:
    print(title)
python SCRIPT.py ggplot2.pdf > result.txt
python SCRIPT.py ggplot2.pdf > result.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment