Created
June 5, 2014 17:42
-
-
Save nezuQ/e32787072a5049cca8b9 to your computer and use it in GitHub Desktop.
PDFマイニング事始め。Rパッケージマニュアルを実行可能ドキュメントにする。 ref: http://qiita.com/nezuq/items/3bbde44cf815ec5c18a6
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Package ‘ggplot2’ | |
## | |
## May 21, 2014 | |
## | |
## Type Package | |
## | |
## Title An implementation of the Grammar of Graphics | |
## | |
## Version 1.0.0 | |
## | |
(中略) | |
## Examples | |
## | |
# xlim | |
xlim(15, 20) | |
xlim(20, 15) | |
xlim(c(10, 20)) | |
xlim("a", "b", "c") | |
qplot(mpg, wt, data=mtcars) + xlim(15, 20) | |
# with automatic lower limit | |
qplot(mpg, wt, data=mtcars) + xlim(NA, 20) | |
# ylim | |
ylim(15, 20) | |
ylim(c(10, 20)) | |
ylim("a", "b", "c") | |
qplot(mpg, wt, data=mtcars) + ylim(0, 4) | |
# with automatic upper limit | |
qplot(mpg, wt, data=mtcars) + ylim(0, NA) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert the text of a PDF into a script-like listing on standard output.

Every text object found by pdfminer's layout analysis is emitted either
verbatim (while inside an ``Examples`` section) or with each of its lines
prefixed by ``## ``.  Text between ``R topics documented:`` and the next
``Index`` entry is skipped, as is everything after a tall ``Index`` heading.

Usage: python rpdf2cmd.py INPUT_PDF [PASSWORD]
"""
from __future__ import unicode_literals

import sys

if sys.version_info[0] < 3:
    # Python 2 only: make UTF-8 the implicit codec so the unicode output can
    # be printed to a redirected stdout.  `reload` restores the
    # `setdefaultencoding` attribute that site.py removes at start-up.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('UTF-8')

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed

# Text objects whose bbox[1] is at or above this value are dropped.
# NOTE(review): presumably the y-position of the running page header - confirm.
HEADER_Y0 = 709
# An 'Index' text object taller than this switches index-skipping back on.
# NOTE(review): presumably identifies the final Index heading - confirm.
LASTINDEX_H = 32
# Inside an Examples section, a text object whose integer bbox[0] equals this
# value ends the section.
# NOTE(review): presumably the left edge of a topic title - confirm.
FUNCTITLE_X0 = 110


def _to_text(arg):
    """Return a command-line argument as text, decoding UTF-8 byte strings."""
    return arg.decode('UTF-8') if isinstance(arg, bytes) else arg


def extract_lines(document):
    """Collect the output chunks for every page of *document*.

    Args:
        document: an opened ``PDFDocument`` whose underlying file is still
            readable.

    Returns:
        A list of strings: raw text for objects inside an ``Examples``
        section, ``## ``-prefixed text for everything else that is kept.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    text_content = []
    in_index = False     # currently skipping an index / table-of-contents run
    in_examples = False  # currently inside an "Examples" section

    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        for obj in device.get_result():
            if not isinstance(obj, (LTTextBox, LTTextLine)):
                continue
            text = obj.get_text()
            label = text.strip()

            if label == 'R topics documented:':
                in_index = True
            if in_index:
                # The 'Index' entry closes the skipped run; it is itself
                # not emitted.
                if label == 'Index':
                    in_index = False
                continue

            if label == 'Index' and LASTINDEX_H < obj.height:
                in_index = True
                continue
            if not obj.bbox[1] < HEADER_Y0:
                continue

            if in_examples and int(obj.bbox[0]) == FUNCTITLE_X0:
                in_examples = False
            if in_examples:
                text_content.append(text)
            else:
                text_content.append('## ' + text.replace('\n', '\n## '))
            # Checked after appending so the 'Examples' heading itself is
            # still emitted with the '## ' prefix.
            if label == 'Examples':
                in_examples = True

    return text_content


def main(argvs):
    """Run the conversion for the command line *argvs* and print the result.

    Raises:
        SystemExit: when no input PDF path is given.
        PDFTextExtractionNotAllowed: when the document forbids extraction.
    """
    if len(argvs) < 2:
        sys.exit('usage: python {0} INPUT_PDF [PASSWORD]'.format(argvs[0]))
    input_pdf = _to_text(argvs[1])
    password = _to_text(argvs[2]) if 2 < len(argvs) else ''

    # The parser reads from the file lazily, so all extraction has to happen
    # while the handle is still open.
    with open(input_pdf, 'rb') as fp:
        document = PDFDocument(PDFParser(fp), password)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        text_content = extract_lines(document)

    print('\n'.join(text_content))


if __name__ == '__main__':
    main(sys.argv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Print the outline titles of a PDF, one per line, omitting the last entry.

Usage: python rpdf2fnclist.py INPUT_PDF [PASSWORD]
"""
from __future__ import unicode_literals

import sys

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument


def _to_text(arg):
    """Return a command-line argument as text, decoding UTF-8 byte strings."""
    return arg.decode('UTF-8') if isinstance(arg, bytes) else arg


def main(argvs):
    """Print every outline title of the PDF named in *argvs* except the last.

    Raises:
        SystemExit: when no input PDF path is given.
    """
    if len(argvs) < 2:
        sys.exit('usage: python {0} INPUT_PDF [PASSWORD]'.format(argvs[0]))
    input_pdf = _to_text(argvs[1])
    password = _to_text(argvs[2]) if 2 < len(argvs) else ''

    # get_outlines() is lazy, so materialise it before the file is closed.
    with open(input_pdf, 'rb') as fp:
        document = PDFDocument(PDFParser(fp), password)
        # The final outline entry is deliberately dropped.
        # NOTE(review): presumably the trailing "Index" bookmark - confirm.
        outlines = list(document.get_outlines())[0:-1]

    for (_level, title, _dest, _action, _se) in outlines:
        print(title)


if __name__ == '__main__':
    main(sys.argv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run rpdf2cmd.py on ggplot2.pdf and redirect its printed output to result.txt.
python rpdf2cmd.py ggplot2.pdf > result.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run rpdf2fnclist.py on ggplot2.pdf and redirect the printed outline titles to result.txt.
python rpdf2fnclist.py ggplot2.pdf > result.txt |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment