nezuQ/result.txt

## result.txt
## Package ‘ggplot2’
##
## May 21, 2014
##
## Type Package
##
## Title An implementation of the Grammar of Graphics
##
## Version 1.0.0
##

（中略）

## Examples
##
# xlim
xlim(15, 20)
xlim(20, 15)
xlim(c(10, 20))
xlim("a", "b", "c")
qplot(mpg, wt, data=mtcars) + xlim(15, 20)
# with automatic lower limit
qplot(mpg, wt, data=mtcars) + xlim(NA, 20)

# ylim
ylim(15, 20)
ylim(c(10, 20))
ylim("a", "b", "c")
qplot(mpg, wt, data=mtcars) + ylim(0, 4)
# with automatic upper limit
qplot(mpg, wt, data=mtcars) + ylim(0, NA)

## rpdf2cmd.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import sys
reload(sys)
sys.setdefaultencoding('UTF-8')

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed

# Layout thresholds compared against pdfminer bounding boxes.
# NOTE(review): the values look hand-tuned for one specific manual layout
# (the ggplot2 reference PDF) -- confirm before running on other documents.
HEADER_Y0 = 709       # only objects whose bbox[1] is below this are emitted
LASTINDEX_H = 32      # an 'Index' object taller than this starts the skipped tail
FUNCTITLE_X0 = 110    # an object at this integer bbox[0] ends an Examples section

argvs = sys.argv
argc = len(argvs)

# Fail with a readable message instead of an IndexError on argvs[1].
if argc < 2:
    sys.exit('usage: rpdf2cmd.py INPUT_PDF [PASSWORD]')

INPUT_PDF = argvs[1].decode('UTF-8')
PASSWORD = argvs[2].decode('UTF-8') if 2 < argc else ''

text_content = []

isidxsec = False   # True while inside a section whose text is dropped
isexsec = False    # True while inside an 'Examples' section

# The whole extraction runs inside the `with` block so the file handle is
# closed on every exit path, including the exception raised below.
# NOTE(review): pdfminer presumably reads from `fp` while pages are being
# processed, which is why the loop is kept inside the block -- confirm.
with open(INPUT_PDF, 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser, PASSWORD)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        for obj in layout:
            if not isinstance(obj, (LTTextBox, LTTextLine)):
                continue
            text = obj.get_text()
            stripped = text.strip()
            if stripped == 'R topics documented:':
                isidxsec = True
            if isidxsec:
                # An 'Index' object seen while skipping ends the skipped section;
                # that object itself is not emitted.
                if stripped == 'Index':
                    isidxsec = False
            elif stripped == 'Index' and LASTINDEX_H < obj.height:
                # A tall 'Index' object starts skipping again.
                isidxsec = True
            elif obj.bbox[1] < HEADER_Y0:
                if isexsec and int(obj.bbox[0]) == FUNCTITLE_X0:
                    isexsec = False
                # Text inside an Examples section is emitted verbatim; everything
                # else is prefixed with '## ' on every line.
                text_content.append(text if isexsec else '## ' + text.replace('\n', '\n## '))
                if stripped == 'Examples':
                    isexsec = True

# A single parenthesised argument is valid both as a Python 2 print statement
# and as a print() call.
print('\n'.join(text_content))

## rpdf2fnclist.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
import sys

argvs = sys.argv
argc = len(argvs)

# Fail with a readable message instead of an IndexError on argvs[1].
if argc < 2:
    sys.exit('usage: rpdf2fnclist.py INPUT_PDF [PASSWORD]')

INPUT_PDF = argvs[1].decode('UTF-8')
PASSWORD = argvs[2].decode('UTF-8') if 2 < argc else ''

# Read the outline while the file is open, then close the handle
# deterministically instead of leaving it to interpreter shutdown.
with open(INPUT_PDF, 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser, PASSWORD)
    # The last outline entry is dropped.
    # NOTE(review): presumably that entry is the trailing "Index" bookmark -- confirm.
    outlines = list(document.get_outlines())[0:-1]

# Each outline entry is a 5-tuple; only its second field (the title) is printed.
for entry in outlines:
    title = entry[1]
    # A single parenthesised argument is valid both as a Python 2 print
    # statement and as a print() call.
    print(title)

## 本文を抽出する.sh
# Run rpdf2cmd.py on ggplot2.pdf and redirect its standard output to result.txt.
python rpdf2cmd.py ggplot2.pdf > result.txt

## 目次を抽出する.sh
# Run rpdf2fnclist.py on ggplot2.pdf and redirect its standard output to result.txt
# (any existing result.txt is overwritten by the `>` redirection).
python rpdf2fnclist.py ggplot2.pdf > result.txt
	## Package ‘ggplot2’
	##
	## May 21, 2014
	##
	## Type Package
	##
	## Title An implementation of the Grammar of Graphics
	##
	## Version 1.0.0
	##

	（中略）

	## Examples
	##
	# xlim
	xlim(15, 20)
	xlim(20, 15)
	xlim(c(10, 20))
	xlim("a", "b", "c")
	qplot(mpg, wt, data=mtcars) + xlim(15, 20)
	# with automatic lower limit
	qplot(mpg, wt, data=mtcars) + xlim(NA, 20)

	# ylim
	ylim(15, 20)
	ylim(c(10, 20))
	ylim("a", "b", "c")
	qplot(mpg, wt, data=mtcars) + ylim(0, 4)
	# with automatic upper limit
	qplot(mpg, wt, data=mtcars) + ylim(0, NA)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	from __future__ import unicode_literals

	import sys
	reload(sys)
	sys.setdefaultencoding('UTF-8')

	from pdfminer.pdfparser import PDFParser
	from pdfminer.pdfdocument import PDFDocument
	from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
	from pdfminer.converter import PDFPageAggregator
	from pdfminer.layout import LAParams, LTTextBox, LTTextLine
	from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed

	# Layout thresholds compared against pdfminer bounding boxes.
	# NOTE(review): the values look hand-tuned for one specific manual layout
	# (the ggplot2 reference PDF) -- confirm before running on other documents.
	HEADER_Y0 = 709       # only objects whose bbox[1] is below this are emitted
	LASTINDEX_H = 32      # an 'Index' object taller than this starts the skipped tail
	FUNCTITLE_X0 = 110    # an object at this integer bbox[0] ends an Examples section

	argvs = sys.argv
	argc = len(argvs)

	# Fail with a readable message instead of an IndexError on argvs[1].
	if argc < 2:
	    sys.exit('usage: rpdf2cmd.py INPUT_PDF [PASSWORD]')

	INPUT_PDF = argvs[1].decode('UTF-8')
	PASSWORD = argvs[2].decode('UTF-8') if 2 < argc else ''

	text_content = []

	isidxsec = False   # True while inside a section whose text is dropped
	isexsec = False    # True while inside an 'Examples' section

	# The whole extraction runs inside the `with` block so the file handle is
	# closed on every exit path, including the exception raised below.
	# NOTE(review): pdfminer presumably reads from `fp` while pages are being
	# processed, which is why the loop is kept inside the block -- confirm.
	with open(INPUT_PDF, 'rb') as fp:
	    parser = PDFParser(fp)
	    document = PDFDocument(parser, PASSWORD)
	    if not document.is_extractable:
	        raise PDFTextExtractionNotAllowed
	    rsrcmgr = PDFResourceManager()
	    laparams = LAParams()
	    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
	    interpreter = PDFPageInterpreter(rsrcmgr, device)

	    for page in PDFPage.create_pages(document):
	        interpreter.process_page(page)
	        layout = device.get_result()
	        for obj in layout:
	            if not isinstance(obj, (LTTextBox, LTTextLine)):
	                continue
	            text = obj.get_text()
	            stripped = text.strip()
	            if stripped == 'R topics documented:':
	                isidxsec = True
	            if isidxsec:
	                # An 'Index' object seen while skipping ends the skipped
	                # section; that object itself is not emitted.
	                if stripped == 'Index':
	                    isidxsec = False
	            elif stripped == 'Index' and LASTINDEX_H < obj.height:
	                # A tall 'Index' object starts skipping again.
	                isidxsec = True
	            elif obj.bbox[1] < HEADER_Y0:
	                if isexsec and int(obj.bbox[0]) == FUNCTITLE_X0:
	                    isexsec = False
	                # Text inside an Examples section is emitted verbatim;
	                # everything else is prefixed with '## ' on every line.
	                text_content.append(text if isexsec else '## ' + text.replace('\n', '\n## '))
	                if stripped == 'Examples':
	                    isexsec = True

	# A single parenthesised argument is valid both as a Python 2 print
	# statement and as a print() call.
	print('\n'.join(text_content))