# amotl/dwd_description_pdf.py

## dwd_description_pdf.py
"""
Setup::

    pip install requests 'PyPDF2<3' tabulate

Synopsis::

    python dwd_description_pdf.py
"""
import re
import json
from io import StringIO, BytesIO
import requests
import PyPDF2
from tabulate import tabulate


def read_pdf(url):
    """
    Download a PDF document and return the text extracted from all pages.

    The page footer produced by the text extraction (``www.dwd.de``, a dash,
    the page number and another dash, each followed by a line break) is
    removed from every page before the pages are concatenated.

    :param url: HTTP(S) address of the PDF document.
    :return: Extracted text of all pages as one string.
    :raises requests.HTTPError: If the server answers with an error status.
    """
    # Fail early on HTTP errors instead of handing an error page to the PDF
    # parser, and do not wait forever on an unresponsive server.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    pdf = PyPDF2.PdfFileReader(BytesIO(response.content))

    # Raw string, so that ``\.`` and ``\d`` reach the regex engine untouched
    # instead of being (invalid) string escape sequences.
    footer = re.compile(r'www\.dwd\.de\n-\n\d+\n-\n')

    pages = []
    for page_number in range(pdf.numPages):
        page_text = pdf.getPage(page_number).extractText()
        pages.append(footer.sub('', page_text))
    return ''.join(pages)


def parse_section(text, headline):
    """
    Extract the lines of a section from *text*.

    Collecting starts at any line containing *headline* (that line is part of
    the result) and stops at a line consisting of exactly one space.

    :param text: Document text, lines separated by ``\\n``.
    :param headline: Substring marking the first line of the section.
    :return: The collected lines, each terminated by a line break, or an
        empty string when nothing was collected.
    """
    collected = []
    active = False
    for row in text.split('\n'):
        # Order matters: the start marker is evaluated before the end marker.
        if headline in row:
            active = True
        if row == ' ':
            active = False
        if active:
            collected.append(row + '\n')
    return ''.join(collected)


def parse_parameters(text):
    """
    Turn the text of a parameter section into a mapping of descriptions.

    A line counts as a heading when it equals its own upper-case form and is
    not purely numeric; this includes empty lines. The lines following a
    heading, up to the next different heading, form its description.

    :param text: Section text, lines separated by ``\\n``.
    :return: Dictionary mapping each heading to its stripped description.
        Line breaks inside a description are replaced by spaces, except for
        the heading ``RSKF``, which keeps them.
    """
    data = {}
    parameter = None
    pending = []

    def store(name, lines):
        """Record the collected description *lines* under heading *name*."""
        more = ''.join(lines)
        # Headings without any text and descriptions containing "eor" are
        # not recorded.
        # NOTE(review): this is a plain substring test, so it also drops
        # descriptions containing words such as "meteorological" - confirm
        # that this is intended.
        if more and 'eor' not in more:
            more = more.strip()
            if name != 'RSKF':
                more = more.replace('\n', ' ')
            data[name] = more

    for line in text.split('\n'):
        if line == line.upper() and not line.isnumeric():
            # A heading identical to the current one does not end the
            # description that is being collected.
            if line != parameter:
                store(parameter, pending)
                pending = []
            parameter = line
        elif parameter is not None:
            # Description lines are only collected once a heading was seen.
            pending.append(line + '\n')

    # Store the description of the last heading as well; without this it
    # would be lost whenever the text does not end with a heading-like line
    # (for example a trailing line break).
    store(parameter, pending)
    return data


def process(url):
    """
    Print the parameter descriptions of the PDF at *url* as an ASCII table.

    The document text is obtained with ``read_pdf``, reduced to the section
    starting at "Parameters" with ``parse_section`` and converted into a
    mapping with ``parse_parameters``.

    :param url: Address of the PDF document to process.
    """
    section = parse_section(read_pdf(url), 'Parameters')
    rows = list(parse_parameters(section).items())

    # Alternative output format: a JSON dump of the mapping
    # (``json.dumps`` with ``indent=4``).

    # One table row per (heading, description) pair.
    table = tabulate(rows, tablefmt="psql")
    print(table)


if __name__ == '__main__':
    # Description documents to process, one per dataset.
    ten_minutes_air = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/air_temperature/recent/DESCRIPTION_obsgermany_climate_10min_tu_recent_en.pdf'
    hourly_solar = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/solar/DESCRIPTION_obsgermany_climate_hourly_solar_en.pdf'
    daily_kl = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/recent/DESCRIPTION_obsgermany_climate_daily_kl_recent_en.pdf'
    documents = (ten_minutes_air, hourly_solar, daily_kl)

    # Echo each URL, print its parameter table, then an empty separator line.
    for item in documents:
        print(item)
        process(item)
        print()
	"""
	Setup::

	pip install requests 'PyPDF2<3' tabulate

	Synopsis::

	python dwd_description_pdf.py
	"""
	import re
	import json
	from io import StringIO, BytesIO
	import requests
	import PyPDF2
	from tabulate import tabulate


	def read_pdf(url):
		"""
		Download a PDF document and return the text extracted from all pages.

		The page footer produced by the text extraction (``www.dwd.de``, a
		dash, the page number and another dash, each followed by a line
		break) is removed from every page before the pages are concatenated.

		:param url: HTTP(S) address of the PDF document.
		:return: Extracted text of all pages as one string.
		:raises requests.HTTPError: If the server answers with an error status.
		"""
		# Fail early on HTTP errors instead of handing an error page to the
		# PDF parser, and do not wait forever on an unresponsive server.
		response = requests.get(url, timeout=60)
		response.raise_for_status()

		pdf = PyPDF2.PdfFileReader(BytesIO(response.content))

		# Raw string, so that ``\.`` and ``\d`` reach the regex engine
		# untouched instead of being (invalid) string escape sequences.
		footer = re.compile(r'www\.dwd\.de\n-\n\d+\n-\n')

		pages = []
		for page_number in range(pdf.numPages):
			page_text = pdf.getPage(page_number).extractText()
			pages.append(footer.sub('', page_text))
		return ''.join(pages)


	def parse_section(text, headline):
		"""
		Extract the lines of a section from *text*.

		Collecting starts at any line containing *headline* (that line is
		part of the result) and stops at a line consisting of exactly one
		space.

		:param text: Document text, lines separated by ``\\n``.
		:param headline: Substring marking the first line of the section.
		:return: The collected lines, each terminated by a line break, or an
			empty string when nothing was collected.
		"""
		collected = []
		active = False
		for row in text.split('\n'):
			# Order matters: the start marker is evaluated before the end
			# marker.
			if headline in row:
				active = True
			if row == ' ':
				active = False
			if active:
				collected.append(row + '\n')
		return ''.join(collected)


	def parse_parameters(text):
		"""
		Turn the text of a parameter section into a mapping of descriptions.

		A line counts as a heading when it equals its own upper-case form and
		is not purely numeric; this includes empty lines. The lines following
		a heading, up to the next different heading, form its description.

		:param text: Section text, lines separated by ``\\n``.
		:return: Dictionary mapping each heading to its stripped description.
			Line breaks inside a description are replaced by spaces, except
			for the heading ``RSKF``, which keeps them.
		"""
		data = {}
		parameter = None
		pending = []

		def store(name, lines):
			"""Record the collected description *lines* under heading *name*."""
			more = ''.join(lines)
			# Headings without any text and descriptions containing "eor"
			# are not recorded.
			# NOTE(review): this is a plain substring test, so it also drops
			# descriptions containing words such as "meteorological" -
			# confirm that this is intended.
			if more and 'eor' not in more:
				more = more.strip()
				if name != 'RSKF':
					more = more.replace('\n', ' ')
				data[name] = more

		for line in text.split('\n'):
			if line == line.upper() and not line.isnumeric():
				# A heading identical to the current one does not end the
				# description that is being collected.
				if line != parameter:
					store(parameter, pending)
					pending = []
				parameter = line
			elif parameter is not None:
				# Description lines are only collected once a heading was
				# seen.
				pending.append(line + '\n')

		# Store the description of the last heading as well; without this it
		# would be lost whenever the text does not end with a heading-like
		# line (for example a trailing line break).
		store(parameter, pending)
		return data


	def process(url):
		"""
		Print the parameter descriptions of the PDF at *url* as an ASCII table.

		The document text is obtained with ``read_pdf``, reduced to the
		section starting at "Parameters" with ``parse_section`` and converted
		into a mapping with ``parse_parameters``.

		:param url: Address of the PDF document to process.
		"""
		document = read_pdf(url)
		parameters_text = parse_section(document, 'Parameters')
		parameters = parse_parameters(parameters_text)

		# Alternative output format: a JSON dump of the mapping
		# (``json.dumps`` with ``indent=4``).

		# One table row per (heading, description) pair.
		print(tabulate(list(parameters.items()), tablefmt="psql"))


	if __name__ == '__main__':
		# Description documents to process, one per dataset.
		ten_minutes_air = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/air_temperature/recent/DESCRIPTION_obsgermany_climate_10min_tu_recent_en.pdf'
		hourly_solar = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/solar/DESCRIPTION_obsgermany_climate_hourly_solar_en.pdf'
		daily_kl = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/recent/DESCRIPTION_obsgermany_climate_daily_kl_recent_en.pdf'

		# Echo each URL, print its parameter table, then an empty separator
		# line.
		for item in ten_minutes_air, hourly_solar, daily_kl:
			print(item)
			process(item)
			print()