Skip to content

Instantly share code, notes, and snippets.

@amotl
Created July 7, 2020 23:37
Show Gist options
  • Save amotl/ffc9d4b864660c93bed53d59547a5fce to your computer and use it in GitHub Desktop.
Save amotl/ffc9d4b864660c93bed53d59547a5fce to your computer and use it in GitHub Desktop.
Parse description PDF files from DWD CDC server
"""
Setup::
pip install requests "PyPDF2<3" tabulate
Synopsis::
python dwd_description_pdf.py
"""
import re
import json
from io import StringIO, BytesIO
import requests
import PyPDF2
from tabulate import tabulate
def read_pdf(url):
    """
    Download a PDF document and return the extracted text of all its pages.

    :param url: Address of the PDF file, fetched with ``requests.get``.
    :return: Text of all pages concatenated in page order. The per-page
        footer of the form ``www.dwd.de / - / <digits> / -`` (one token per
        line) is removed from each page.
    :raises requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url)
    # Fail early on 4xx/5xx instead of handing an error page to the PDF parser.
    response.raise_for_status()
    pdf = PyPDF2.PdfFileReader(BytesIO(response.content))

    # Raw string: "\." and "\d" are invalid escape sequences in a plain
    # literal. Compiled once because it is applied to every page.
    footer = re.compile(r'www\.dwd\.de\n-\n\d+\n-\n')

    text = StringIO()
    for page_number in range(pdf.numPages):
        page = pdf.getPage(page_number)
        text.write(footer.sub('', page.extractText()))
    return text.getvalue()
def parse_section(text, headline):
    """
    Extract the part of a document that starts at a headline.

    Collection switches on at any line containing ``headline`` (that line is
    included) and switches off at a line consisting of exactly one space.
    A later line containing ``headline`` switches collection on again.

    :param text: Full document text with lines separated by ``\\n``.
    :param headline: Substring that marks the start of the section.
    :return: The collected lines, each terminated by ``\\n``; an empty string
        if no line contains ``headline``.
    """
    collected = []
    active = False
    for row in text.split('\n'):
        # Order matters: the start marker is evaluated before the end marker.
        if headline in row:
            active = True
        if row == ' ':
            active = False
        if active:
            collected.append(row + '\n')
    return ''.join(collected)
def _store_parameter(data, parameter, lines):
    """Record the description collected in ``lines`` under ``parameter``."""
    more = ''.join(line + '\n' for line in lines)
    # Descriptions containing "eor" are skipped (presumably the end-of-record
    # marker column; confirm against the source documents).
    if more and 'eor' not in more:
        more = more.strip()
        # RSKF keeps its line breaks, all other descriptions are folded
        # onto a single line.
        if parameter not in ['RSKF']:
            more = more.replace('\n', ' ')
        data[parameter] = more


def parse_parameters(text):
    """
    Parse a parameter section into a mapping of name -> description.

    A line is taken as a parameter name when it equals its own upper-cased
    form and is not purely numeric; purely numeric lines therefore stay part
    of the running description. All lines up to the next, different name make
    up the description of the current parameter. Lines before the first name
    are ignored.

    :param text: Section text with lines separated by ``\\n``.
    :return: dict mapping each parameter name to its description, in order of
        appearance. Parameters without description text are omitted.
    """
    data = {}
    parameter = None
    description = []
    for line in text.split('\n'):
        is_name = line == line.upper() and not line.isnumeric()
        if not is_name:
            # Description text only counts once a parameter name was seen.
            if parameter is not None:
                description.append(line)
            continue
        if line != parameter:
            _store_parameter(data, parameter, description)
            description = []
            parameter = line
    # Flush the trailing parameter: without this, input that does not end
    # with a newline would silently lose its last entry.
    _store_parameter(data, parameter, description)
    return data
def process(url):
    """
    Fetch one description PDF and print its parameter table to stdout.

    :param url: Address of the description PDF to process.
    """
    section = parse_section(read_pdf(url), 'Parameters')
    rows = list(parse_parameters(section).items())
    # The table is printed as ASCII; for JSON output, dump the parsed
    # mapping with ``json.dumps(..., indent=4)`` instead.
    print(tabulate(rows, tablefmt="psql"))
if __name__ == '__main__':
    # Description documents to process, in this order:
    # 10-minute air temperature, hourly solar, daily climate (kl).
    description_urls = (
        'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/air_temperature/recent/DESCRIPTION_obsgermany_climate_10min_tu_recent_en.pdf',
        'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/solar/DESCRIPTION_obsgermany_climate_hourly_solar_en.pdf',
        'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/recent/DESCRIPTION_obsgermany_climate_daily_kl_recent_en.pdf',
    )
    for description_url in description_urls:
        # Print the URL, then its table, then a blank separator line.
        print(description_url)
        process(description_url)
        print()
@amotl
Copy link
Author

amotl commented Jul 10, 2020

Output

https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/10_minutes/air_temperature/recent/DESCRIPTION_obsgermany_climate_10min_tu_recent_en.pdf

+-------------+--------------------------------------------------------------------------+
| STATIONS_ID | station identification number                                            |
| MESS_DATUM  | measurement time yyyymmddhhmi                                            |
| QN          | quality level of next columns coding see paragraph "Quality information" |
| PP_10       | pressure at station height hPa                                           |
| TT_10       | air temperature at 2m height                                             |
| TM5_10      | air temperature at 5cm height                                            |
| RF_10       | relative humidity at 2m height                                           |
| TD_10       | dew point temperature at 2m height                                       |
+-------------+--------------------------------------------------------------------------+

https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/solar/DESCRIPTION_obsgermany_climate_hourly_solar_en.pdf

+-------------+--------------------------------------------------------------------------+
| STATIONS_ID | station identification number                                            |
| MESS_DATUM  | end of interval in UTC yyyymmddhh:mm                                     |
| QN_592      | quality level of next columns coding see paragraph "Quality information" |
| ATMO_STRAHL | hourly sum of longwave downward radiation J/cm^2                         |
| FD_STRAHL   | hourly sum of diffuse solar radiation J/cm^2                             |
| FG_STRAHL   | hourly sum of solar incoming radiation J/cm^2                            |
| SD_STRAHL   | hourly sum of sunshine duration min                                      |
| ZENITH      | solar zenith angle at mid of interval degree                             |
+-------------+--------------------------------------------------------------------------+

https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/recent/DESCRIPTION_obsgermany_climate_daily_kl_recent_en.pdf

+-------------+--------------------------------------------------------------------------+
| STATIONS_ID | station id                                                               |
| MESS_DATUM  | date yyyymmdd                                                            |
| QN_3        | quality level of next columns coding see paragraph "Quality information" |
| FX          | daily maximum of wind gust m/s                                           |
| FM          | daily mean of wind velocity m/s                                          |
| QN_4        | quality level of next columns coding see paragraph "Quality information" |
| RSK         | daily precipitation height mm                                            |
| RSKF        | precipitation form                                                       |
|             | 0                                                                        |
|             | no precipitation                                                         |
|             | (conventional or automatic                                               |
|             | measurement), relates to                                                 |
|             | WMO code 10                                                              |
|             | 0                                                                        |
|             | 1                                                                        |
|             | only rain (before 1979)                                                  |
|             | 4                                                                        |
|             | unknown form of recorded                                                 |
|             | precipitation                                                            |
|             | 6                                                                        |
|             | only rain; only liquid                                                   |
|             | precipitation at automatic                                               |
|             | stations, relates to WMO                                                 |
|             | code 11                                                                  |
|             | 7                                                                        |
|             | only snow; only solid                                                    |
|             | precipitation at automatic                                               |
|             | stations, relates to WMO                                                 |
|             | code 12                                                                  |
|             | 8                                                                        |
|             | rain and snow (and/or                                                    |
|             | "Schneeregen"); liquid                                                   |
|             | and solid precipitation at                                               |
|             | automatic stations, relates                                              |
|             | to WMO code 13                                                           |
|             | 9                                                                        |
|             | error or missing                                                         |
|             | value or no automatic                                                    |
|             | determination of                                                         |
|             | precipitation form, relates                                              |
|             | to WMO code 15                                                           |
| SDK         | daily sunshine duration h                                                |
| SHK_TAG     | daily snow depth cm                                                      |
| NM          | daily mean of cloud cover                                                |
| VPM         | daily mean of vapor pressure hPa                                         |
| PM          | daily mean of pressure hPa                                               |
| TMK         | daily mean of temperature                                                |
| UPM         | daily mean of relative humidity                                          |
| TXK         | daily maximum of temperature at 2m height                                |
| TNK         | daily minimum of temperature at 2m height                                |
| TGK         | daily minimum of air temperature at 5cm above ground                     |
+-------------+--------------------------------------------------------------------------+

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment