Skip to content

Instantly share code, notes, and snippets.

@AlexKaravaev
Last active July 29, 2019 17:33
Show Gist options
  • Save AlexKaravaev/617716b103191b4a4e11691d7a1cfeac to your computer and use it in GitHub Desktop.
Save AlexKaravaev/617716b103191b4a4e11691d7a1cfeac to your computer and use it in GitHub Desktop.
# Names of the regular files located directly inside ./data/
# (entries that are not regular files, e.g. sub-directories, are skipped).
datafiles = list(
    filter(lambda name: isfile(join('./data/', name)), listdir('./data/'))
)

# Column labels of the result table: date, address, then one column per
# pollutant.
columns = [
    'Дата',
    'Адрес',
    'Оксид углерода',
    'Оксид азота',
    'Диоксид азота',
    'Диоксид серы',
    'Озон',
    'Взвешенные частицы PM10',
    'Взвешенные частицы PM2.5',
]
def addDate(fn):
    """Build a table of pollutant readings from one report document.

    The file ``data/<fn>`` is opened in binary mode and passed to
    ``Document`` (presumably python-docx -- confirm against the file's
    imports).  Every paragraph whose text matches
    ``адресу: ... (станции`` starts a new record; the N-th matching
    paragraph is paired with the N-th table of the document.  For each
    table row except the first, the text of the first cell is used as the
    column label and the text of the second cell as the value.

    Args:
        fn: File name inside ``data/``.  The part before the first ``.``
            is stored in the ``'Дата'`` column of every record.

    Returns:
        A ``pandas.DataFrame`` with one row per matching paragraph.  The
        labels from the module-level ``columns`` come first; labels found
        in the tables that are not in ``columns`` are added after them.
        When nothing matches, an empty frame with ``columns`` is returned.

    Raises:
        IndexError: If there are more matching paragraphs than tables, or
            a table row has fewer than two cells.
    """
    date = fn.split('.')[0]
    # Rows are collected in a plain list and turned into a DataFrame once:
    # ``DataFrame.append`` was removed in pandas 2.0 and copied the whole
    # frame on every call.
    records = []
    with open(abspath('data/' + fn), 'rb') as f:
        pollution_report = Document(f)
        adr_cnt = 0
        for paragraph in pollution_report.paragraphs:
            match = re.search(r"адресу: (.*?)\(станции", paragraph.text)
            if match is None:
                continue
            row_df = dict.fromkeys(columns)
            row_df['Адрес'] = match.group(1)
            row_df['Дата'] = date
            table_rows = pollution_report.tables[adr_cnt].rows
            # Row 0 is skipped (presumably the table header).
            for j in range(1, len(table_rows)):
                cells = table_rows[j].cells
                row_df[cells[0].text] = cells[1].text
            records.append(row_df)
            adr_cnt += 1
    if not records:
        # Keep the expected schema even when no address was found.
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(records)
# Normalize addresses for better geocoding search
def filter_adress(adr):
    """Expand abbreviations in an address and prefix it with a region.

    Each key of the module-level ``abbr`` mapping that occurs in ``adr``
    is replaced by its value, in the mapping's iteration order, and
    trailing whitespace is stripped.  If the space-separated result
    contains neither the word 'город' nor 'поселок',
    'Санкт-Петербург, ' is put in front of it.  'Россия, ' is then put
    in front in every case.

    Returns the resulting string.
    """
    result = adr
    for short, full in abbr.items():
        result = result.replace(short, full)
    result = result.rstrip()

    words = result.split(' ')
    names_locality = 'город' in words or 'поселок' in words
    if not names_locality:
        result = 'Санкт-Петербург, ' + result
    return 'Россия, ' + result
# Abbreviation -> replacement text used by filter_adress.  Entries are
# applied in this order.
abbr = {
    'пр.': 'проспект',
    'ул.': 'улица',
    'В.О.': '',
    'пер.': 'переулок',
    'г.': 'город',
    'пос.': 'поселок',
    'М.': 'Максима',
}

# Rewrite every address in the table, then collect the distinct results.
df['Адрес'] = df['Адрес'].map(filter_adress)
adresses = df['Адрес'].unique()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment