Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Jakość powietrza w Polsce #2 - Rok 2017 oczami oddychającego Polaka
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import glob\nimport os\nimport pandas as pd\nimport numpy as np\nimport geopy.distance\nfrom tqdm import tqdm",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "filenames = [ os.path.splitext(wholeFilename)[0] for wholeFilename in \n [ os.path.basename(wholePath) for wholePath in glob.glob(\"../input/2017/2*.xlsx\") ] ]\n\ndataFiles = pd.DataFrame({\"filename\": filenames})\ndataFiles[\"year\"], dataFiles[\"pollutant\"], dataFiles[\"resolution\"] = dataFiles[\"filename\"].str.split('_', 2).str",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "metadata = pd.read_excel(\"../input/Metadane_wer20180829.xlsx\")",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "cities_and_coords = pd.read_csv(\"../output/cities_and_coords.csv\", index_col=0)\ncities_and_coords[\"coords\"] = cities_and_coords[\"coords\"].str[1:-1].str.split(\",\").map(lambda x: (float(x[0]),float(x[1])))",
"execution_count": 65,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "pollutants = [\"PM25\", \"PM10\", \"C6H6\", \"NO2\", \"SO2\", \"O3\", \"CO\"]\npollutants_dist = [x+\"_dist\" for x in pollutants]",
"execution_count": 101,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "files = dataFiles[(dataFiles[\"resolution\"] == \"1g\") & (dataFiles[\"pollutant\"].isin(pollutants))]",
"execution_count": 6,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "cols = [\"city\"]\ncols.extend(pollutants)\ncols.extend(pollutants_dist)\nresults = pd.DataFrame(columns = cols)\nresults.set_index(\"city\", inplace = True)",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "results",
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 8,
"data": {
"text/plain": "Empty DataFrame\nColumns: [PM25, PM10, C6H6, NO2, SO2, O3, CO, PM25_dist, PM10_dist, C6H6_dist, NO2_dist, SO2_dist, O3_dist, CO_dist]\nIndex: []",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PM25</th>\n <th>PM10</th>\n <th>C6H6</th>\n <th>NO2</th>\n <th>SO2</th>\n <th>O3</th>\n <th>CO</th>\n <th>PM25_dist</th>\n <th>PM10_dist</th>\n <th>C6H6_dist</th>\n <th>NO2_dist</th>\n <th>SO2_dist</th>\n <th>O3_dist</th>\n <th>CO_dist</th>\n </tr>\n <tr>\n <th>city</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#pollutant = \"PM25\"",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#pollutants = [\"NO2\"]\nfor pollutant in pollutants:\n print(pollutant)\n file = \"2017_{}_1g\".format(pollutant)\n data = pd.read_excel(\"../input/2017/{}.xlsx\".format(file), skiprows=[0,2,3,4,5], )\n data = data.rename(columns={\"Kod stacji\":\"Hour\"})\n data.set_index(\"Hour\", inplace = True)\n data = data.replace({',':'.'}, regex=True).astype(\"float\")\n\n #print(data.shape)\n data.dropna(axis = 1, how=\"all\", inplace=True)\n #print(data.shape)\n\n year_mean = data.mean()\n\n current_stations = metadata[metadata[\"Kod stacji\"].isin(year_mean.index)]\n\n assert year_mean.shape[0] == current_stations.shape[0]\n \n for city in tqdm(cities_and_coords[\"city\"]):\n\n lat = cities_and_coords[cities_and_coords[\"city\"] == city][\"coords\"].item()[0]\n lon = cities_and_coords[cities_and_coords[\"city\"] == city][\"coords\"].item()[1]\n\n coords_1 = (lat, lon)\n\n tmp_stations = pd.DataFrame()\n\n for index, row in current_stations.iterrows():\n coords_2 = (row[\"WGS84 φ N\"], row[\"WGS84 λ E\"])\n dist = geopy.distance.vincenty(coords_1, coords_2).km\n row['dist'] = dist\n tmp_stations = tmp_stations.append(row,ignore_index=False)\n\n nearest_station = tmp_stations.sort_values([\"dist\"]).iloc[0]\n\n pollution = year_mean[nearest_station[\"Kod stacji\"]]\n dist = nearest_station[\"dist\"]\n\n \n #Debug:\n if city in [\"blah\"]:\n print(\"----------\")\n print(city)\n print(\"Najbliższa stacja: {}\".format(nearest_station[\"Nazwa stacji\"]))\n print(\"Odległośc w km: {}\".format(dist))\n print(\"Średni roczny pomiar: {}\".format(pollution))\n print(\"Wspolrzedne z api: {}\".format(coords_1))\n \n \n results.at[city,pollutant] = pollution\n results.at[city,pollutant+\"_dist\"] = dist",
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"text": "PM25\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "100%|██████████| 930/930 [09:18<00:00, 1.67it/s]\n",
"name": "stderr"
},
{
"output_type": "stream",
"text": "PM10\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "100%|██████████| 930/930 [26:54<00:00, 1.74s/it]\n",
"name": "stderr"
},
{
"output_type": "stream",
"text": "C6H6\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "100%|██████████| 930/930 [09:34<00:00, 1.62it/s]\n",
"name": "stderr"
},
{
"output_type": "stream",
"text": "NO2\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "100%|██████████| 930/930 [30:15<00:00, 1.95s/it]\n",
"name": "stderr"
},
{
"output_type": "stream",
"text": "SO2\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "100%|██████████| 930/930 [27:57<00:00, 1.80s/it]\n",
"name": "stderr"
},
{
"output_type": "stream",
"text": "O3\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "100%|██████████| 930/930 [21:31<00:00, 1.39s/it]\n",
"name": "stderr"
},
{
"output_type": "stream",
"text": "CO\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "100%|██████████| 930/930 [17:12<00:00, 1.11s/it]\n",
"name": "stderr"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "max_dist = results[[\"PM25_dist\", \"PM10_dist\", \"C6H6_dist\", \"NO2_dist\", \"SO2_dist\", \"O3_dist\", \"CO_dist\"]].max().max()",
"execution_count": 11,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#def calculate_weights(max_dist, tmp_df):\n# for pollutant in pollutants:\n# tmp_df[pollutant+\"_weight\"] = max_dist - tmp_df[pollutant+\"_dist\"]\n# return tmp_df",
"execution_count": 12,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from sklearn.preprocessing import MinMaxScaler\n\ndef calculate_weights(max_dist, tmp_df):\n scaler = MinMaxScaler()\n for pollutant in pollutants:\n print(\"Dla {} największa odległosć to {} km.\".format(pollutant,tmp_df[pollutant+\"_dist\"].max()))\n print(\"Dla {} najmniejsza odległosć to {} km.\".format(pollutant,tmp_df[pollutant+\"_dist\"].min()))\n tmp_df[pollutant+\"_weight\"] = scaler.fit_transform(tmp_df[pollutant+\"_dist\"].values.reshape(-1, 1))\n tmp_df[pollutant+\"_weight\"] = 1 - tmp_df[pollutant+\"_weight\"]\n return tmp_df",
"execution_count": 13,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "results = calculate_weights(max_dist=max_dist, tmp_df=results)",
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"text": "Dla PM25 największa odległosć to 126.390478444573 km.\nDla PM25 najmniejsza odległosć to 0.15503437344974816 km.\nDla PM10 największa odległosć to 91.81186928346621 km.\nDla PM10 najmniejsza odległosć to 0.15503437344974816 km.\nDla C6H6 największa odległosć to 220.13928076476554 km.\nDla C6H6 najmniejsza odległosć to 0.15503437344974816 km.\nDla NO2 największa odległosć to 74.40068503102187 km.\nDla NO2 najmniejsza odległosć to 0.06033904330254862 km.\nDla SO2 największa odległosć to 74.40068503102187 km.\nDla SO2 najmniejsza odległosć to 0.06033904330254862 km.\nDla O3 największa odległosć to 106.06226195563012 km.\nDla O3 najmniejsza odległosć to 0.06033904330254862 km.\nDla CO największa odległosć to 116.03986566591892 km.\nDla CO najmniejsza odległosć to 0.15503437344974816 km.\n",
"name": "stdout"
},
{
"output_type": "stream",
"text": "/home/damian/miniconda3/envs/jakbadacdane.pl/lib/python3.6/site-packages/sklearn/utils/validation.py:475: DataConversionWarning: Data with input dtype object was converted to float64 by MinMaxScaler.\n warnings.warn(msg, DataConversionWarning)\n",
"name": "stderr"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def normalize(tmp_df):\n tmp_df[\"PM25_norm\"] = tmp_df[\"PM25\"] / 121\n tmp_df[\"PM10_norm\"] = tmp_df[\"PM10\"] / 201 \n tmp_df[\"O3_norm\"] = tmp_df[\"O3\"] / 241\n tmp_df[\"NO2_norm\"] = tmp_df[\"NO2\"] / 401 \n tmp_df[\"SO2_norm\"] = tmp_df[\"SO2\"] / 501 \n tmp_df[\"C6H6_norm\"] = tmp_df[\"C6H6\"] / 51 \n tmp_df[\"CO_norm\"] = tmp_df[\"CO\"] / 21\n return tmp_df",
"execution_count": 15,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "results = normalize(tmp_df=results)",
"execution_count": 16,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "results[\"weighted_avg\"] = results.apply(lambda x: \n (x[\"PM25_norm\"] * x[\"PM25_weight\"] + \n x[\"PM10_norm\"] * x[\"PM10_weight\"] +\n x[\"O3_norm\"] * x[\"O3_weight\"] + \n x[\"NO2_norm\"] * x[\"NO2_weight\"] +\n x[\"SO2_norm\"] * x[\"SO2_weight\"] +\n x[\"C6H6_norm\"] * x[\"C6H6_weight\"] +\n x[\"CO_norm\"] * x[\"CO_weight\"]) /\n (x[\"PM25_weight\"] + \n x[\"PM10_weight\"] + \n x[\"O3_weight\"] + \n x[\"NO2_weight\"] +\n x[\"SO2_weight\"] + \n x[\"C6H6_weight\"] +\n x[\"CO_weight\"]), axis=1)",
"execution_count": 17,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def PM10(val):\n if val < 21:\n color = \"#009900\"\n elif (val >= 21) & (val < 61):\n color = \"#99ff33\"\n elif (val >= 61) & (val < 101):\n color = \"#ffff00\"\n elif (val >= 101) & (val < 141):\n color = \"#ff6600\"\n elif (val >= 141) & (val < 201):\n color = \"#ff0000\"\n else:\n color = \"#990000\"\n return 'background-color: %s' % color\n\ndef PM25(val):\n if val < 13:\n color = \"#009900\"\n elif (val >= 13) & (val < 37):\n color = \"#99ff33\"\n elif (val >= 37) & (val < 61):\n color = \"#ffff00\"\n elif (val >= 61) & (val < 85):\n color = \"#ff6600\"\n elif (val >= 85) & (val < 121):\n color = \"#ff0000\"\n else:\n color = \"#990000\"\n return 'background-color: %s' % color\n\ndef SO2(val):\n if val < 51:\n color = \"#009900\"\n elif (val >= 51) & (val < 101):\n color = \"#99ff33\"\n elif (val >= 101) & (val < 201):\n color = \"#ffff00\"\n elif (val >= 201) & (val < 351):\n color = \"#ff6600\"\n elif (val >= 351) & (val < 501):\n color = \"#ff0000\"\n else:\n color = \"#990000\"\n return 'background-color: %s' % color\n\ndef O3(val):\n if val < 71:\n color = \"#009900\"\n elif (val >= 71) & (val < 121):\n color = \"#99ff33\"\n elif (val >= 121) & (val < 151):\n color = \"#ffff00\"\n elif (val >= 151) & (val < 181):\n color = \"#ff6600\"\n elif (val >= 181) & (val < 241):\n color = \"#ff0000\"\n else:\n color = \"#990000\"\n return 'background-color: %s' % color\n\ndef C6H6(val):\n if val < 6:\n color = \"#009900\"\n elif (val >= 6) & (val < 11):\n color = \"#99ff33\"\n elif (val >= 11) & (val < 16):\n color = \"#ffff00\"\n elif (val >= 16) & (val < 21):\n color = \"#ff6600\"\n elif (val >= 21) & (val < 51):\n color = \"#ff0000\"\n else:\n color = \"#990000\"\n return 'background-color: %s' % color\n\ndef CO(val):\n if val < 3000:\n color = \"#009900\"\n elif (val >= 3000) & (val < 7000):\n color = \"#99ff33\"\n elif (val >= 7000) & (val < 11000):\n color = \"#ffff00\"\n elif (val >= 11000) & (val < 15000):\n color = \"#ff6600\"\n elif (val >= 15000) & (val < 21000):\n color = \"#ff0000\"\n else:\n color = \"#990000\"\n return 'background-color: %s' % color\n\ndef NO2(val):\n if val < 41:\n color = \"#009900\"\n elif (val >= 41) & (val < 101):\n color = \"#99ff33\"\n elif (val >= 101) & (val < 151):\n color = \"#ffff00\"\n elif (val >= 151) & (val < 201):\n color = \"#ff6600\"\n elif (val >= 201) & (val < 401):\n color = \"#ff0000\"\n else:\n color = \"#990000\"\n return 'background-color: %s' % color",
"execution_count": 18,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "results.sort_values([\"weighted_avg\"], ascending = False, inplace=True)",
"execution_count": 19,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "results.style.applymap(PM10, subset=['PM10']).applymap(PM25, subset=['PM25']). \\\napplymap(SO2, subset=['SO2']).applymap(O3, subset=['O3']).applymap(C6H6, subset=['C6H6']). \\\napplymap(CO, subset=['CO']).applymap(NO2, subset=['NO2'])",
"execution_count": 20,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 20,
"data": {
"text/plain": "<pandas.io.formats.style.Styler at 0x7f0ed7b6abe0>",
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment