Created
June 1, 2018 12:30
-
-
Save prithwi/6356b561aeb7e7dbd4a298885de90c8c to your computer and use it in GitHub Desktop.
Notebook to parse the list of accepted KDD 2018 papers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from lxml import html\n", | |
"import numpy as np\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"tree = html.parse('./KDD 2018 _ Accepted Papers.htm')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Research" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Research - Oral" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>Title</th>\n", | |
" <th>Authors</th>\n", | |
" <th>Track</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>133</td>\n", | |
" <td>R2SDH: Robust Rotated Supervised Discrete Hashing</td>\n", | |
" <td>Jie Gui (Rutgers University); Ping Li (Baidu R...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>22</td>\n", | |
" <td>Smoothed Dilated Convolutions for Improved Den...</td>\n", | |
" <td>Zhengyang Wang (Washington State University); ...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>27</td>\n", | |
" <td>Discovering Non-Redundant K-means Clusterings ...</td>\n", | |
" <td>Dominik Mautz (Ludwig Maximilian University of...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>44</td>\n", | |
" <td>Trajectory-driven Influential Billboard Placement</td>\n", | |
" <td>Ping Zhang (Wuhan University); Zhifeng Bao (RM...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>83</td>\n", | |
" <td>Multi-Type Itemset Embedding for Learning Beha...</td>\n", | |
" <td>Daheng Wang (University of Notre Dame); Meng J...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID Title \\\n", | |
"0 133 R2SDH: Robust Rotated Supervised Discrete Hashing \n", | |
"1 22 Smoothed Dilated Convolutions for Improved Den... \n", | |
"2 27 Discovering Non-Redundant K-means Clusterings ... \n", | |
"3 44 Trajectory-driven Influential Billboard Placement \n", | |
"4 83 Multi-Type Itemset Embedding for Learning Beha... \n", | |
"\n", | |
" Authors Track \n", | |
"0 Jie Gui (Rutgers University); Ping Li (Baidu R... Research - Oral \n", | |
"1 Zhengyang Wang (Washington State University); ... Research - Oral \n", | |
"2 Dominik Mautz (Ludwig Maximilian University of... Research - Oral \n", | |
"3 Ping Zhang (Wuhan University); Zhifeng Bao (RM... Research - Oral \n", | |
"4 Daheng Wang (University of Notre Dame); Meng J... Research - Oral " | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"res_oral_xpath = '/html/body/main/div[1]/section/div[1]/div/div[1]/div/ul'\n", | |
"\n", | |
"res_oral_list = tree.xpath(res_oral_xpath)[0].xpath('li/div/span/text()')\n", | |
"\n", | |
"res_oral_df = pd.DataFrame(np.array(res_oral_list)\n", | |
" .reshape(-1, 3))\n", | |
"res_oral_df = pd.concat((res_oral_df[0].str.split('^(\\d*):', \n", | |
" expand=True)[[1,2]],\n", | |
" res_oral_df[2]),\n", | |
" axis=1)\n", | |
"res_oral_df.columns = ['ID', 'Title', 'Authors']\n", | |
"\n", | |
"res_oral_df['ID'] = res_oral_df.ID.str.strip()\n", | |
"res_oral_df['Title'] = res_oral_df.Title.str.strip()\n", | |
"res_oral_df['Authors'] = res_oral_df.Authors.str.strip()\n", | |
"\n", | |
"res_oral_df['Track'] = \"Research - Oral\"\n", | |
"res_oral_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>Title</th>\n", | |
" <th>Authors</th>\n", | |
" <th>Track</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>102</th>\n", | |
" <td>1418</td>\n", | |
" <td>Investor-Imitator: A Framework for Trading Kno...</td>\n", | |
" <td>Yi Ding (Nanjing University of Aeronautics and...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>103</th>\n", | |
" <td>1437</td>\n", | |
" <td>Discrete Ranking-based Matrix Factorization wi...</td>\n", | |
" <td>Yan Zhang (University of Science and Technolog...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>104</th>\n", | |
" <td>1494</td>\n", | |
" <td>LARC: Learning Activity-Regularized overlappin...</td>\n", | |
" <td>Alexander Gorovits (University at Albany-SUNY)...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>105</th>\n", | |
" <td>1540</td>\n", | |
" <td>EvoGraph: An Effective and Efficient Graph Ups...</td>\n", | |
" <td>Himchan Park (DGIST); Min-Soo Kim (DGIST)</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>106</th>\n", | |
" <td>1596</td>\n", | |
" <td>Training Big Random Forests with Little Resources</td>\n", | |
" <td>Fabian Gieseke (University of Copenhagen); Chr...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID Title \\\n", | |
"102 1418 Investor-Imitator: A Framework for Trading Kno... \n", | |
"103 1437 Discrete Ranking-based Matrix Factorization wi... \n", | |
"104 1494 LARC: Learning Activity-Regularized overlappin... \n", | |
"105 1540 EvoGraph: An Effective and Efficient Graph Ups... \n", | |
"106 1596 Training Big Random Forests with Little Resources \n", | |
"\n", | |
" Authors Track \n", | |
"102 Yi Ding (Nanjing University of Aeronautics and... Research - Oral \n", | |
"103 Yan Zhang (University of Science and Technolog... Research - Oral \n", | |
"104 Alexander Gorovits (University at Albany-SUNY)... Research - Oral \n", | |
"105 Himchan Park (DGIST); Min-Soo Kim (DGIST) Research - Oral \n", | |
"106 Fabian Gieseke (University of Copenhagen); Chr... Research - Oral " | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"res_oral_df.tail()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Research - poster" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>Title</th>\n", | |
" <th>Authors</th>\n", | |
" <th>Track</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>62</td>\n", | |
" <td>Large-Scale Learnable Graph Convolutional Netw...</td>\n", | |
" <td>Hongyang Gao (Washington State University); Zh...</td>\n", | |
" <td>Research - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>64</td>\n", | |
" <td>Online Adaptive Asymmetric Active Learning for...</td>\n", | |
" <td>Yifan Zhang (South China University of Technol...</td>\n", | |
" <td>Research - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>148</td>\n", | |
" <td>Multi-Label Inference for Crowdsourcing</td>\n", | |
" <td>Jing Zhang (Nanjing University of Science and ...</td>\n", | |
" <td>Research - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>182</td>\n", | |
" <td>Deep Adversarial Learning for Multi-Modality M...</td>\n", | |
" <td>Lei Cai (Washington State University); Zhengya...</td>\n", | |
" <td>Research - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>227</td>\n", | |
" <td>SPARC: Self-Paced Network Representation for F...</td>\n", | |
" <td>Dawei Zhou (Arizona State University); Jingrui...</td>\n", | |
" <td>Research - poster</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID Title \\\n", | |
"0 62 Large-Scale Learnable Graph Convolutional Netw... \n", | |
"1 64 Online Adaptive Asymmetric Active Learning for... \n", | |
"2 148 Multi-Label Inference for Crowdsourcing \n", | |
"3 182 Deep Adversarial Learning for Multi-Modality M... \n", | |
"4 227 SPARC: Self-Paced Network Representation for F... \n", | |
"\n", | |
" Authors Track \n", | |
"0 Hongyang Gao (Washington State University); Zh... Research - poster \n", | |
"1 Yifan Zhang (South China University of Technol... Research - poster \n", | |
"2 Jing Zhang (Nanjing University of Science and ... Research - poster \n", | |
"3 Lei Cai (Washington State University); Zhengya... Research - poster \n", | |
"4 Dawei Zhou (Arizona State University); Jingrui... Research - poster " | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"res_poster_xpath = '/html/body/main/div[1]/section/div[1]/div/div[2]/div/ul'\n", | |
"\n", | |
"res_poster_list = tree.xpath(res_poster_xpath)[0].xpath('li/div/span/text()')\n", | |
"\n", | |
"res_poster_df = pd.DataFrame(np.array(res_poster_list)\n", | |
" .reshape(-1, 3))\n", | |
"res_poster_df = pd.concat((res_poster_df[0].str.split('^(\\d*):', \n", | |
" expand=True)[[1,2]],\n", | |
" res_poster_df[2]),\n", | |
" axis=1)\n", | |
"res_poster_df.columns = ['ID', 'Title', 'Authors']\n", | |
"\n", | |
"res_poster_df['ID'] = res_poster_df.ID.str.strip()\n", | |
"res_poster_df['Title'] = res_poster_df.Title.str.strip()\n", | |
"res_poster_df['Authors'] = res_poster_df.Authors.str.strip()\n", | |
"\n", | |
"res_poster_df['Track'] = \"Research - poster\"\n", | |
"res_poster_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>Title</th>\n", | |
" <th>Authors</th>\n", | |
" <th>Track</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>69</th>\n", | |
" <td>1450</td>\n", | |
" <td>Prediction-time Efficient Classification Using...</td>\n", | |
" <td>Liang Zhao (George Mason University); Amir Ali...</td>\n", | |
" <td>Research - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>70</th>\n", | |
" <td>1495</td>\n", | |
" <td>Approximating the Spectrum of a Graph</td>\n", | |
" <td>David Cohen-Steiner (INRIA); Weihao Kong (Stan...</td>\n", | |
" <td>Research - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>71</th>\n", | |
" <td>1500</td>\n", | |
" <td>When Sentiment Analysis Meets Social Network: ...</td>\n", | |
" <td>Lin Gong (University of Virginia); Hongning Wa...</td>\n", | |
" <td>Research - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>72</th>\n", | |
" <td>1501</td>\n", | |
" <td>Latent variable time-varying network inference</td>\n", | |
" <td>Federico Tomasi (DIBRIS - Universita degli stu...</td>\n", | |
" <td>Research - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>73</th>\n", | |
" <td>1523</td>\n", | |
" <td>Stablizing Reinforcement Learning in Dynamic E...</td>\n", | |
" <td>Shi-Yong Chen (Nanjing University); Yang Yu (N...</td>\n", | |
" <td>Research - poster</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID Title \\\n", | |
"69 1450 Prediction-time Efficient Classification Using... \n", | |
"70 1495 Approximating the Spectrum of a Graph \n", | |
"71 1500 When Sentiment Analysis Meets Social Network: ... \n", | |
"72 1501 Latent variable time-varying network inference \n", | |
"73 1523 Stablizing Reinforcement Learning in Dynamic E... \n", | |
"\n", | |
" Authors Track \n", | |
"69 Liang Zhao (George Mason University); Amir Ali... Research - poster \n", | |
"70 David Cohen-Steiner (INRIA); Weihao Kong (Stan... Research - poster \n", | |
"71 Lin Gong (University of Virginia); Hongning Wa... Research - poster \n", | |
"72 Federico Tomasi (DIBRIS - Universita degli stu... Research - poster \n", | |
"73 Shi-Yong Chen (Nanjing University); Yang Yu (N... Research - poster " | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"res_poster_df.tail()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Application" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Application - oral" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>Title</th>\n", | |
" <th>Authors</th>\n", | |
" <th>Track</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>A-51</td>\n", | |
" <td>I Know You’ll Be Back: Interpretable New User ...</td>\n", | |
" <td>Carl Yang (University Of Illinois, Urbana Cham...</td>\n", | |
" <td>Application - oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>A-594</td>\n", | |
" <td>PrePeP – A Tool for the Identification and Cha...</td>\n", | |
" <td>Maksim Koptelov (University of Caen Normandy);...</td>\n", | |
" <td>Application - oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>A-88</td>\n", | |
" <td>Large-Scale Order Dispatch in On-Demand Ride-S...</td>\n", | |
" <td>Zhe Xu (AI Labs, Didi Chuxing); Zhixin Li (AI ...</td>\n", | |
" <td>Application - oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>A-186</td>\n", | |
" <td>WattHome: Identifying Energy-Inefficient Homes...</td>\n", | |
" <td>Srinivasan Iyengar (University of Massachusett...</td>\n", | |
" <td>Application - oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>A-293</td>\n", | |
" <td>Perceive Your Users in Depth: Learning Univers...</td>\n", | |
" <td>Yabo Ni (Alibaba Group); Dan Ou (Alibaba Group...</td>\n", | |
" <td>Application - oral</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID Title \\\n", | |
"0 A-51 I Know You’ll Be Back: Interpretable New User ... \n", | |
"1 A-594 PrePeP – A Tool for the Identification and Cha... \n", | |
"2 A-88 Large-Scale Order Dispatch in On-Demand Ride-S... \n", | |
"3 A-186 WattHome: Identifying Energy-Inefficient Homes... \n", | |
"4 A-293 Perceive Your Users in Depth: Learning Univers... \n", | |
"\n", | |
" Authors Track \n", | |
"0 Carl Yang (University Of Illinois, Urbana Cham... Application - oral \n", | |
"1 Maksim Koptelov (University of Caen Normandy);... Application - oral \n", | |
"2 Zhe Xu (AI Labs, Didi Chuxing); Zhixin Li (AI ... Application - oral \n", | |
"3 Srinivasan Iyengar (University of Massachusett... Application - oral \n", | |
"4 Yabo Ni (Alibaba Group); Dan Ou (Alibaba Group... Application - oral " | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"app_oral_xpath = '/html/body/main/div[1]/section/div[1]/div/div[3]/div/ul'\n", | |
"\n", | |
"app_oral_list = tree.xpath(app_oral_xpath)[0].xpath('li/div/span/text()')\n", | |
"\n", | |
"app_oral_df = pd.DataFrame(np.array(app_oral_list)\n", | |
" .reshape(-1, 3))\n", | |
"app_oral_df = pd.concat((app_oral_df[0].str.split('^(A-\\d*):', \n", | |
" expand=True)[[1,2]],\n", | |
" app_oral_df[2]),\n", | |
" axis=1)\n", | |
"app_oral_df.columns = ['ID', 'Title', 'Authors']\n", | |
"\n", | |
"app_oral_df['ID'] = app_oral_df.ID.str.strip()\n", | |
"app_oral_df['Title'] = app_oral_df.Title.str.strip()\n", | |
"app_oral_df['Authors'] = app_oral_df.Authors.str.strip()\n", | |
"\n", | |
"app_oral_df['Track'] = \"Application - oral\"\n", | |
"app_oral_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>Title</th>\n", | |
" <th>Authors</th>\n", | |
" <th>Track</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>35</th>\n", | |
" <td>A-1158</td>\n", | |
" <td>ActiveRemediation: The Search for Lead Pipes i...</td>\n", | |
" <td>Jacob Abernethy (Georgia Institute of Technolo...</td>\n", | |
" <td>Application - oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>36</th>\n", | |
" <td>A-1253</td>\n", | |
" <td>Winner’s Curse: Bias Estimation for Total Effe...</td>\n", | |
" <td>Minyong Lee (Airbnb); Milan Shen (Airbnb)</td>\n", | |
" <td>Application - oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>37</th>\n", | |
" <td>A-1326</td>\n", | |
" <td>FireGuru: A real-time pipeline for spatio-temp...</td>\n", | |
" <td>Bhavkaran Swalia (Carnegie Mellon University);...</td>\n", | |
" <td>Application - oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38</th>\n", | |
" <td>A-1411</td>\n", | |
" <td>Applying the Delta method in metric analytics:...</td>\n", | |
" <td>Alex Deng (Microsoft); Ulf Knoblich (Microsoft...</td>\n", | |
" <td>Application - oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>39</th>\n", | |
" <td>A-1415</td>\n", | |
" <td>Where Will Dockless Shared Bikes be Stacked?—-...</td>\n", | |
" <td>Zhaoyang Liu (Shanghai Jiao Tong University); ...</td>\n", | |
" <td>Application - oral</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID Title \\\n", | |
"35 A-1158 ActiveRemediation: The Search for Lead Pipes i... \n", | |
"36 A-1253 Winner’s Curse: Bias Estimation for Total Effe... \n", | |
"37 A-1326 FireGuru: A real-time pipeline for spatio-temp... \n", | |
"38 A-1411 Applying the Delta method in metric analytics:... \n", | |
"39 A-1415 Where Will Dockless Shared Bikes be Stacked?—-... \n", | |
"\n", | |
" Authors Track \n", | |
"35 Jacob Abernethy (Georgia Institute of Technolo... Application - oral \n", | |
"36 Minyong Lee (Airbnb); Milan Shen (Airbnb) Application - oral \n", | |
"37 Bhavkaran Swalia (Carnegie Mellon University);... Application - oral \n", | |
"38 Alex Deng (Microsoft); Ulf Knoblich (Microsoft... Application - oral \n", | |
"39 Zhaoyang Liu (Shanghai Jiao Tong University); ... Application - oral " | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"app_oral_df.tail()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Application - poster" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>Title</th>\n", | |
" <th>Authors</th>\n", | |
" <th>Track</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>A-1001</td>\n", | |
" <td>Towards Knowledge Discovery from the Vatican S...</td>\n", | |
" <td>Donatella Firmani (Roma Tre University); Marco...</td>\n", | |
" <td>Application - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>A-1001</td>\n", | |
" <td>Towards Knowledge Discovery from the Vatican S...</td>\n", | |
" <td>Donatella Firmani (Roma Tre University); Marco...</td>\n", | |
" <td>Application - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>A-14</td>\n", | |
" <td>Visual Search at Alibaba</td>\n", | |
" <td>Yanhao Zhang (iDST, Alibaba Group); Pan Pan (i...</td>\n", | |
" <td>Application - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>A-53</td>\n", | |
" <td>Deep Distributed Fusion Network for Air Qualit...</td>\n", | |
" <td>Xiuwen Yi (Southwest Jiaotong University); Jun...</td>\n", | |
" <td>Application - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>A-65</td>\n", | |
" <td>Deep Interest Network for Click-Through Rate P...</td>\n", | |
" <td>Guorui Zhou (Alibaba-inc); Xiaoqiang Zhu (Alib...</td>\n", | |
" <td>Application - poster</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID Title \\\n", | |
"0 A-1001 Towards Knowledge Discovery from the Vatican S... \n", | |
"1 A-1001 Towards Knowledge Discovery from the Vatican S... \n", | |
"2 A-14 Visual Search at Alibaba \n", | |
"3 A-53 Deep Distributed Fusion Network for Air Qualit... \n", | |
"4 A-65 Deep Interest Network for Click-Through Rate P... \n", | |
"\n", | |
" Authors Track \n", | |
"0 Donatella Firmani (Roma Tre University); Marco... Application - poster \n", | |
"1 Donatella Firmani (Roma Tre University); Marco... Application - poster \n", | |
"2 Yanhao Zhang (iDST, Alibaba Group); Pan Pan (i... Application - poster \n", | |
"3 Xiuwen Yi (Southwest Jiaotong University); Jun... Application - poster \n", | |
"4 Guorui Zhou (Alibaba-inc); Xiaoqiang Zhu (Alib... Application - poster " | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"app_poster_xpath = '/html/body/main/div[1]/section/div[1]/div/div[4]/div/ul'\n", | |
"\n", | |
"app_poster_list = tree.xpath(app_poster_xpath)[0].xpath('li/div/span/text()')\n", | |
"\n", | |
"app_poster_df = pd.DataFrame(np.array(app_poster_list)\n", | |
" .reshape(-1, 3))\n", | |
"app_poster_df = pd.concat((app_poster_df[0].str.split('^(A-\\d*):', \n", | |
" expand=True)[[1,2]],\n", | |
" app_poster_df[2]),\n", | |
" axis=1)\n", | |
"app_poster_df.columns = ['ID', 'Title', 'Authors']\n", | |
"\n", | |
"app_poster_df['ID'] = app_poster_df.ID.str.strip()\n", | |
"app_poster_df['Title'] = app_poster_df.Title.str.strip()\n", | |
"app_poster_df['Authors'] = app_poster_df.Authors.str.strip()\n", | |
"\n", | |
"app_poster_df['Track'] = \"Application - poster\"\n", | |
"app_poster_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>Title</th>\n", | |
" <th>Authors</th>\n", | |
" <th>Track</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>70</th>\n", | |
" <td>A-1459</td>\n", | |
" <td>Alchemist: Accelerating Large-Scale Data Analy...</td>\n", | |
" <td>Alex Gittens (Rensselaer Polytechnic Institute...</td>\n", | |
" <td>Application - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>71</th>\n", | |
" <td>A-1513</td>\n", | |
" <td>MIX: Multi-Channel Information Crossing for Te...</td>\n", | |
" <td>Haolan Chen (Mobile Internet Group, Tencent); ...</td>\n", | |
" <td>Application - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>72</th>\n", | |
" <td>A-1535</td>\n", | |
" <td>Discovering latent patterns of urban cultural ...</td>\n", | |
" <td>Xiao Zhou (University of Cambridge); Anastasio...</td>\n", | |
" <td>Application - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>73</th>\n", | |
" <td>A-1573</td>\n", | |
" <td>Learning Tasks for Multitask Learning: Heterog...</td>\n", | |
" <td>Harini Suresh (Massachusetts Institute of Tech...</td>\n", | |
" <td>Application - poster</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>74</th>\n", | |
" <td>A-1592</td>\n", | |
" <td>StepDeep: A Novel Spatial-temporal Mobility Ev...</td>\n", | |
" <td>Bilong Shen (Tsinghua University); Xiaodan Lia...</td>\n", | |
" <td>Application - poster</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID Title \\\n", | |
"70 A-1459 Alchemist: Accelerating Large-Scale Data Analy... \n", | |
"71 A-1513 MIX: Multi-Channel Information Crossing for Te... \n", | |
"72 A-1535 Discovering latent patterns of urban cultural ... \n", | |
"73 A-1573 Learning Tasks for Multitask Learning: Heterog... \n", | |
"74 A-1592 StepDeep: A Novel Spatial-temporal Mobility Ev... \n", | |
"\n", | |
" Authors Track \n", | |
"70 Alex Gittens (Rensselaer Polytechnic Institute... Application - poster \n", | |
"71 Haolan Chen (Mobile Internet Group, Tencent); ... Application - poster \n", | |
"72 Xiao Zhou (University of Cambridge); Anastasio... Application - poster \n", | |
"73 Harini Suresh (Massachusetts Institute of Tech... Application - poster \n", | |
"74 Bilong Shen (Tsinghua University); Xiaodan Lia... Application - poster " | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"app_poster_df.tail()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Merging" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ID</th>\n", | |
" <th>Title</th>\n", | |
" <th>Authors</th>\n", | |
" <th>Track</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>133</td>\n", | |
" <td>R2SDH: Robust Rotated Supervised Discrete Hashing</td>\n", | |
" <td>Jie Gui (Rutgers University); Ping Li (Baidu R...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>22</td>\n", | |
" <td>Smoothed Dilated Convolutions for Improved Den...</td>\n", | |
" <td>Zhengyang Wang (Washington State University); ...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>27</td>\n", | |
" <td>Discovering Non-Redundant K-means Clusterings ...</td>\n", | |
" <td>Dominik Mautz (Ludwig Maximilian University of...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>44</td>\n", | |
" <td>Trajectory-driven Influential Billboard Placement</td>\n", | |
" <td>Ping Zhang (Wuhan University); Zhifeng Bao (RM...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>83</td>\n", | |
" <td>Multi-Type Itemset Embedding for Learning Beha...</td>\n", | |
" <td>Daheng Wang (University of Notre Dame); Meng J...</td>\n", | |
" <td>Research - Oral</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ID Title \\\n", | |
"0 133 R2SDH: Robust Rotated Supervised Discrete Hashing \n", | |
"1 22 Smoothed Dilated Convolutions for Improved Den... \n", | |
"2 27 Discovering Non-Redundant K-means Clusterings ... \n", | |
"3 44 Trajectory-driven Influential Billboard Placement \n", | |
"4 83 Multi-Type Itemset Embedding for Learning Beha... \n", | |
"\n", | |
" Authors Track \n", | |
"0 Jie Gui (Rutgers University); Ping Li (Baidu R... Research - Oral \n", | |
"1 Zhengyang Wang (Washington State University); ... Research - Oral \n", | |
"2 Dominik Mautz (Ludwig Maximilian University of... Research - Oral \n", | |
"3 Ping Zhang (Wuhan University); Zhifeng Bao (RM... Research - Oral \n", | |
"4 Daheng Wang (University of Notre Dame); Meng J... Research - Oral " | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.concat((res_oral_df,\n", | |
" res_poster_df,\n", | |
" app_oral_df,\n", | |
" app_poster_df),\n", | |
" ignore_index=True)\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"assert df[df.isnull().any(axis=1)].empty" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.to_csv('./KDD_Papers.csv', index=False, encoding='utf-8')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.14" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment