references.bib
@inproceedings{turian-2010-word-repres,
title = "Word representations: a simple and general method for
semi-supervised learning",
author = "Turian, Joseph and Ratinov, Lev and Bengio, Yoshua",
booktitle = "Proceedings of the 48th annual meeting of the association for
computational linguistics",
abstract = "If we take an existing supervised NLP sys- tem, a simple and
general way to improve accuracy is to use unsupervised word
representations as extra word features. We evaluate Brown
clusters, Collobert and Weston (2008) embeddings, and HLBL
(Mnih \& Hinton, 2009) embeddings of words on both NER and
chunking. We use near state-of-the-art supervised baselines,
and find that each of the three word representations improves
the accu- racy of these baselines. We find further
improvements by combining diðerent word representations. You
can download our word features, for oð-the-shelf use in
existing NLP systems, as well as our code, here:
\url{http://metaoptimize.com/projects/wordreprs}",
pages = "384-394",
year = 2010,
organization = "Association for Computational Linguistics"
}
@inproceedings{levy-2014-depend-based,
title = "Dependency-Based Word Embeddings.",
author = "Levy, Omer and Goldberg, Yoav",
booktitle = "ACL (2)",
pages = "302-308",
year = 2014
}
@article{bengio-2008-neural-net,
title = "Neural net language models",
author = "Bengio, Yoshua",
journal = "Scholarpedia",
volume = 3,
number = 1,
pages = 3881,
year = 2008
}
@phdthesis{sahlgren-2006-the-word-space-model,
title = {The Word-Space Model: Using distributional analysis to
represent syntagmatic and paradigmatic relations between words
in high-dimensional vector spaces},
author = {Sahlgren, Magnus},
year = {2006},
school = {Stockholm University}
}
@inproceedings{pereira-1993-dist-cluster,
title = "Distributional clustering of English words",
author = "Pereira, Fernando and Tishby, Naftali and Lee, Lillian",
booktitle = "Proceedings of the 31st annual meeting on Association for
Computational Linguistics",
pages = "183-190",
year = 1993,
organization = "Association for Computational Linguistics"
}
@article{freund-1999-short,
title = "A short introduction to boosting",
author = "Freund, Yoav and Schapire, Robert and Abe, N",
journal = "Journal-Japanese Society For Artificial Intelligence",
volume = 14,
number = "771-780",
pages = 1612,
year = 1999,
publisher = "JAPANESE SOC ARTIFICIAL INTELL"
}
@inproceedings{caruana-2006-empirical-compari,
title = "An empirical comparison of supervised learning algorithms",
author = "Caruana, Rich and Niculescu-Mizil, Alexandru",
booktitle = "Proceedings of the 23rd international conference on Machine
learning",
pages = "161-168",
year = 2006,
organization = "ACM"
}
@article{natekin-2013-gradient-boosting,
title = "Gradient boosting machines, a tutorial",
author = "Natekin, Alexey and Knoll, Alois",
journal = "Frontiers in neurorobotics",
volume = 7,
year = 2013,
publisher = "Frontiers Media SA",
url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3885826/"
}
@article{loh-2011-classification-regression,
title = "Classification and regression trees",
author = "Loh, Wei-Yin",
journal = "Wiley Interdisciplinary Reviews: Data Mining and Knowledge
Discovery",
volume = 1,
number = 1,
pages = "14-23",
year = 2011,
publisher = "Wiley Online Library"
}
@inproceedings{chen-2015-higgs-boson,
title = "Higgs boson discovery with boosted trees",
author = "Chen, Tianqi and He, Tong",
booktitle = "Cowan et al., editor, JMLR: Workshop and Conference
Proceedings",
number = 42,
pages = "69-80",
year = 2015
}
@inproceedings{gutmann-2010-noise-contra-estima,
title = "Noise-contrastive estimation: A new estimation principle for
unnormalized statistical models",
author = "Gutmann, Michael and Hyv{\"a}rinen, Aapo",
booktitle = "AISTATS",
volume = 1,
number = 2,
pages = 6,
year = 2010
}
@phdthesis{sutskever-2013-training-recurrent,
title = "Training recurrent neural networks",
author = "Sutskever, Ilya",
year = 2013,
school = "University of Toronto"
}
@inproceedings{szegedy-2015-going-deeper,
title = "Going deeper with convolutions",
author = "Szegedy, Christian and Liu, Wei and Jia, Yangqing and
Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and
Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew",
booktitle = "Proceedings of the IEEE Conference on Computer Vision and
Pattern Recognition",
pages = "1-9",
year = 2015
}
@inproceedings{krizhevsky-2012-imagenet-classification,
title = "Imagenet classification with deep convolutional neural
networks",
author = "Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E",
booktitle = "Advances in neural information processing systems",
pages = "1097-1105",
year = 2012
}
@article{he-2009-learning-from-imbalanced,
title = "Learning from imbalanced data",
author = "He, Haibo and Garcia, Edwardo A",
journal = "IEEE Transactions on knowledge and data engineering",
volume = 21,
number = 9,
pages = "1263-1284",
year = 2009,
publisher = "IEEE"
}
@inproceedings{zaremba-2015-empir-explor,
author = {Jozefowicz, Rafal and Zaremba, Wojciech and Sutskever, Ilya},
title = {An Empirical Exploration of Recurrent Network Architectures},
booktitle = {Proceedings of the 32nd International Conference on Machine Learning},
year = {2015}
}
@article{friedman-2001-greedy-func-approx,
title = "Greedy function approximation: a gradient boosting machine",
author = "Friedman, Jerome H",
journal = "Annals of statistics",
pages = "1189-1232",
year = 2001,
publisher = "JSTOR"
}
@article{friedman-2002-stochastic-gradient-boost,
title = "Stochastic gradient boosting",
author = "Friedman, Jerome H",
journal = "Computational Statistics \& Data Analysis",
volume = 38,
number = 4,
pages = "367-378",
year = 2002,
publisher = "Elsevier"
}
@article{friedman-2000-additive-logistic-regression,
title = "Additive logistic regression: a statistical view of boosting
(with discussion and a rejoinder by the authors)",
author = "Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert
and others",
journal = "The annals of statistics",
volume = 28,
number = 2,
pages = "337-407",
year = 2000,
publisher = "Institute of Mathematical Statistics"
}
@inproceedings{greenwald-2001-space-efficient-online,
title = "Space-efficient online computation of quantile summaries",
author = "Greenwald, Michael and Khanna, Sanjeev",
booktitle = "ACM SIGMOD Record",
volume = 30,
number = 2,
pages = "58-66",
year = 2001,
organization = "ACM"
}
@inproceedings{zhang-2007-fast-algorithm,
title = "A fast algorithm for approximate quantiles in high speed data
streams",
author = "Zhang, Qi and Wang, Wei",
booktitle = "Scientific and Statistical Database Management,
2007. SSDBM'07. 19th International Conference on",
pages = "29-29",
year = 2007,
organization = "IEEE"
}
@incollection{greenwald-2016-quant-equid,
author = "Greenwald, Michael B and Khanna, Sanjeev",
title = "Quantiles and Equidepth Histograms Over Streams",
booktitle = "Data Stream Management: Processing High-Speed Data Streams",
publisher = "Springer",
year = 2016
}
@ARTICLE{goldberg-2014-explain,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1402.3722G",
archivePrefix= "arXiv",
author = "{Goldberg}, Y. and {Levy}, O.",
eprint = "1402.3722",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.CL",
title = "{word2vec Explained: Deriving Mikolov Et Al.'s
Negative-Sampling Word-Embedding method}",
year = 2014
}
@ARTICLE{turney-2010-from-frequen-to-meanin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2010arXiv1003.1141T",
archivePrefix= "arXiv",
author = "{Turney}, P.~D. and {Pantel}, P.",
eprint = "1003.1141",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Information Retrieval, Computer Science - Learning, H.3.1, I.2.6, I.2.7",
month = mar,
primaryClass = "cs.CL",
title = "{From Frequency To Meaning: Vector Space Models of
Semantics}",
year = 2010
}
@ARTICLE{zaremba-2014-recur-neural-networ-regul,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1409.2329Z",
archivePrefix= "arXiv",
author = "{Zaremba}, W. and {Sutskever}, I. and {Vinyals}, O.",
eprint = "1409.2329",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing",
month = sep,
title = "{Recurrent Neural Network Regularization}",
year = 2014
}
@ARTICLE{cho-2014-encoder-decoder,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1406.1078C",
archivePrefix= "arXiv",
author = "{Cho}, K. and {van Merrienboer}, B. and {Gulcehre}, C. and
{Bahdanau}, D. and {Bougares}, F. and {Schwenk}, H. and
{Bengio}, Y.",
eprint = "1406.1078",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing, Statistics
- Machine Learning",
month = jun,
primaryClass = "cs.CL",
title = "{Learning Phrase Representations Using Rnn Encoder-Decoder
for Statistical Machine Translation}",
year = 2014
}
@ARTICLE{sutskever-2014-seq2seq,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1409.3215S",
archivePrefix= "arXiv",
author = "{Sutskever}, I. and {Vinyals}, O. and {Le}, Q.~V.",
eprint = "1409.3215",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = sep,
primaryClass = "cs.CL",
title = "{Sequence To Sequence Learning With Neural Networks}",
year = 2014
}
@ARTICLE{bengio-2015-schedule-sampling,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150603099B",
archivePrefix= "arXiv",
author = "{Bengio}, S. and {Vinyals}, O. and {Jaitly}, N. and
{Shazeer}, N.",
eprint = "1506.03099",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Computer Science - Computer Vision and Pattern
Recognition",
month = jun,
primaryClass = "cs.LG",
title = "{Scheduled Sampling for Sequence Prediction With Recurrent
Neural Networks}",
year = 2015
}
@ARTICLE{jean-2014-using-very,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.2007J",
archivePrefix= "arXiv",
author = "{Jean}, S. and {Cho}, K. and {Memisevic}, R. and {Bengio},
Y.",
eprint = "1412.2007",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = dec,
primaryClass = "cs.CL",
title = "{On Using Very Large Target Vocabulary for Neural Machine
Translation}",
year = 2014
}
@ARTICLE{chen-2016-xgboos,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160302754C",
archivePrefix= "arXiv",
author = "{Chen}, T. and {Guestrin}, C.",
eprint = "1603.02754",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = mar,
primaryClass = "cs.LG",
title = "{XGBoost: A Scalable Tree Boosting System}",
year = 2016
}
@ARTICLE{kawaguchi-2016-deep-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160507110K",
archivePrefix= "arXiv",
author = "{Kawaguchi}, K.",
eprint = "1605.07110",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning,
Mathematics - Optimization and Control",
month = may,
primaryClass = "stat.ML",
title = "{Deep Learning Without Poor Local Minima}",
year = 2016
}
@ARTICLE{ruder-2016-overv-gradien,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160904747R",
archivePrefix= "arXiv",
author = "{Ruder}, S.",
eprint = "1609.04747",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = sep,
primaryClass = "cs.LG",
title = "{An Overview of Gradient Descent Optimization algorithms}",
year = 2016
}
@ARTICLE{zeiler-2012-adadel,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1212.5701Z",
archivePrefix= "arXiv",
author = "{Zeiler}, M.~D.",
eprint = "1212.5701",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = dec,
primaryClass = "cs.LG",
title = "{ADADELTA: An Adaptive Learning Rate Method}",
year = 2012
}
@ARTICLE{bengio-2012-advan-optim-recur-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1212.0901B",
archivePrefix= "arXiv",
author = "{Bengio}, Y. and {Boulanger-Lewandowski}, N. and {Pascanu},
R.",
eprint = "1212.0901",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = dec,
primaryClass = "cs.LG",
title = "{Advances in Optimizing Recurrent Networks}",
year = 2012
}
@ARTICLE{he-2015-deep-resid,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151203385H",
archivePrefix= "arXiv",
author = "{He}, K. and {Zhang}, X. and {Ren}, S. and {Sun}, J.",
eprint = "1512.03385",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = dec,
primaryClass = "cs.CV",
title = "{Deep Residual Learning for Image Recognition}",
year = 2015
}
@ARTICLE{simonyan-2014-very-deep,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1409.1556S",
archivePrefix= "arXiv",
author = "{Simonyan}, K. and {Zisserman}, A.",
eprint = "1409.1556",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = sep,
primaryClass = "cs.CV",
title = "{Very Deep Convolutional Networks for Large-Scale Image
Recognition}",
year = 2014
}
@ARTICLE{lin-2013-networ-in-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.4400L",
archivePrefix= "arXiv",
author = "{Lin}, M. and {Chen}, Q. and {Yan}, S.",
eprint = "1312.4400",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning",
month = dec,
title = "{Network In Network}",
year = 2013
}
@ARTICLE{montufar-2014-number-linear,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1402.1869M",
archivePrefix= "arXiv",
author = "{Mont{\'u}far}, G. and {Pascanu}, R. and {Cho}, K. and
{Bengio}, Y.",
eprint = "1402.1869",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning,
Computer Science - Neural and Evolutionary Computing",
month = feb,
primaryClass = "stat.ML",
title = "{On the Number of Linear Regions of Deep Neural Networks}",
year = 2014
}
@ARTICLE{luxburg-2010-clust-stabil,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2010arXiv1007.1075V",
archivePrefix= "arXiv",
author = "{von Luxburg}, U.",
eprint = "1007.1075",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning",
month = jul,
primaryClass = "stat.ML",
title = "{Clustering Stability: An Overview}",
year = 2010
}
@ARTICLE{shah-2014-bayes-regres-bitcoin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.1231S",
archivePrefix= "arXiv",
author = "{Shah}, D. and {Zhang}, K.",
eprint = "1410.1231",
journal = "ArXiv e-prints",
keywords = "Computer Science - Artificial Intelligence, Mathematics -
Statistics Theory",
month = oct,
primaryClass = "cs.AI",
title = "{Bayesian Regression and Bitcoin}",
year = 2014
}
@article{domingos-2012-few-useful-things,
title = "A few useful things to know about machine learning",
author = "Domingos, Pedro",
journal = "Communications of the ACM",
volume = 55,
number = 10,
pages = "78-87",
year = 2012,
publisher = "ACM"
}
@ARTICLE{thakur-2015-autoc,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150702188T",
archivePrefix= "arXiv",
author = "{Thakur}, A. and {Krohn-Grimberghe}, A.",
eprint = "1507.02188",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = jul,
primaryClass = "stat.ML",
title = "{AutoCompete: A Framework for Machine Learning Competition}",
year = 2015
}
@article{huang-1998-k-modes,
title = "Extensions to the k-means algorithm for clustering large data
sets with categorical values",
author = "Huang, Zhexue",
journal = "Data mining and knowledge discovery",
volume = 2,
number = 3,
pages = "283-304",
year = 1998,
publisher = "Springer"
}
@inproceedings{he-2006-approximation-algorithms,
title = "Approximation algorithms for k-modes clustering",
author = "He, Zengyou and Deng, Shengchun and Xu, Xiaofei",
booktitle = "International Conference on Intelligent Computing",
pages = "296-302",
year = 2006,
organization = "Springer"
}
@inproceedings{plant-2011-inconco-interp-cluster,
title = "Inconco: interpretable clustering of numerical and
categorical objects",
author = "Plant, Claudia and B{\"o}hm, Christian",
booktitle = "Proceedings of the 17th ACM SIGKDD international conference
on Knowledge discovery and data mining",
pages = "1127-1135",
year = 2011,
organization = "ACM"
}
@article{kim-2004-fuzzy-cluster,
title = "Fuzzy clustering of categorical data using fuzzy centroids",
author = "Kim, Dae-Won and Lee, Kwang H and Lee, Doheon",
journal = "Pattern Recognition Letters",
volume = 25,
number = 11,
pages = "1263-1271",
year = 2004,
publisher = "Elsevier"
}
@article{guha-2000-rock-robust-cluster,
title = "ROCK: A robust clustering algorithm for categorical
attributes",
author = "Guha, Sudipto and Rastogi, Rajeev and Shim, Kyuseok",
journal = "Information systems",
volume = 25,
number = 5,
pages = "345-366",
year = 2000,
publisher = "Elsevier"
}
@inproceedings{louppe-2013-understanding-variable-import,
title = "Understanding variable importances in forests of randomized
trees",
author = "Louppe, Gilles and Wehenkel, Louis and Sutera, Antonio and
Geurts, Pierre",
booktitle = "Advances in neural information processing systems",
pages = "431-439",
year = 2013
}
@article{gelman-2008-scaling-regress-inputs,
title = "Scaling regression inputs by dividing by two standard
deviations",
author = "Gelman, Andrew",
journal = "Statistics in medicine",
volume = 27,
number = 15,
pages = "2865-2873",
year = 2008,
publisher = "Wiley Online Library"
}
@article{reshef-2011-detecting-novel-assoc,
title = "Detecting novel associations in large data sets",
author = "Reshef, David N and Reshef, Yakir A and Finucane, Hilary K
and Grossman, Sharon R and McVean, Gilean and Turnbaugh,
Peter J and Lander, Eric S and Mitzenmacher, Michael and
Sabeti, Pardis C",
journal = "science",
volume = 334,
number = 6062,
pages = "1518-1524",
year = 2011,
publisher = "American Association for the Advancement of Science"
}
@article{cawley-2010-over-fitting,
title = "On over-fitting in model selection and subsequent selection
bias in performance evaluation",
author = "Cawley, Gavin C and Talbot, Nicola LC",
journal = "Journal of Machine Learning Research",
volume = 11,
number = "Jul",
pages = "2079-2107",
year = 2010
}
@article{varma-2006-bias-error-estim,
title = "Bias in error estimation when using cross-validation for
model selection",
author = "Varma, Sudhir and Simon, Richard",
journal = "BMC bioinformatics",
volume = 7,
number = 1,
pages = 91,
year = 2006,
publisher = "BioMed Central"
}
@ARTICLE{heaton-2016-deep-learn-finan,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160206561H",
archivePrefix= "arXiv",
author = "{Heaton}, J.~B. and {Polson}, N.~G. and {Witte}, J.~H.",
eprint = "1602.06561",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = feb,
primaryClass = "cs.LG",
title = "{Deep Learning in Finance}",
year = 2016
}
@ARTICLE{sirignano-2016-deep-learn-mortg-risk,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160702470S",
archivePrefix= "arXiv",
author = "{Sirignano}, J. and {Sadhwani}, A. and {Giesecke}, K.",
eprint = "1607.02470",
journal = "ArXiv e-prints",
keywords = "Quantitative Finance - Statistical Finance",
month = jul,
primaryClass = "q-fin.ST",
title = "{Deep Learning for Mortgage Risk}",
year = 2016
}
@article{heaton-2016-deep-learning-finance,
title = "Deep learning for finance: deep portfolios",
author = "Heaton, JB and Polson, NG and Witte, Jan Hendrik",
journal = "Applied Stochastic Models in Business and Industry",
year = 2016,
publisher = "Wiley Online Library"
}
@ARTICLE{dixon-2016-class-based,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160308604D",
archivePrefix= "arXiv",
author = "{Dixon}, M. and {Klabjan}, D. and {Bang}, J.~H.",
eprint = "1603.08604",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computational
Engineering, Finance, and Science",
month = mar,
primaryClass = "cs.LG",
title = "{Classiffication-Based Financial Markets Prediction Using
Deep Neural Networks}",
year = 2016
}
@article{langkvist-2014-review-unsuper-feature,
title = "A review of unsupervised feature learning and deep learning
for time-series modeling",
author = "L{\"a}ngkvist, Martin and Karlsson, Lars and Loutfi, Amy",
journal = "Pattern Recognition Letters",
volume = 42,
pages = "11-24",
year = 2014,
publisher = "Elsevier"
}
@article{qiu-2016-predicting-direction,
title = "Predicting the Direction of Stock Market Index Movement Using
an Optimized Artificial Neural Network Model",
author = "Qiu, Mingyue and Song, Yu",
journal = "PLoS One",
volume = 11,
number = 5,
year = 2016,
publisher = "Public Library of Science"
}
@inproceedings{yang-2016-ensemble-model-stock,
title = "Ensemble Model for Stock Price Movement Trend Prediction on
Different Investing Periods",
author = "Yang, Jian and Rao, Ruonan and Hong, Pei and Ding, Peng",
booktitle = "2016 12th International Conference on Computational
Intelligence and Security (CIS)",
pages = "358-361",
year = 2016,
organization = "IEEE"
}
@article{lecun-2015-deep-learning,
title = "Deep learning",
author = "LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey",
journal = "Nature",
volume = 521,
number = 7553,
pages = "436-444",
year = 2015,
publisher = "Nature Research"
}
@ARTICLE{bollen-2010-twitt-mood,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2010arXiv1010.3003B",
archivePrefix= "arXiv",
author = "{Bollen}, J. and {Mao}, H. and {Zeng}, X.-J.",
eprint = "1010.3003",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computational Engineering, Finance, and
Science, Computer Science - Computation and Language,
Computer Science - Social and Information Networks, Physics -
Physics and Society",
month = oct,
primaryClass = "cs.CE",
title = "{Twitter Mood Predicts the Stock market}",
year = 2010
}
@ARTICLE{goerg-2012-forec-compon-analy-forec,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1205.4591G",
archivePrefix= "arXiv",
author = "{Goerg}, G.~M.",
eprint = "1205.4591",
journal = "ArXiv e-prints",
keywords = "Statistics - Methodology, Statistics - Machine Learning",
month = may,
primaryClass = "stat.ME",
title = "{Forecastable Component Analysis (ForeCA)}",
year = 2012
}
@ARTICLE{fehrer-2015-improv-decis,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150801993F",
archivePrefix= "arXiv",
author = "{Fehrer}, R. and {Feuerriegel}, S.",
eprint = "1508.01993",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Computation
and Language, Computer Science - Learning",
month = aug,
primaryClass = "stat.ML",
title = "{Improving Decision Analytics With Deep Learning: The Case of
Financial Disclosures}",
year = 2015
}
@article{kaastra-1996-designing-neural-net,
title = "Designing a neural network for forecasting financial and
economic time series",
author = "Kaastra, Iebeling and Boyd, Milton",
journal = "Neurocomputing",
volume = 10,
number = 3,
pages = "215-236",
year = 1996,
publisher = "Elsevier"
}
@article{ahmed-2010-empirical-comparison,
title = "An empirical comparison of machine learning models for time
series forecasting",
author = "Ahmed, Nesreen K and Atiya, Amir F and Gayar, Neamat El and
El-Shishiny, Hisham",
journal = "Econometric Reviews",
volume = 29,
number = "5-6",
pages = "594-621",
year = 2010,
publisher = "Taylor \& Francis"
}
@article{dubovikov-2004-dimension-minimal-cover,
title = "Dimension of the minimal cover and fractal analysis of time
series",
author = "Dubovikov, MM and Starchenko, NV and Dubovikov, MS",
journal = "Physica A: Statistical Mechanics and its Applications",
volume = 339,
number = 3,
pages = "591-608",
year = 2004,
publisher = "Elsevier"
}
@inproceedings{dalto-2015-deep-neural-net,
title = "Deep neural networks for ultra-short-term wind forecasting",
author = "Dalto, Mladen and Matu{\v{s}}ko, Jadranko and Va{\v{s}}ak,
Mario",
booktitle = "Industrial Technology (ICIT), 2015 IEEE International
Conference on",
pages = "1657-1663",
year = 2015,
organization = "IEEE"
}
@inproceedings{goodfellow-2014-gan,
title = "Generative adversarial nets",
author = "Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and
Xu, Bing and Warde-Farley, David and Ozair, Sherjil and
Courville, Aaron and Bengio, Yoshua",
booktitle = "Advances in neural information processing systems",
pages = "2672-2680",
year = 2014
}
@ARTICLE{goodfellow-2014-explain-harnes-adver-examp,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.6572G",
archivePrefix= "arXiv",
author = "{Goodfellow}, I.~J. and {Shlens}, J. and {Szegedy}, C.",
eprint = "1412.6572",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = dec,
primaryClass = "stat.ML",
title = "{Explaining and Harnessing Adversarial Examples}",
year = 2014
}
@inproceedings{denton-2015-deep-generative-image,
title = "Deep Generative Image Models using a Laplacian Pyramid of
Adversarial Networks",
author = "Denton, Emily L and Chintala, Soumith and Fergus, Rob and
others",
booktitle = "Advances in neural information processing systems",
pages = "1486-1494",
year = 2015
}
@ARTICLE{radford-2015-dcgan,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106434R",
archivePrefix= "arXiv",
author = "{Radford}, A. and {Metz}, L. and {Chintala}, S.",
eprint = "1511.06434",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computer
Vision and Pattern Recognition",
month = nov,
primaryClass = "cs.LG",
title = "{Unsupervised Representation Learning With Deep Convolutional
Generative Adversarial Networks}",
year = 2015
}
@inproceedings{dosovitskiy-2015-learning-to-generate,
title = "Learning to generate chairs with convolutional neural
networks",
author = "Dosovitskiy, Alexey and Tobias Springenberg, Jost and Brox,
Thomas",
booktitle = "Proceedings of the IEEE Conference on Computer Vision and
Pattern Recognition",
pages = "1538-1546",
year = 2015
}
@ARTICLE{burda-2015-impor-weigh-autoen,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150900519B",
archivePrefix= "arXiv",
author = "{Burda}, Y. and {Grosse}, R. and {Salakhutdinov}, R.",
eprint = "1509.00519",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Statistics - Machine Learning",
month = sep,
primaryClass = "cs.LG",
title = "{Importance Weighted Autoencoders}",
year = 2015
}
@ARTICLE{ganin-2014-unsup-domain,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1409.7495G",
archivePrefix= "arXiv",
author = "{Ganin}, Y. and {Lempitsky}, V.",
eprint = "1409.7495",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning,
Computer Science - Neural and Evolutionary Computing",
month = sep,
primaryClass = "stat.ML",
title = "{Unsupervised Domain Adaptation By Backpropagation}",
year = 2014
}
@ARTICLE{makhzani-2015-adver-autoen,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151105644M",
archivePrefix= "arXiv",
author = "{Makhzani}, A. and {Shlens}, J. and {Jaitly}, N. and
{Goodfellow}, I. and {Frey}, B.",
eprint = "1511.05644",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = nov,
primaryClass = "cs.LG",
title = "{Adversarial Autoencoders}",
year = 2015
}
@ARTICLE{szegedy-2013-intrig-proper-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.6199S",
archivePrefix= "arXiv",
author = "{Szegedy}, C. and {Zaremba}, W. and {Sutskever}, I. and
{Bruna}, J. and {Erhan}, D. and {Goodfellow}, I. and
{Fergus}, R.",
eprint = "1312.6199",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = dec,
primaryClass = "cs.CV",
title = "{Intriguing Properties of Neural networks}",
year = 2013
}
@ARTICLE{kurakin-2016-adver-examp-physic,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160702533K",
archivePrefix= "arXiv",
author = "{Kurakin}, A. and {Goodfellow}, I. and {Bengio}, S.",
eprint = "1607.02533",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Cryptography and Security, Computer
Science - Learning, Statistics - Machine Learning",
month = jul,
primaryClass = "cs.CV",
title = "{Adversarial Examples in the Physical world}",
year = 2016
}
@ARTICLE{mirza-2014-condit-gener-adver-nets,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1411.1784M",
archivePrefix= "arXiv",
author = "{Mirza}, M. and {Osindero}, S.",
eprint = "1411.1784",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Artificial
Intelligence, Computer Science - Computer Vision and Pattern
Recognition, Statistics - Machine Learning",
month = nov,
primaryClass = "cs.LG",
title = "{Conditional Generative Adversarial Nets}",
year = 2014
}
@ARTICLE{goodfellow-2017-nips-tutor,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170100160G",
archivePrefix= "arXiv",
author = "{Goodfellow}, I.",
eprint = "1701.00160",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = dec,
primaryClass = "cs.LG",
title = "{NIPS 2016 Tutorial: Generative Adversarial Networks}",
year = 2017
}
@ARTICLE{arjovsky-2017-wasser-gan,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170107875A",
archivePrefix= "arXiv",
author = "{Arjovsky}, M. and {Chintala}, S. and {Bottou}, L.",
eprint = "1701.07875",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = jan,
primaryClass = "stat.ML",
title = "{Wasserstein GAN}",
year = 2017
}
@inproceedings{ng-2000-algorithms-inverse,
title = "Algorithms for Inverse Reinforcement Learning",
author = "Ng, Andrew Y and Russell, Stuart",
booktitle = "in Proc. 17th International Conf. on Machine Learning",
year = 2000
}
@ARTICLE{mnih-2013-playin-atari,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.5602M",
archivePrefix= "arXiv",
author = "{Mnih}, V. and {Kavukcuoglu}, K. and {Silver}, D. and
{Graves}, A. and {Antonoglou}, I. and {Wierstra}, D. and
{Riedmiller}, M.",
eprint = "1312.5602",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = dec,
primaryClass = "cs.LG",
title = "{Playing Atari With Deep Reinforcement Learning}",
year = 2013
}
@ARTICLE{heaton-2016-deep-portf-theor,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160507230H",
archivePrefix= "arXiv",
author = "{Heaton}, J.~B. and {Polson}, N.~G. and {Witte}, J.~H.",
eprint = "1605.07230",
journal = "ArXiv e-prints",
keywords = "Quantitative Finance - Portfolio Management, Computer Science
- Learning",
month = may,
primaryClass = "q-fin.PM",
title = "{Deep Portfolio Theory}",
year = 2016
}
@ARTICLE{karpathy-2015-visual-under-recur-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150602078K",
archivePrefix= "arXiv",
author = "{Karpathy}, A. and {Johnson}, J. and {Fei-Fei}, L.",
eprint = "1506.02078",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Computer Science - Neural and Evolutionary
Computing",
month = jun,
primaryClass = "cs.LG",
title = "{Visualizing and Understanding Recurrent Networks}",
year = 2015
}
@ARTICLE{graves-2014-neural-turin-machin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.5401G",
archivePrefix= "arXiv",
author = "{Graves}, A. and {Wayne}, G. and {Danihelka}, I.",
eprint = "1410.5401",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing",
month = oct,
title = "{Neural Turing Machines}",
year = 2014
}
@ARTICLE{bahdanau-2014-bahdanau-attention,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1409.0473B",
archivePrefix= "arXiv",
author = "{Bahdanau}, D. and {Cho}, K. and {Bengio}, Y.",
eprint = "1409.0473",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing, Statistics
- Machine Learning",
month = sep,
primaryClass = "cs.CL",
title = "{Neural Machine Translation By Jointly Learning To Align and
Translate}",
year = 2014
}
@ARTICLE{vinyals-2014-show-tell,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1411.4555V",
archivePrefix= "arXiv",
author = "{Vinyals}, O. and {Toshev}, A. and {Bengio}, S. and {Erhan},
D.",
eprint = "1411.4555",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = nov,
primaryClass = "cs.CV",
title = "{Show and Tell: A Neural Image Caption Generator}",
year = 2014
}
@inproceedings{glorot-2010-under-diff,
title = "Understanding the difficulty of training deep feedforward
neural networks",
author = "Glorot, Xavier and Bengio, Yoshua",
booktitle = "Aistats",
volume = 9,
pages = "249-256",
year = 2010
}
@article{lecun-1998-gradient-based,
title = "Gradient-based learning applied to document recognition",
author = "LeCun, Yann and Bottou, L{\'e}on and Bengio, Yoshua and
Haffner, Patrick",
journal = "Proceedings of the IEEE",
volume = 86,
number = 11,
pages = "2278-2324",
year = 1998,
publisher = "IEEE"
}
@article{gosavi-2009-reinforcement-learning,
title = "Reinforcement learning: A tutorial survey and recent
advances",
author = "Gosavi, Abhijit",
journal = "INFORMS Journal on Computing",
volume = 21,
number = 2,
pages = "178-192",
year = 2009,
publisher = "INFORMS"
}
@ARTICLE{zeiler-2013-visual-under-convol-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1311.2901Z",
archivePrefix= "arXiv",
author = "{Zeiler}, M.~D and {Fergus}, R.",
eprint = "1311.2901",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = nov,
primaryClass = "cs.CV",
title = "{Visualizing and Understanding Convolutional Networks}",
year = 2013
}
@ARTICLE{krizhevsky-2014-one-weird,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1404.5997K",
archivePrefix= "arXiv",
author = "{Krizhevsky}, A.",
eprint = "1404.5997",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Distributed, Parallel, and Cluster
Computing, Computer Science - Learning",
month = apr,
title = "{One Weird Trick for Parallelizing Convolutional Neural
networks}",
year = 2014
}
@inproceedings{zeiler-2011-adaptive-deconv,
title = "Adaptive deconvolutional networks for mid and high level
feature learning",
author = "Zeiler, Matthew D and Taylor, Graham W and Fergus, Rob",
booktitle = "Computer Vision (ICCV), 2011 IEEE International Conference
on",
pages = "2018-2025",
year = 2011,
organization = "IEEE"
}
@ARTICLE{dumoulin-2016-guide-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160307285D",
archivePrefix= "arXiv",
author = "{Dumoulin}, V. and {Visin}, F.",
eprint = "1603.07285",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning,
Computer Science - Neural and Evolutionary Computing",
month = mar,
primaryClass = "stat.ML",
title = "{A Guide To Convolution Arithmetic for Deep learning}",
year = 2016
}
@article{beck-2009-fast-iter,
title = "A fast iterative shrinkage-thresholding algorithm for linear
inverse problems",
author = "Beck, Amir and Teboulle, Marc",
journal = "SIAM journal on imaging sciences",
volume = 2,
number = 1,
pages = "183-202",
year = 2009,
publisher = "SIAM"
}
@ARTICLE{redmon-2015-you-only-look-once,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150602640R",
archivePrefix= "arXiv",
author = "{Redmon}, J. and {Divvala}, S. and {Girshick}, R. and
{Farhadi}, A.",
eprint = "1506.02640",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = jun,
primaryClass = "cs.CV",
title = "{You Only Look Once: Unified, Real-Time Object Detection}",
year = 2015
}
@ARTICLE{rastegari-2016-xnor-net,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160305279R",
archivePrefix= "arXiv",
author = "{Rastegari}, M. and {Ordonez}, V. and {Redmon}, J. and
{Farhadi}, A.",
eprint = "1603.05279",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = mar,
primaryClass = "cs.CV",
title = "{XNOR-Net: Imagenet Classification Using Binary Convolutional
Neural Networks}",
year = 2016
}
@inproceedings{zeiler-2010-deconvolutional-net,
title = "Deconvolutional networks",
author = "Zeiler, Matthew D and Krishnan, Dilip and Taylor, Graham W
and Fergus, Rob",
booktitle = "Computer Vision and Pattern Recognition (CVPR), 2010 IEEE
Conference on",
pages = "2528-2535",
year = 2010,
organization = "IEEE"
}
@ARTICLE{mikolov-2013-effic-estim,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1301.3781M",
archivePrefix= "arXiv",
author = "{Mikolov}, T. and {Chen}, K. and {Corrado}, G. and {Dean},
J.",
eprint = "1301.3781",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jan,
primaryClass = "cs.CL",
title = "{Efficient Estimation of Word Representations in Vector
Space}",
year = 2013
}
@ARTICLE{greff-2015-lstm-search,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150304069G",
archivePrefix= "arXiv",
author = "{Greff}, K. and {Srivastava}, R.~K. and {Koutn{\'{\i}}k},
J. and {Steunebrink}, B.~R. and {Schmidhuber}, J.",
eprint = "1503.04069",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Learning, 68T10, I.2.6, I.2.7, I.5.1,
H.5.5",
month = mar,
title = "{LSTM: A Search Space Odyssey}",
year = 2015
}
@ARTICLE{mnih-2016-async-method,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160201783M",
archivePrefix= "arXiv",
author = "{Mnih}, V. and {Puigdom{\`e}nech Badia}, A. and {Mirza},
M. and {Graves}, A. and {Lillicrap}, T.~P. and {Harley},
T. and {Silver}, D. and {Kavukcuoglu}, K.",
eprint = "1602.01783",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = feb,
primaryClass = "cs.LG",
title = "{Asynchronous Methods for Deep Reinforcement Learning}",
year = 2016
}
@article{bengio-2003-neural-prob,
title = "A neural probabilistic language model",
author = "Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal
and Jauvin, Christian",
journal = "Journal of machine learning research",
volume = 3,
number = "Feb",
pages = "1137-1155",
year = 2003
}
@mastersthesis{mikolov-2007-language-model,
title = "Language Modeling for Speech Recognition in Czech",
author = "Mikolov, Tom{\'a}{\v{s}}",
year = 2007,
school = "Brno University of Technology"
}
@inproceedings{mikolov-2013-distributed-repre,
title = "Distributed representations of words and phrases and their
compositionality",
author = "Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado,
Greg S and Dean, Jeff",
booktitle = "Advances in neural information processing systems",
pages = "3111-3119",
year = 2013
}
@ARTICLE{luong-2015-luong-attention,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150804025L",
archivePrefix= "arXiv",
author = "{Luong}, M.-T. and {Pham}, H. and {Manning}, C.~D.",
eprint = "1508.04025",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL",
title = "{Effective Approaches To Attention-Based Neural Machine
Translation}",
year = 2015
}
@ARTICLE{chorowski-2015-atten-based,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150607503C",
archivePrefix= "arXiv",
author = "{Chorowski}, J. and {Bahdanau}, D. and {Serdyuk}, D. and
{Cho}, K. and {Bengio}, Y.",
eprint = "1506.07503",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing, Statistics
- Machine Learning",
month = jun,
primaryClass = "cs.CL",
title = "{Attention-Based Models for Speech Recognition}",
year = 2015
}
@ARTICLE{mnih-2014-recur-model-visual-atten,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1406.6247M",
archivePrefix= "arXiv",
author = "{Mnih}, V. and {Heess}, N. and {Graves}, A. and
{Kavukcuoglu}, K.",
eprint = "1406.6247",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computer
Vision and Pattern Recognition, Statistics - Machine
Learning",
month = jun,
primaryClass = "cs.LG",
title = "{Recurrent Models of Visual Attention}",
year = 2014
}
@ARTICLE{ba-2014-multip-objec,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.7755B",
archivePrefix= "arXiv",
author = "{Ba}, J. and {Mnih}, V. and {Kavukcuoglu}, K.",
eprint = "1412.7755",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computer
Vision and Pattern Recognition, Computer Science - Neural and
Evolutionary Computing",
month = dec,
primaryClass = "cs.LG",
title = "{Multiple Object Recognition With Visual Attention}",
year = 2014
}
@ARTICLE{gregor-2015-draw-recur,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150204623G",
archivePrefix= "arXiv",
author = "{Gregor}, K. and {Danihelka}, I. and {Graves}, A. and
{Jimenez Rezende}, D. and {Wierstra}, D.",
eprint = "1502.04623",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = feb,
primaryClass = "cs.CV",
title = "{DRAW: A Recurrent Neural Network For Image Generation}",
year = 2015
}
@ARTICLE{xu-2015-show-atten-tell,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150203044X",
archivePrefix= "arXiv",
author = "{Xu}, K. and {Ba}, J. and {Kiros}, R. and {Cho}, K. and
{Courville}, A. and {Salakhutdinov}, R. and {Zemel}, R. and
{Bengio}, Y.",
eprint = "1502.03044",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computer
Vision and Pattern Recognition",
month = feb,
primaryClass = "cs.LG",
title = "{Show, Attend and Tell: Neural Image Caption Generation With
Visual Attention}",
year = 2015
}
@ARTICLE{zaremba-2014-learn-to-execute,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.4615Z",
archivePrefix= "arXiv",
author = "{Zaremba}, W. and {Sutskever}, I.",
eprint = "1410.4615",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Artificial Intelligence, Computer Science
- Learning",
month = oct,
title = "{Learning To Execute}",
year = 2014
}
@ARTICLE{vinyals-2014-gramm-as-foreig-languag,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.7449V",
archivePrefix= "arXiv",
author = "{Vinyals}, O. and {Kaiser}, L. and {Koo}, T. and {Petrov},
S. and {Sutskever}, I. and {Hinton}, G.",
eprint = "1412.7449",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Statistics - Machine Learning",
month = dec,
primaryClass = "cs.CL",
title = "{Grammar As a Foreign Language}",
year = 2014
}
@ARTICLE{hermann-2015-teach-machin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150603340H",
archivePrefix= "arXiv",
author = "{Hermann}, K.~M. and {Ko{\v c}isk{\'y}}, T. and
{Grefenstette}, E. and {Espeholt}, L. and {Kay}, W. and
{Suleyman}, M. and {Blunsom}, P.",
eprint = "1506.03340",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Neural and Evolutionary
Computing",
month = jun,
primaryClass = "cs.CL",
title = "{Teaching Machines To Read and Comprehend}",
year = 2015
}
@ARTICLE{sukhbaatar-2015-end-to-end,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150308895S",
archivePrefix= "arXiv",
author = "{Sukhbaatar}, S. and {Szlam}, A. and {Weston}, J. and
{Fergus}, R.",
eprint = "1503.08895",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Computation and Language",
month = mar,
title = "{End-To-End Memory Networks}",
year = 2015
}
@ARTICLE{zaremba-2015-reinf-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150500521Z",
archivePrefix= "arXiv",
author = "{Zaremba}, W. and {Sutskever}, I.",
eprint = "1505.00521",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = may,
primaryClass = "cs.LG",
title = "{Reinforcement Learning Neural Turing Machines - Revised}",
year = 2015
}
@ARTICLE{joulin-2016-bag-trick,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160701759J",
archivePrefix= "arXiv",
author = "{Joulin}, A. and {Grave}, E. and {Bojanowski}, P. and
{Mikolov}, T.",
eprint = "1607.01759",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jul,
primaryClass = "cs.CL",
title = "{Bag of Tricks for Efficient Text Classification}",
year = 2016
}
@ARTICLE{kim-2014-convol-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1408.5882K",
archivePrefix= "arXiv",
author = "{Kim}, Y.",
eprint = "1408.5882",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = aug,
primaryClass = "cs.CL",
title = "{Convolutional Neural Networks for Sentence Classification}",
year = 2014
}
@ARTICLE{graves-2013-gener-sequen,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1308.0850G",
archivePrefix= "arXiv",
author = "{Graves}, A.",
eprint = "1308.0850",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Computation and Language",
month = aug,
title = "{Generating Sequences With Recurrent Neural Networks}",
year = 2013
}
@inproceedings{yang-2016-hierarchical-attent,
title = "Hierarchical attention networks for document classification",
author = "Yang, Zichao and Yang, Diyi and Dyer, Chris and He, Xiaodong
and Smola, Alex and Hovy, Eduard",
booktitle = "Proceedings of NAACL-HLT",
pages = "1480-1489",
year = 2016
}
@inproceedings{lai-2015-recurrent-conv,
title = "Recurrent Convolutional Neural Networks for Text
Classification",
author = "Lai, Siwei and Xu, Liheng and Liu, Kang and Zhao, Jun",
booktitle = "AAAI",
volume = 333,
pages = "2267-2273",
year = 2015
}
@inproceedings{ding-2015-deep-learn,
title = "Deep Learning for Event-Driven Stock Prediction.",
author = "Ding, Xiao and Zhang, Yue and Liu, Ting and Duan, Junwen",
booktitle = "IJCAI",
pages = "2327-2333",
year = 2015
}
@inproceedings{socher-2013-reasoning-neural,
title = "Reasoning with neural tensor networks for knowledge base
completion",
author = "Socher, Richard and Chen, Danqi and Manning, Christopher D
and Ng, Andrew",
booktitle = "Advances in neural information processing systems",
pages = "926-934",
year = 2013
}
@inproceedings{angeli-2015-lever-ling-struct,
title = "Leveraging linguistic structure for open domain information
extraction",
author = "Angeli, Gabor and Premkumar, Melvin Johnson and Manning,
Christopher D",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics (ACL 2015)",
year = 2015
}
@inproceedings{benko-2007-open-info-extra,
author = "Banko, Michele and Cafarella, Michael J. and Soderland,
Stephen and Broadhead, Matt and Etzioni, Oren",
title = "Open Information Extraction from the Web",
booktitle = "Proceedings of the 20th International Joint Conference on
Artifical Intelligence",
series = "IJCAI'07",
year = 2007,
location = "Hyderabad, India",
pages = "2670-2676",
numpages = 7,
url = "http://dl.acm.org/citation.cfm?id=1625275.1625705",
acmid = 1625705,
publisher = "Morgan Kaufmann Publishers Inc.",
address = "San Francisco, CA, USA"
}
@inproceedings{si-2014-exploit-social,
title = "Exploiting Social Relations and Sentiment for Stock
Prediction",
author = "Si, Jianfeng and Mukherjee, Arjun and Liu, Bing and Pan,
Sinno Jialin and Li, Qing and Li, Huayi",
booktitle = "EMNLP",
volume = 14,
pages = "1139-1145",
year = 2014
}
@inproceedings{ding-2014-usings-struct-event,
title = "Using Structured Events to Predict Stock Price Movement: An
Empirical Investigation",
author = "Xiao Ding and Yue Zhang and Ting Liu and Junwen Duan",
booktitle = "EMNLP",
year = 2014
}
@inproceedings{pennington-2014-glove-global-vec,
author = "Jeffrey Pennington and Richard Socher and Christopher
D. Manning",
booktitle = "Empirical Methods in Natural Language Processing (EMNLP)",
title = "GloVe: Global Vectors for Word Representation",
year = 2014,
pages = "1532-1543",
url = "http://www.aclweb.org/anthology/D14-1162"
}
@inproceedings{baroni-2014-dont-count-pred,
title = "Don't count, predict! A systematic comparison of
context-counting vs. context-predicting semantic vectors",
author = "Marco Baroni and Georgiana Dinu and Germ{\'a}n Kruszewski",
booktitle = "ACL",
year = 2014
}
@inproceedings{morin-2005-hiera-prob,
title = "Hierarchical Probabilistic Neural Network Language Model.",
author = "Morin, Frederic and Bengio, Yoshua",
booktitle = "Aistats",
volume = 5,
pages = "246-252",
year = 2005,
organization = "Citeseer"
}
@ARTICLE{rong-2014-param-learn-explain,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1411.2738R",
archivePrefix= "arXiv",
author = "{Rong}, X.",
eprint = "1411.2738",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = nov,
primaryClass = "cs.CL",
title = "{word2vec Parameter Learning Explained}",
year = 2014
}
@inproceedings{mnih-2009-scalable-hiera,
title = "A scalable hierarchical distributed language model",
author = "Mnih, Andriy and Hinton, Geoffrey E",
booktitle = "Advances in neural information processing systems",
pages = "1081-1088",
year = 2009
}
@inproceedings{davis-2006-rela-pre-rec,
title = "The relationship between Precision-Recall and ROC curves",
author = "Davis, Jesse and Goadrich, Mark",
booktitle = "Proceedings of the 23rd international conference on Machine
learning",
pages = "233-240",
year = 2006,
organization = "ACM"
}
@ARTICLE{wojna-2017-atten-based,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170403549W",
archivePrefix= "arXiv",
author = "{Wojna}, Z. and {Gorban}, A. and {Lee}, D.-S. and {Murphy},
K. and {Yu}, Q. and {Li}, Y. and {Ibarz}, J.",
eprint = "1704.03549",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = apr,
primaryClass = "cs.CV",
title = "{Attention-Based Extraction of Structured Information From
Street View Imagery}",
year = 2017
}
@inproceedings{lee-2013-pesu-label,
title = {Pseudo-Label: The Simple and Efficient Semi-Supervised
Learning Method for Deep Neural Networks},
author = {Dong-Hyun Lee},
booktitle = {ICML 2013 Workshop on Challenges in Representation Learning},
year = 2013
}
@ARTICLE{gehring-2017-convol-sequen,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170503122G",
archivePrefix= "arXiv",
author = "{Gehring}, J. and {Auli}, M. and {Grangier}, D. and {Yarats},
D. and {Dauphin}, Y.~N.",
eprint = "1705.03122",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = may,
primaryClass = "cs.CL",
title = "{Convolutional Sequence To Sequence Learning}",
year = 2017
}
@inproceedings{dean-2012-large-scale,
title = "Large scale distributed deep networks",
author = "Dean, Jeffrey and Corrado, Greg and Monga, Rajat and Chen,
Kai and Devin, Matthieu and Mao, Mark and Senior, Andrew and
Tucker, Paul and Yang, Ke and Le, Quoc V and others",
booktitle = "Advances in neural information processing systems",
pages = "1223-1231",
year = 2012
}
@inproceedings{larochelle-2010-learn-combine,
title = "Learning to combine foveal glimpses with a third-order
Boltzmann machine",
author = "Larochelle, Hugo and Hinton, Geoffrey E",
booktitle = "Advances in neural information processing systems",
pages = "1243-1251",
year = 2010
}
@ARTICLE{denil-2011-learn-where,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2011arXiv1109.3737D",
archivePrefix= "arXiv",
author = "{Denil}, M. and {Bazzani}, L. and {Larochelle}, H. and {de
Freitas}, N.",
eprint = "1109.3737",
journal = "ArXiv e-prints",
keywords = "Computer Science - Artificial Intelligence",
month = sep,
primaryClass = "cs.AI",
title = "{Learning Where To Attend With Deep Architectures for Image
Tracking}",
year = 2011
}
@ARTICLE{yin-2015-abcnn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151205193Y",
archivePrefix= "arXiv",
author = "{Yin}, W. and {Sch{\"u}tze}, H. and {Xiang}, B. and {Zhou},
B.",
eprint = "1512.05193",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = dec,
primaryClass = "cs.CL",
title = "{ABCNN: Attention-Based Convolutional Neural Network for
Modeling Sentence Pairs}",
year = 2015
}
@ARTICLE{vinyals-2015-point-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150603134V",
archivePrefix= "arXiv",
author = "{Vinyals}, O. and {Fortunato}, M. and {Jaitly}, N.",
eprint = "1506.03134",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science -
Computational Geometry, Computer Science - Learning, Computer
Science - Neural and Evolutionary Computing",
month = jun,
primaryClass = "stat.ML",
title = "{Pointer Networks}",
year = 2015
}
@ARTICLE{johnson-2016-percep-losses,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160308155J",
archivePrefix= "arXiv",
author = "{Johnson}, J. and {Alahi}, A. and {Fei-Fei}, L.",
eprint = "1603.08155",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning",
month = mar,
primaryClass = "cs.CV",
title = "{Perceptual Losses for Real-Time Style Transfer and
Super-Resolution}",
year = 2016
}
@inproceedings{lu-2012-combin-sketch,
title = "Combining sketch and tone for pencil drawing production",
author = "Lu, Cewu and Xu, Li and Jia, Jiaya",
booktitle = "Proceedings of the Symposium on Non-Photorealistic Animation
and Rendering",
pages = "65-73",
year = 2012,
organization = "Eurographics Association"
}
@ARTICLE{chan-2015-pcanet,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015ITIP...24.5017C",
archivePrefix= "arXiv",
author = "{Chan}, T.-H. and {Jia}, K. and {Gao}, S. and {Lu}, J. and
{Zeng}, Z. and {Ma}, Y.",
doi = "10.1109/TIP.2015.2475625",
eprint = "1404.3606",
journal = "IEEE Transactions on Image Processing",
month = dec,
pages = "5017-5032",
primaryClass = "cs.CV",
title = "{PCANet: A Simple Deep Learning Baseline for Image
Classification?}",
url = "https://doi.org/10.1109/TIP.2015.2475625",
volume = 24,
year = 2015
}
@ARTICLE{kiros-2015-skip-thought,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150606726K",
archivePrefix= "arXiv",
author = "{Kiros}, R. and {Zhu}, Y. and {Salakhutdinov}, R. and
{Zemel}, R.~S. and {Torralba}, A. and {Urtasun}, R. and
{Fidler}, S.",
eprint = "1506.06726",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jun,
primaryClass = "cs.CL",
title = "{Skip-Thought Vectors}",
year = 2015
}
@inproceedings{manning-2014-stanford-core,
  title = "The Stanford CoreNLP Natural Language Processing Toolkit",
author = "Manning, Christopher D and Surdeanu, Mihai and Bauer, John
and Finkel, Jenny Rose and Bethard, Steven and McClosky,
David",
booktitle = "ACL (System Demonstrations)",
pages = "55-60",
year = 2014
}
@article{graves-2005-frame-phone,
title = "Framewise phoneme classification with bidirectional LSTM and
other neural network architectures",
author = "Graves, Alex and Schmidhuber, J{\"u}rgen",
journal = "Neural Networks",
volume = 18,
number = 5,
pages = "602-610",
year = 2005,
publisher = "Elsevier"
}
@inproceedings{graves-2006-conn-temp,
title = "Connectionist temporal classification: labelling unsegmented
sequence data with recurrent neural networks",
author = "Graves, Alex and Fern{\'a}ndez, Santiago and Gomez, Faustino
and Schmidhuber, J{\"u}rgen",
booktitle = "Proceedings of the 23rd international conference on Machine
learning",
pages = "369-376",
year = 2006,
organization = "ACM"
}
@ARTICLE{szegedy-2015-rethin-incep,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151200567S",
archivePrefix= "arXiv",
author = "{Szegedy}, C. and {Vanhoucke}, V. and {Ioffe}, S. and
{Shlens}, J. and {Wojna}, Z.",
eprint = "1512.00567",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = dec,
primaryClass = "cs.CV",
title = "{Rethinking the Inception Architecture for Computer Vision}",
year = 2015
}
@article{davis-1959-leonhard-euler,
  title = "Leonhard Euler's integral: A historical profile of the gamma
           function: In memoriam: Milton Abramowitz",
author = "Davis, Philip J",
journal = "The American Mathematical Monthly",
volume = 66,
number = 10,
pages = "849-869",
year = 1959,
publisher = "JSTOR"
}
@article{heinrich-2008-param-esti,
title = "Parameter estimation for text analysis",
author = "Heinrich, Gregor",
year = 2008
}
@article{takacs-2007-major-comp,
title = "Major components of the gravity recommendation system",
author = "Tak{\'a}cs, G{\'a}bor and Pil{\'a}szy, Istv{\'a}n and
N{\'e}meth, Botty{\'a}n and Tikk, Domonkos",
journal = "ACM SIGKDD Explorations Newsletter",
volume = 9,
number = 2,
pages = "80-83",
year = 2007,
publisher = "ACM"
}
@article{takacs-2007-gravity-recomm,
title = "On the Gravity Recommendation System",
author = "Takacs, Gabor and Pilaszy, Istvan and Nemeth, Bottyan and
Tikk, Domonkos",
year = 2007,
  publisher = "Citeseer"
}
@inproceedings{yang-2015-net-repr,
title = "Network representation learning with rich text information",
author = "Yang, Cheng and Liu, Zhiyuan and Zhao, Deli and Sun, Maosong
and Chang, Edward",
booktitle = "Twenty-Fourth International Joint Conference on Artificial
Intelligence",
year = 2015
}
@ARTICLE{perozzi-2014-deepw,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1403.6652P",
archivePrefix= "arXiv",
author = "{Perozzi}, B. and {Al-Rfou}, R. and {Skiena}, S.",
eprint = "1403.6652",
journal = "ArXiv e-prints",
keywords = "Computer Science - Social and Information Networks, Computer
Science - Learning, H.2.8, I.2.6, I.5.1",
month = mar,
title = "{DeepWalk: Online Learning of Social Representations}",
year = 2014
}
@article{tibshirani-1996-regres-shrink,
title = "Regression shrinkage and selection via the lasso",
author = "Tibshirani, Robert",
journal = "Journal of the Royal Statistical Society. Series B
(Methodological)",
pages = "267-288",
year = 1996,
publisher = "JSTOR"
}
@ARTICLE{collobert-2011-natur-languag,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2011arXiv1103.0398C",
archivePrefix= "arXiv",
author = "{Collobert}, R. and {Weston}, J. and {Bottou}, L. and
{Karlen}, M. and {Kavukcuoglu}, K. and {Kuksa}, P.",
eprint = "1103.0398",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = mar,
primaryClass = "cs.LG",
title = "{Natural Language Processing (almost) From Scratch}",
year = 2011
}
@ARTICLE{jin-2017-how-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170300887J",
archivePrefix= "arXiv",
author = "{Jin}, C. and {Ge}, R. and {Netrapalli}, P. and {Kakade},
S.~M. and {Jordan}, M.~I.",
eprint = "1703.00887",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Mathematics - Optimization and
Control, Statistics - Machine Learning",
month = mar,
primaryClass = "cs.LG",
title = "{How To Escape Saddle Points Efficiently}",
year = 2017
}
@inproceedings{mihalcea-2004-textrank-bring,
title = "TextRank: Bringing order into texts",
author = "Mihalcea, Rada and Tarau, Paul",
year = 2004,
organization = "Association for Computational Linguistics",
  booktitle = "Proceedings of the 2004 Conference on Empirical Methods
               in Natural Language Processing"
}
@article{wang-2009-more-suit,
  title = "Which is More Suitable for Chinese Word Segmentation, the
           Generative Model or the Discriminative One?",
  author = "Wang, Kun and Zong, Chengqing",
  year = 2009
}
@inproceedings{wang-2011-online-varia,
title = "Online variational inference for the hierarchical Dirichlet
process",
author = "Wang, Chong and Paisley, John and Blei, David",
booktitle = "Proceedings of the Fourteenth International Conference on
Artificial Intelligence and Statistics",
pages = "752-760",
year = 2011
}
@inproceedings{hoffman-2010-online-learn,
  title = "Online learning for latent Dirichlet allocation",
  author = "Hoffman, Matthew and Bach, Francis R and Blei, David M",
  booktitle = "Advances in neural information processing systems",
pages = "856-864",
year = 2010
}
@article{yamamoto-2001-using-suffix,
title = "Using suffix arrays to compute term frequency and document
frequency for all substrings in a corpus",
author = "Yamamoto, Mikio and Church, Kenneth W",
journal = "Computational Linguistics",
volume = 27,
number = 1,
pages = "1-30",
year = 2001,
publisher = "MIT press"
}
@article{etzioni-2005-unsupervised-named-entity,
title = "Unsupervised named-entity extraction from the web: An
experimental study",
author = "Etzioni, Oren and Cafarella, Michael and Downey, Doug and
Popescu, Ana-Maria and Shaked, Tal and Soderland, Stephen and
Weld, Daniel S and Yates, Alexander",
journal = "Artificial intelligence",
volume = 165,
number = 1,
pages = "91-134",
year = 2005,
publisher = "Elsevier"
}
@inproceedings{singh-2010-minimally-super,
title = "Minimally-supervised extraction of entities from text
advertisements",
author = "Singh, Sameer and Hillard, Dustin and Leggetter, Chris",
booktitle = "Human Language Technologies: The 2010 Annual Conference of
the North American Chapter of the Association for
Computational Linguistics",
pages = "73-81",
year = 2010,
organization = "Association for Computational Linguistics"
}
@inproceedings{liu-2011-recogn-named,
title = "Recognizing named entities in tweets",
author = "Liu, Xiaohua and Zhang, Shaodian and Wei, Furu and Zhou,
Ming",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies-Volume
1",
pages = "359-367",
year = 2011,
organization = "Association for Computational Linguistics"
}
@ARTICLE{lample-2016-neural-archit,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160301360L",
archivePrefix= "arXiv",
author = "{Lample}, G. and {Ballesteros}, M. and {Subramanian}, S. and
{Kawakami}, K. and {Dyer}, C.",
eprint = "1603.01360",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = mar,
primaryClass = "cs.CL",
title = "{Neural Architectures for Named Entity Recognition}",
year = 2016
}
@ARTICLE{rei-2016-atten-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161104361R",
archivePrefix= "arXiv",
author = "{Rei}, M. and {Crichton}, G.~K.~O. and {Pyysalo}, S.",
eprint = "1611.04361",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing, I.5.1,
I.2.6, I.2.7",
month = nov,
primaryClass = "cs.CL",
title = "{Attending To Characters in Neural Sequence Labeling Models}",
year = 2016
}
@inproceedings{bharadwaj-2016-phono-aware,
title = "Phonologically Aware Neural Model for Named Entity
Recognition in Low Resource Transfer Settings",
author = "Akash Bharadwaj and David R. Mortensen and Chris Dyer and
Jaime G. Carbonell",
booktitle = "EMNLP",
year = 2016
}
@ARTICLE{yang-2017-trans-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170306345Y",
archivePrefix= "arXiv",
author = "{Yang}, Z. and {Salakhutdinov}, R. and {Cohen}, W.~W.",
eprint = "1703.06345",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = mar,
primaryClass = "cs.CL",
title = "{Transfer Learning for Sequence Tagging With Hierarchical
Recurrent Networks}",
year = 2017
}
@ARTICLE{peters-2017-semi-super,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170500108P",
archivePrefix= "arXiv",
author = "{Peters}, M.~E. and {Ammar}, W. and {Bhagavatula}, C. and
{Power}, R.",
eprint = "1705.00108",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = apr,
primaryClass = "cs.CL",
  title = "{Semi-Supervised Sequence Tagging With Bidirectional Language
           Models}",
year = 2017
}
@article{rose-2010-auto-key,
title = "Automatic keyword extraction from individual documents",
author = "Rose, Stuart and Engel, Dave and Cramer, Nick and Cowley,
Wendy",
journal = "Text Mining",
pages = "1-20",
year = 2010
}
@inproceedings{hasan-2014-auto-key,
title = "Automatic Keyphrase Extraction: A Survey of the State of the
Art.",
author = "Hasan, Kazi Saidul and Ng, Vincent",
booktitle = "ACL (1)",
pages = "1262-1273",
year = 2014
}
@inproceedings{yih-2006-find-advert,
title = "Finding advertising keywords on web pages",
author = "Yih, Wen-tau and Goodman, Joshua and Carvalho, Vitor R",
booktitle = "Proceedings of the 15th international conference on World
Wide Web",
pages = "213-222",
year = 2006,
organization = "ACM"
}
@inproceedings{jiang-2009-rank-approach,
title = "A ranking approach to keyphrase extraction",
author = "Jiang, Xin and Hu, Yunhua and Li, Hang",
booktitle = "Proceedings of the 32nd international ACM SIGIR conference on
Research and development in information retrieval",
pages = "756-757",
year = 2009,
organization = "ACM"
}
@inproceedings{liu-2009-unsupervised-approach,
title = "Unsupervised approaches for automatic keyword extraction
using meeting transcripts",
author = "Liu, Feifan and Pennell, Deana and Liu, Fei and Liu, Yang",
booktitle = "Proceedings of human language technologies: The 2009 annual
conference of the North American chapter of the association
for computational linguistics",
pages = "620-628",
year = 2009,
organization = "Association for Computational Linguistics"
}
@inproceedings{witten-1999-kea,
author = "Witten, Ian H. and Paynter, Gordon W. and Frank, Eibe and
Gutwin, Carl and Nevill-Manning, Craig G.",
title = "KEA: Practical Automatic Keyphrase Extraction",
booktitle = "Proceedings of the Fourth ACM Conference on Digital
Libraries",
series = "DL '99",
year = 1999,
isbn = "1-58113-145-3",
location = "Berkeley, California, USA",
pages = "254-255",
numpages = 2,
url = "http://doi.acm.org/10.1145/313238.313437",
doi = "10.1145/313238.313437",
acmid = 313437,
publisher = "ACM",
address = "New York, NY, USA"
}
@inproceedings{liu-2010-auto-key,
title = "Automatic keyphrase extraction via topic decomposition",
author = "Liu, Zhiyuan and Huang, Wenyi and Zheng, Yabin and Sun,
Maosong",
booktitle = "Proceedings of the 2010 conference on empirical methods in
natural language processing",
pages = "366-376",
year = 2010,
organization = "Association for Computational Linguistics"
}
@article{chuang-2012-without-cluster,
title = "“Without the Clutter of Unimportant Words”: Descriptive
keyphrases for text visualization",
author = "Chuang, Jason and Manning, Christopher D and Heer, Jeffrey",
journal = "ACM Transactions on Computer-Human Interaction (TOCHI)",
volume = 19,
number = 3,
pages = 19,
year = 2012,
publisher = "ACM"
}
@inproceedings{mei-2010-divrank,
  title = "DivRank: the interplay of prestige and diversity in
           information networks",
author = "Mei, Qiaozhu and Guo, Jian and Radev, Dragomir",
booktitle = "Proceedings of the 16th ACM SIGKDD international conference
on Knowledge discovery and data mining",
pages = "1009-1018",
year = 2010,
  organization = "ACM"
}
@inproceedings{hasan-2010-conundrums-unspervised,
title = "Conundrums in unsupervised keyphrase extraction: making sense
of the state-of-the-art",
author = "Hasan, Kazi Saidul and Ng, Vincent",
booktitle = "Proceedings of the 23rd International Conference on
Computational Linguistics: Posters",
pages = "365-373",
year = 2010,
organization = "Association for Computational Linguistics"
}
@inproceedings{wan-2008-single-doc,
title = "Single document keyphrase extraction using neighborhood
knowledge",
author = "Wan, Xiaojun and Xiao, Jianguo",
booktitle = "Proceedings of the 23rd national conference on Artificial
intelligence-Volume 2",
pages = "855-860",
year = 2008,
organization = "AAAI Press"
}
@inproceedings{wan-2008-collabrank,
title = "CollabRank: towards a collaborative approach to
single-document keyphrase extraction",
author = "Wan, Xiaojun and Xiao, Jianguo",
booktitle = "Proceedings of the 22nd International Conference on
Computational Linguistics-Volume 1",
pages = "969-976",
year = 2008,
organization = "Association for Computational Linguistics"
}
@techreport{page-1999-page-rank,
title = "The PageRank citation ranking: Bringing order to the web.",
author = "Page, Lawrence and Brin, Sergey and Motwani, Rajeev and
Winograd, Terry",
year = 1999,
institution = "Stanford InfoLab"
}
@ARTICLE{barrios-2016-variat-simil,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160203606B",
archivePrefix= "arXiv",
author = "{Barrios}, F. and {L{\'o}pez}, F. and {Argerich}, L. and
{Wachenchauzer}, R.",
eprint = "1602.03606",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Information Retrieval, I.2.7",
month = feb,
primaryClass = "cs.CL",
  title = "{Variations of the Similarity Function of TextRank for
           Automated Summarization}",
year = 2016
}
@article{gimpel-2006-model-topics,
title = "Modeling Topics",
author = "Gimpel, Kevin",
  year = 2006
}
@ARTICLE{salimans-2016-weigh-normal,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160207868S",
archivePrefix= "arXiv",
author = "{Salimans}, T. and {Kingma}, D.~P.",
eprint = "1602.07868",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Artificial
Intelligence, Computer Science - Neural and Evolutionary
Computing",
month = feb,
primaryClass = "cs.LG",
title = "{Weight Normalization: A Simple Reparameterization To
Accelerate Training of Deep Neural Networks}",
year = 2016
}
@ARTICLE{lei-2016-layer-normal,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160706450L",
archivePrefix= "arXiv",
  author = "{Ba}, J.~L. and {Kiros}, J.~R. and {Hinton}, G.~E.",
eprint = "1607.06450",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = jul,
primaryClass = "stat.ML",
title = "{Layer Normalization}",
year = 2016
}
@ARTICLE{ioffe-2015-batch-normal,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150203167I",
archivePrefix= "arXiv",
author = "{Ioffe}, S. and {Szegedy}, C.",
eprint = "1502.03167",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = feb,
primaryClass = "cs.LG",
title = "{Batch Normalization: Accelerating Deep Network Training By
Reducing Internal Covariate Shift}",
year = 2015
}
@article{shimodaira-2000-improv-predict,
title = "Improving predictive inference under covariate shift by
weighting the log-likelihood function",
author = "Shimodaira, Hidetoshi",
journal = "Journal of statistical planning and inference",
volume = 90,
number = 2,
pages = "227-244",
year = 2000,
publisher = "Elsevier"
}
@ARTICLE{mikolov-2013-exploit-simil,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1309.4168M",
archivePrefix= "arXiv",
author = "{Mikolov}, T. and {Le}, Q.~V. and {Sutskever}, I.",
eprint = "1309.4168",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = sep,
primaryClass = "cs.CL",
title = "{Exploiting Similarities Among Languages for Machine
Translation}",
year = 2013
}
@ARTICLE{deng-2016-image-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160904938D",
archivePrefix= "arXiv",
author = "{Deng}, Y. and {Kanervisto}, A. and {Ling}, J. and {Rush},
A.~M.",
eprint = "1609.04938",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary
Computing",
month = sep,
primaryClass = "cs.CV",
title = "{Image-To-Markup Generation With Coarse-To-Fine Attention}",
year = 2016
}
@inproceedings{minka-2001-automatic-choice,
title = "Automatic choice of dimensionality for PCA",
author = "Minka, Thomas P",
booktitle = "Advances in neural information processing systems",
pages = "598-604",
year = 2001
}
@ARTICLE{le-2014-distr-repres-senten-docum,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1405.4053L",
archivePrefix= "arXiv",
author = "{Le}, Q.~V. and {Mikolov}, T.",
eprint = "1405.4053",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning",
month = may,
primaryClass = "cs.CL",
title = "{Distributed Representations of Sentences and Documents}",
year = 2014
}
@article{arora-2016-simple-tough,
title = {A simple but tough-to-beat baseline for sentence embeddings},
author = {Arora, Sanjeev and Liang, Yingyu and Ma, Tengyu},
  year = 2016
}
@ARTICLE{bojanowski-2016-fasttext,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160704606B",
archivePrefix= "arXiv",
author = "{Bojanowski}, P. and {Grave}, E. and {Joulin}, A. and
{Mikolov}, T.",
eprint = "1607.04606",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jul,
primaryClass = "cs.CL",
title = "{Enriching Word Vectors With Subword Information}",
year = 2016
}
@ARTICLE{srivastava-2015-highw-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150500387S",
archivePrefix= "arXiv",
author = "{Srivastava}, R.~K. and {Greff}, K. and {Schmidhuber}, J.",
eprint = "1505.00387",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing, 68T01, I.2.6, G.1.6",
month = may,
primaryClass = "cs.LG",
title = "{Highway Networks}",
year = 2015
}
@ARTICLE{kalchbrenner-2014-convol-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1404.2188K",
archivePrefix= "arXiv",
author = "{Kalchbrenner}, N. and {Grefenstette}, E. and {Blunsom}, P.",
eprint = "1404.2188",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = apr,
primaryClass = "cs.CL",
title = "{A Convolutional Neural Network for Modelling Sentences}",
year = 2014
}
@InProceedings{matt-2015-word-embed,
title = "From Word Embeddings To Document Distances",
author = "Matt Kusner and Yu Sun and Nicholas Kolkin and Kilian
Weinberger",
booktitle = "Proceedings of the 32nd International Conference on Machine
Learning",
pages = "957-966",
year = 2015,
editor = "Francis Bach and David Blei",
volume = 37,
series = "Proceedings of Machine Learning Research",
address = "Lille, France",
month = "07--09 Jul",
publisher = "PMLR",
pdf = "http://proceedings.mlr.press/v37/kusnerb15.pdf",
url = "http://proceedings.mlr.press/v37/kusnerb15.html",
abstract = "We present the Word Mover’s Distance (WMD), a novel distance
function between text documents. Our work is based on recent
results in word embeddings that learn semantically meaningful
representations for words from local co-occurrences in
sentences. The WMD distance measures the dissimilarity
between two text documents as the minimum amount of distance
that the embedded words of one document need to ``travel'' to
reach the embedded words of another document. We show that
this distance metric can be cast as an instance of the Earth
Mover’s Distance, a well studied transportation problem for
which several highly efficient solvers have been
developed. Our metric has no hyperparameters and is
straight-forward to implement. Further, we demonstrate on
eight real world document classification data sets, in
comparison with seven state-of-the-art baselines, that the
WMD metric leads to unprecedented low k-nearest neighbor
document classification error rates."
}
@ARTICLE{brokos-2016-using-centr,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160803905B",
archivePrefix= "arXiv",
author = "{Brokos}, G.-I. and {Malakasiotis}, P. and {Androutsopoulos},
I.",
eprint = "1608.03905",
journal = "ArXiv e-prints",
keywords = "Computer Science - Information Retrieval",
month = aug,
primaryClass = "cs.IR",
title = "{Using Centroids of Word Embeddings and Word Mover's Distance
for Biomedical Document Retrieval in Question Answering}",
year = 2016
}
@ARTICLE{dai-2015-docum-embed,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150707998D",
archivePrefix= "arXiv",
author = "{Dai}, A.~M. and {Olah}, C. and {Le}, Q.~V.",
eprint = "1507.07998",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning",
month = jul,
primaryClass = "cs.CL",
title = "{Document Embedding With Paragraph Vectors}",
year = 2015
}
@ARTICLE{lau-2016-empir-evaluat,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160705368L",
archivePrefix= "arXiv",
author = "{Lau}, J.~H. and {Baldwin}, T.",
eprint = "1607.05368",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jul,
primaryClass = "cs.CL",
title = "{An Empirical Evaluation of Doc2vec With Practical Insights
Into Document Embedding Generation}",
year = 2016
}
@inproceedings{polajnar-2015-exploration-discourse,
title = "An exploration of discourse-based sentence spaces for
compositional distributional semantics",
author = "Polajnar, Tamara and Rimell, Laura and Clark, Stephen",
booktitle = "Workshop on Linking Models of Lexical, Sentential and
Discourse-level Semantics (LSDSem)",
pages = 1,
year = 2015
}
@inproceedings{socher-2011-semi-supervised,
title = "Semi-supervised recursive autoencoders for predicting
sentiment distributions",
author = "Socher, Richard and Pennington, Jeffrey and Huang, Eric H and
Ng, Andrew Y and Manning, Christopher D",
booktitle = "Proceedings of the conference on empirical methods in natural
language processing",
pages = "151-161",
year = 2011,
organization = "Association for Computational Linguistics"
}
@article{hodosh-2013-framing-image,
title = "Framing image description as a ranking task: Data, models and
evaluation metrics",
author = "Hodosh, Micah and Young, Peter and Hockenmaier, Julia",
journal = "Journal of Artificial Intelligence Research",
volume = 47,
pages = "853-899",
year = 2013
}
@inproceedings{shen-2014-latent-semantic,
title = "A latent semantic model with convolutional-pooling structure
for information retrieval",
author = "Shen, Yelong and He, Xiaodong and Gao, Jianfeng and Deng, Li
and Mesnil, Gr{\'e}goire",
booktitle = "Proceedings of the 23rd ACM International Conference on
Conference on Information and Knowledge Management",
pages = "101-110",
year = 2014,
organization = "ACM"
}
@ARTICLE{xiong-2016-dynam-memor,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160301417X",
archivePrefix= "arXiv",
author = "{Xiong}, C. and {Merity}, S. and {Socher}, R.",
eprint = "1603.01417",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Computation and Language, Computer Science
- Computer Vision and Pattern Recognition",
month = mar,
title = "{Dynamic Memory Networks for Visual and Textual Question
Answering}",
year = 2016
}
@ARTICLE{zeng-2016-effic-summar,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161103382Z",
archivePrefix= "arXiv",
author = "{Zeng}, W. and {Luo}, W. and {Fidler}, S. and {Urtasun}, R.",
eprint = "1611.03382",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = nov,
primaryClass = "cs.CL",
title = "{Efficient Summarization With Read-Again and Copy Mechanism}",
year = 2016
}
@ARTICLE{lai-2015-how-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150705523L",
archivePrefix= "arXiv",
author = "{Lai}, S. and {Liu}, K. and {Xu}, L. and {Zhao}, J.",
eprint = "1507.05523",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jul,
primaryClass = "cs.CL",
title = "{How To Generate a Good Word Embedding?}",
year = 2015
}
@inproceedings{chen-2015-revisit-word,
title = "Revisiting Word Embedding for Contrasting Meaning",
author = "Zhigang Chen and Wei Lin and Qian Chen and Xiaoping Chen and
Si Wei and Hui Jiang and Xiao-Dan Zhu",
booktitle = "ACL",
year = 2015
}
@inproceedings{lazaridou-2015-hubness-pollution,
title = "Hubness and Pollution: Delving into Cross-Space Mapping for
Zero-Shot Learning",
author = "Angeliki Lazaridou and Georgiana Dinu and Marco Baroni",
booktitle = "ACL",
year = 2015
}
@ARTICLE{yin-2017-compar-study,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170201923Y",
archivePrefix= "arXiv",
author = "{Yin}, W. and {Kann}, K. and {Yu}, M. and {Sch{\"u}tze}, H.",
eprint = "1702.01923",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = feb,
primaryClass = "cs.CL",
  title = "{Comparative Study of CNN and RNN for Natural Language
           Processing}",
year = 2017
}
@ARTICLE{zhang-2015-sensit-analy,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151003820Z",
archivePrefix= "arXiv",
author = "{Zhang}, Y. and {Wallace}, B.",
eprint = "1510.03820",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing",
month = oct,
primaryClass = "cs.CL",
title = "{A Sensitivity Analysis of (and Practitioners' Guide to)
Convolutional Neural Networks for Sentence Classification}",
year = 2015
}
@ARTICLE{johnson-2014-effec-use,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.1058J",
archivePrefix= "arXiv",
author = "{Johnson}, R. and {Zhang}, T.",
eprint = "1412.1058",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Statistics - Machine Learning",
month = dec,
primaryClass = "cs.CL",
title = "{Effective Use of Word Order for Text Categorization With
Convolutional Neural Networks}",
year = 2014
}
@ARTICLE{johnson-2015-semi-super,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150401255J",
archivePrefix= "arXiv",
author = "{Johnson}, R. and {Zhang}, T.",
eprint = "1504.01255",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Computation
and Language, Computer Science - Learning",
month = apr,
primaryClass = "stat.ML",
title = "{Semi-Supervised Convolutional Neural Networks for Text
Categorization Via Region Embedding}",
year = 2015
}
@ARTICLE{zhang-2015-charac-level,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150901626Z",
archivePrefix= "arXiv",
author = "{Zhang}, X. and {Zhao}, J. and {LeCun}, Y.",
eprint = "1509.01626",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = sep,
primaryClass = "cs.LG",
title = "{Character-Level Convolutional Networks for Text
Classification}",
year = 2015
}
@ARTICLE{zhang-2015-text-under-from-scrat,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150201710Z",
archivePrefix= "arXiv",
author = "{Zhang}, X. and {LeCun}, Y.",
eprint = "1502.01710",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = feb,
primaryClass = "cs.LG",
title = "{Text Understanding From Scratch}",
year = 2015
}
@article{schuster-1997-bidirectional-recurrent,
title = "Bidirectional recurrent neural networks",
author = "Schuster, Mike and Paliwal, Kuldip K",
journal = "IEEE Transactions on Signal Processing",
volume = 45,
number = 11,
pages = "2673-2681",
year = 1997,
publisher = "IEEE"
}
@article{chen-2015-event-extract,
title = "Event Extraction via Dynamic Multi-Pooling Convolutional
Neural Networks",
author = "Chen, Yubo and Xu, Liheng and Liu, Kang and Zeng, Daojian and
Zhao, Jun and others",
  year = 2015
}
@ARTICLE{bengio-2012-repres-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1206.5538B",
archivePrefix= "arXiv",
author = "{Bengio}, Y. and {Courville}, A. and {Vincent}, P.",
eprint = "1206.5538",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = jun,
primaryClass = "cs.LG",
title = "{Representation Learning: A Review and New Perspectives}",
year = 2012
}
@inproceedings{le-2011-ica-recons,
title = "ICA with reconstruction cost for efficient overcomplete
feature learning",
author = "Le, Quoc V and Karpenko, Alexandre and Ngiam, Jiquan and Ng,
Andrew Y",
booktitle = "Advances in Neural Information Processing Systems",
pages = "1017-1025",
year = 2011
}
@ARTICLE{goodfellow-2013-maxout-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1302.4389G",
archivePrefix= "arXiv",
author = "{Goodfellow}, I.~J. and {Warde-Farley}, D. and {Mirza},
M. and {Courville}, A. and {Bengio}, Y.",
eprint = "1302.4389",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = feb,
primaryClass = "stat.ML",
title = "{Maxout Networks}",
year = 2013
}
@ARTICLE{he-2015-delvin-deep-into-rectif,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150201852H",
archivePrefix= "arXiv",
author = "{He}, K. and {Zhang}, X. and {Ren}, S. and {Sun}, J.",
eprint = "1502.01852",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Artificial Intelligence, Computer Science
- Learning",
month = feb,
primaryClass = "cs.CV",
  title = "{Delving Deep Into Rectifiers: Surpassing Human-Level
           Performance on ImageNet Classification}",
year = 2015
}
@ARTICLE{schmidhuber-2014-deep-learn-neural-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1404.7828S",
archivePrefix= "arXiv",
author = "{Schmidhuber}, J.",
eprint = "1404.7828",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Learning",
month = apr,
title = "{Deep Learning in Neural Networks: An Overview}",
year = 2014
}
@article{dahl-2012-context-depend,
title = "Context-dependent pre-trained deep neural networks for
large-vocabulary speech recognition",
author = "Dahl, George E and Yu, Dong and Deng, Li and Acero, Alex",
journal = "IEEE Transactions on audio, speech, and language processing",
volume = 20,
number = 1,
pages = "30-42",
year = 2012,
publisher = "IEEE"
}
@ARTICLE{romero-2014-fitnet,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.6550R",
archivePrefix= "arXiv",
author = "{Romero}, A. and {Ballas}, N. and {Ebrahimi Kahou}, S. and
{Chassang}, A. and {Gatta}, C. and {Bengio}, Y.",
eprint = "1412.6550",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = dec,
primaryClass = "cs.LG",
title = "{FitNets: Hints for Thin Deep Nets}",
year = 2014
}
@ARTICLE{srivastava-2015-train-very-deep-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150706228S",
archivePrefix= "arXiv",
author = "{Srivastava}, R.~K. and {Greff}, K. and {Schmidhuber}, J.",
eprint = "1507.06228",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing, 68T01, I.2.6, G.1.6",
month = jul,
primaryClass = "cs.LG",
title = "{Training Very Deep Networks}",
year = 2015
}
@ARTICLE{huang-2016-densel-connec-convol-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160806993H",
archivePrefix= "arXiv",
author = "{Huang}, G. and {Liu}, Z. and {Weinberger}, K.~Q. and {van
der Maaten}, L.",
eprint = "1608.06993",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning",
month = aug,
primaryClass = "cs.CV",
title = "{Densely Connected Convolutional Networks}",
year = 2016
}
@ARTICLE{he-2016-ident-mappin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160305027H",
archivePrefix= "arXiv",
author = "{He}, K. and {Zhang}, X. and {Ren}, S. and {Sun}, J.",
eprint = "1603.05027",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning",
month = mar,
primaryClass = "cs.CV",
title = "{Identity Mappings in Deep Residual Networks}",
year = 2016
}
@ARTICLE{veit-2016-resid-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160506431V",
archivePrefix= "arXiv",
author = "{Veit}, A. and {Wilber}, M. and {Belongie}, S.",
eprint = "1605.06431",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Artificial Intelligence, Computer Science
- Learning, Computer Science - Neural and Evolutionary
Computing",
month = may,
primaryClass = "cs.CV",
title = "{Residual Networks Behave Like Ensembles of Relatively
Shallow Networks}",
year = 2016
}
@ARTICLE{zagoruyko-2016-wide-resid-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160507146Z",
archivePrefix= "arXiv",
author = "{Zagoruyko}, S. and {Komodakis}, N.",
eprint = "1605.07146",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = may,
primaryClass = "cs.CV",
title = "{Wide Residual Networks}",
year = 2016
}
@ARTICLE{telgarsky-2016-benef-depth-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160204485T",
archivePrefix= "arXiv",
author = "{Telgarsky}, M.",
eprint = "1602.04485",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.LG",
  title = "{Benefits of Depth in Neural Networks}",
year = 2016
}
@ARTICLE{huang-2016-deep-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160309382H",
archivePrefix= "arXiv",
author = "{Huang}, G. and {Sun}, Y. and {Liu}, Z. and {Sedra}, D. and
{Weinberger}, K.",
eprint = "1603.09382",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computer
Vision and Pattern Recognition, Computer Science - Neural and
Evolutionary Computing",
month = mar,
primaryClass = "cs.LG",
title = "{Deep Networks With Stochastic Depth}",
year = 2016
}
@ARTICLE{eldan-2015-power-depth,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151203965E",
archivePrefix= "arXiv",
author = "{Eldan}, R. and {Shamir}, O.",
eprint = "1512.03965",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing, Statistics - Machine Learning",
month = dec,
primaryClass = "cs.LG",
title = "{The Power of Depth for Feedforward Neural Networks}",
year = 2015
}
@ARTICLE{liao-2016-bridg-gaps,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160403640L",
archivePrefix= "arXiv",
author = "{Liao}, Q. and {Poggio}, T.",
eprint = "1604.03640",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = apr,
primaryClass = "cs.LG",
title = "{Bridging the Gaps Between Residual Learning, Recurrent
Neural Networks and Visual Cortex}",
year = 2016
}
@ARTICLE{greff-2016-highw-resid,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161207771G",
archivePrefix= "arXiv",
author = "{Greff}, K. and {Srivastava}, R.~K. and {Schmidhuber}, J.",
eprint = "1612.07771",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Artificial Intelligence, Computer Science
- Learning, I.2.6, I.5.1",
month = dec,
title = "{Highway and Residual Networks Learn Unrolled Iterative
Estimation}",
year = 2016
}
@ARTICLE{xie-2016-aggreg-resid,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161105431X",
archivePrefix= "arXiv",
author = "{Xie}, S. and {Girshick}, R. and {Doll{\'a}r}, P. and {Tu},
Z. and {He}, K.",
eprint = "1611.05431",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = nov,
primaryClass = "cs.CV",
title = "{Aggregated Residual Transformations for Deep Neural
Networks}",
year = 2016
}
@ARTICLE{alain-2016-under-inter,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161001644A",
archivePrefix= "arXiv",
author = "{Alain}, G. and {Bengio}, Y.",
eprint = "1610.01644",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = oct,
primaryClass = "stat.ML",
  title = "{Understanding Intermediate Layers Using Linear Classifier
           Probes}",
year = 2016
}
@ARTICLE{yosinski-2014-how-trans,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1411.1792Y",
archivePrefix= "arXiv",
author = "{Yosinski}, J. and {Clune}, J. and {Bengio}, Y. and {Lipson},
H.",
eprint = "1411.1792",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = nov,
primaryClass = "cs.LG",
  title = "{How Transferable Are Features in Deep Neural Networks?}",
year = 2014
}
@inproceedings{levy-2014-neural-word,
title = "Neural Word Embedding as Implicit Matrix Factorization",
author = "Levy, Omer and Goldberg, Yoav",
booktitle = "Advances in Neural Information Processing Systems 27",
editor = "Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence
and K. Q. Weinberger",
pages = "2177-2185",
year = 2014,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/5477-neural-word-embedding-as-implicit-matrix-factorization.pdf"
}
@ARTICLE{dyer-2014-notes-noise,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.8251D",
archivePrefix= "arXiv",
author = "{Dyer}, C.",
eprint = "1410.8251",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = oct,
primaryClass = "cs.LG",
title = "{Notes on Noise Contrastive Estimation and Negative
Sampling}",
year = 2014
}
@inproceedings{levy-2014-ling-regul,
title = "Linguistic Regularities in Sparse and Explicit Word
Representations",
  author = "Levy, Omer and Goldberg, Yoav",
booktitle = "CoNLL",
pages = "171-180",
year = 2014
}
@ARTICLE{arora-2015-rand-walk,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150203520A",
archivePrefix= "arXiv",
author = "{Arora}, S. and {Li}, Y. and {Liang}, Y. and {Ma}, T. and
{Risteski}, A.",
eprint = "1502.03520",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.LG",
title = "{RAND-WALK: A Latent Variable Model Approach To Word
Embeddings}",
year = 2015
}
@ARTICLE{saxe-2013-exact-solut,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.6120S",
archivePrefix= "arXiv",
author = "{Saxe}, A.~M. and {McClelland}, J.~L. and {Ganguli}, S.",
eprint = "1312.6120",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Condensed Matter - Disordered Systems and Neural Networks,
Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Quantitative Biology - Neurons
and Cognition, Statistics - Machine Learning",
month = dec,
  title = "{Exact Solutions To the Nonlinear Dynamics of Learning in
           Deep Linear Neural Networks}",
year = 2013
}
@ARTICLE{mishkin-2015-all-you,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106422M",
archivePrefix= "arXiv",
author = "{Mishkin}, D. and {Matas}, J.",
eprint = "1511.06422",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = nov,
primaryClass = "cs.LG",
  title = "{All You Need Is a Good Init}",
year = 2015
}
@ARTICLE{kraehenbuehl-2015-data-depen,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106856K",
archivePrefix= "arXiv",
author = "{Kr{\"a}henb{\"u}hl}, P. and {Doersch}, C. and {Donahue},
J. and {Darrell}, T.",
eprint = "1511.06856",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning",
month = nov,
primaryClass = "cs.CV",
title = "{Data-Dependent Initializations of Convolutional Neural
Networks}",
year = 2015
}
@ARTICLE{britz-2017-massiv-explor,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170303906B",
archivePrefix= "arXiv",
author = "{Britz}, D. and {Goldie}, A. and {Luong}, M.-T. and {Le}, Q.",
eprint = "1703.03906",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = mar,
primaryClass = "cs.CL",
title = "{Massive Exploration of Neural Machine Translation
Architectures}",
year = 2017
}
@ARTICLE{neubig-2017-neural-machin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170301619N",
archivePrefix= "arXiv",
author = "{Neubig}, G.",
eprint = "1703.01619",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Statistics - Machine Learning",
month = mar,
primaryClass = "cs.CL",
title = "{Neural Machine Translation and Sequence-To-Sequence Models:
A Tutorial}",
year = 2017
}
@ARTICLE{wu-2016-googl-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160908144W",
archivePrefix= "arXiv",
author = "{Wu}, Y. and {Schuster}, M. and {Chen}, Z. and {Le},
Q.~V. and {Norouzi}, M. and {Macherey}, W. and {Krikun},
M. and {Cao}, Y. and {Gao}, Q. and {Macherey}, K. and
{Klingner}, J. and {Shah}, A. and {Johnson}, M. and {Liu},
X. and {Kaiser}, {\L}. and {Gouws}, S. and {Kato}, Y. and
{Kudo}, T. and {Kazawa}, H. and {Stevens}, K. and {Kurian},
G. and {Patil}, N. and {Wang}, W. and {Young}, C. and
{Smith}, J. and {Riesa}, J. and {Rudnick}, A. and {Vinyals},
O. and {Corrado}, G. and {Hughes}, M. and {Dean}, J.",
eprint = "1609.08144",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning",
month = sep,
primaryClass = "cs.CL",
title = "{Google's Neural Machine Translation System: Bridging the Gap
Between Human and Machine Translation}",
year = 2016
}
@ARTICLE{fang-2014-from-caption,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1411.4952F",
archivePrefix= "arXiv",
author = "{Fang}, H. and {Gupta}, S. and {Iandola}, F. and
{Srivastava}, R. and {Deng}, L. and {Doll{\'a}r}, P. and
{Gao}, J. and {He}, X. and {Mitchell}, M. and {Platt},
J.~C. and {Zitnick}, C.~L. and {Zweig}, G.",
eprint = "1411.4952",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Computation and Language",
month = nov,
primaryClass = "cs.CV",
title = "{From Captions To Visual Concepts and Back}",
year = 2014
}
@ARTICLE{ranzato-2015-mixer,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106732R",
archivePrefix= "arXiv",
author = "{Ranzato}, M. and {Chopra}, S. and {Auli}, M. and {Zaremba},
W.",
eprint = "1511.06732",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = nov,
primaryClass = "cs.LG",
title = "{Sequence Level Training With Recurrent Neural Networks}",
year = 2015
}
@ARTICLE{graves-2012-sequen-trans,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1211.3711G",
archivePrefix= "arXiv",
author = "{Graves}, A.",
eprint = "1211.3711",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Learning, Statistics - Machine Learning",
month = nov,
title = "{Sequence Transduction With Recurrent Neural Networks}",
year = 2012
}
@ARTICLE{zhang-2017-towar-end,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170102720Z",
archivePrefix= "arXiv",
  author = "{Zhang}, Y. and {Pezeshki}, M. and {Brakel}, P. and {Zhang},
            S. and {Laurent}, C. and {Bengio}, Y. and {Courville}, A.",
eprint = "1701.02720",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Statistics - Machine Learning",
month = jan,
primaryClass = "cs.CL",
title = "{Towards End-To-End Speech Recognition With Deep
Convolutional Neural Networks}",
year = 2017
}
@ARTICLE{bengio-2012-pract-recom,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1206.5533B",
archivePrefix= "arXiv",
author = "{Bengio}, Y.",
eprint = "1206.5533",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = jun,
primaryClass = "cs.LG",
  title = "{Practical Recommendations for Gradient-Based Training of
           Deep Architectures}",
year = 2012
}
@ARTICLE{pascanu-2012-diffic-train,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1211.5063P",
archivePrefix= "arXiv",
author = "{Pascanu}, R. and {Mikolov}, T. and {Bengio}, Y.",
eprint = "1211.5063",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = nov,
primaryClass = "cs.LG",
title = "{On the Difficulty of Training Recurrent Neural Networks}",
year = 2012
}
@ARTICLE{yosinski-2015-under-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150606579Y",
archivePrefix= "arXiv",
author = "{Yosinski}, J. and {Clune}, J. and {Nguyen}, A. and {Fuchs},
T. and {Lipson}, H.",
eprint = "1506.06579",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = jun,
primaryClass = "cs.CV",
title = "{Understanding Neural Networks Through Deep Visualization}",
year = 2015
}
@ARTICLE{vaswani-2017-transformer,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170603762V",
archivePrefix= "arXiv",
author = "{Vaswani}, A. and {Shazeer}, N. and {Parmar}, N. and
{Uszkoreit}, J. and {Jones}, L. and {Gomez}, A.~N. and
{Kaiser}, L. and {Polosukhin}, I.",
eprint = "1706.03762",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jun,
primaryClass = "cs.CL",
title = "{Attention Is All You Need}",
year = 2017
}
@ARTICLE{semeniuta-2016-recurrent-dropout,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160305118S",
archivePrefix= "arXiv",
author = "{Semeniuta}, S. and {Severyn}, A. and {Barth}, E.",
eprint = "1603.05118",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = mar,
primaryClass = "cs.CL",
title = "{Recurrent Dropout Without Memory Loss}",
year = 2016
}
@ARTICLE{pascanu-2013-how-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.6026P",
archivePrefix= "arXiv",
author = "{Pascanu}, R. and {Gulcehre}, C. and {Cho}, K. and {Bengio},
Y.",
eprint = "1312.6026",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Learning, Statistics - Machine Learning",
month = dec,
title = "{How To Construct Deep Recurrent Neural Networks}",
year = 2013
}
@ARTICLE{luong-2014-addres-rare,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.8206L",
archivePrefix= "arXiv",
author = "{Luong}, M.-T. and {Sutskever}, I. and {Le}, Q.~V. and
{Vinyals}, O. and {Zaremba}, W.",
eprint = "1410.8206",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing",
month = oct,
primaryClass = "cs.CL",
title = "{Addressing the Rare Word Problem in Neural Machine
Translation}",
year = 2014
}
@ARTICLE{luo-2017-cosin-normal,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170205870L",
archivePrefix= "arXiv",
author = "{Luo}, C. and {Zhan}, J. and {Wang}, L. and {Yang}, Q.",
eprint = "1702.05870",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Artificial
Intelligence, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.LG",
title = "{Cosine Normalization: Using Cosine Similarity Instead of Dot
Product in Neural Networks}",
year = 2017
}
@ARTICLE{kaiser-2017-one-model,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170605137K",
archivePrefix= "arXiv",
author = "{Kaiser}, L. and {Gomez}, A.~N. and {Shazeer}, N. and
{Vaswani}, A. and {Parmar}, N. and {Jones}, L. and
{Uszkoreit}, J.",
eprint = "1706.05137",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Statistics - Machine Learning",
month = jun,
primaryClass = "cs.LG",
title = "{One Model To Learn Them All}",
year = 2017
}
@ARTICLE{nguyen-2014-deep-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.1897N",
archivePrefix= "arXiv",
author = "{Nguyen}, A. and {Yosinski}, J. and {Clune}, J.",
eprint = "1412.1897",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Artificial Intelligence, Computer Science
- Neural and Evolutionary Computing",
month = dec,
primaryClass = "cs.CV",
title = "{Deep Neural Networks Are Easily Fooled: High Confidence
Predictions for Unrecognizable Images}",
year = 2014
}
@ARTICLE{press-2016-using-output,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160805859P",
archivePrefix= "arXiv",
author = "{Press}, O. and {Wolf}, L.",
eprint = "1608.05859",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL",
title = "{Using the Output Embedding To Improve Language Models}",
year = 2016
}
@misc{hochreiter-2001-gradient-flow,
title = "Gradient flow in recurrent nets: the difficulty of learning
long-term dependencies",
author = "Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo and
Schmidhuber, J{\"u}rgen and others",
year = 2001,
publisher = "A field guide to dynamical recurrent neural networks. IEEE
Press"
}
@ARTICLE{szegedy-2016-incep-v4,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160207261S",
archivePrefix= "arXiv",
author = "{Szegedy}, C. and {Ioffe}, S. and {Vanhoucke}, V. and
{Alemi}, A.",
eprint = "1602.07261",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = feb,
primaryClass = "cs.CV",
title = "{Inception-V4, Inception-Resnet and the Impact of Residual
Connections on Learning}",
year = 2016
}
@ARTICLE{lin-2017-struc-self,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170303130L",
archivePrefix= "arXiv",
author = "{Lin}, Z. and {Feng}, M. and {Nogueira dos Santos}, C. and
{Yu}, M. and {Xiang}, B. and {Zhou}, B. and {Bengio}, Y.",
eprint = "1703.03130",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing",
month = mar,
primaryClass = "cs.CL",
title = "{A Structured Self-Attentive Sentence Embedding}",
year = 2017
}
@ARTICLE{memisevic-2011-learn-to-relat-images,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2011arXiv1110.0107M",
archivePrefix= "arXiv",
author = "{Memisevic}, R.",
eprint = "1110.0107",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Artificial Intelligence, Nonlinear
Sciences - Adaptation and Self-Organizing Systems, Statistics
- Machine Learning",
month = oct,
primaryClass = "cs.CV",
title = "{Learning To Relate Images: Mapping Units, Complex Cells and
Simultaneous eigenspaces}",
year = 2011
}
@ARTICLE{cheng-2016-long-short,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160106733C",
archivePrefix= "arXiv",
author = "{Cheng}, J. and {Dong}, L. and {Lapata}, M.",
eprint = "1601.06733",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = jan,
primaryClass = "cs.CL",
title = "{Long Short-Term Memory-Networks for Machine Reading}",
year = 2016
}
@ARTICLE{paulus-2017-deep-reinf,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170504304P",
archivePrefix= "arXiv",
author = "{Paulus}, R. and {Xiong}, C. and {Socher}, R.",
eprint = "1705.04304",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = may,
primaryClass = "cs.CL",
title = "{A Deep Reinforced Model for Abstractive Summarization}",
year = 2017
}
@ARTICLE{shen-2016-reason,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160905284S",
archivePrefix= "arXiv",
author = "{Shen}, Y. and {Huang}, P.-S. and {Gao}, J. and {Chen}, W.",
eprint = "1609.05284",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = sep,
primaryClass = "cs.LG",
title = "{ReasoNet: Learning To Stop Reading in Machine
Comprehension}",
year = 2016
}
@ARTICLE{golub-2017-two-stage,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170609789G",
archivePrefix= "arXiv",
author = "{Golub}, D. and {Huang}, P.-S. and {He}, X. and {Deng}, L.",
eprint = "1706.09789",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL",
title = "{Two-Stage Synthesis Networks for Transfer Learning in
Machine Comprehension}",
year = 2017
}
@ARTICLE{miller-2016-key-value,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160603126M",
archivePrefix= "arXiv",
author = "{Miller}, A. and {Fisch}, A. and {Dodge}, J. and {Karimi},
A.-H. and {Bordes}, A. and {Weston}, J.",
eprint = "1606.03126",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL",
title = "{Key-Value Memory Networks for Directly Reading Documents}",
year = 2016
}
@ARTICLE{zhang-2016-quest-answer,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160600979Z",
archivePrefix= "arXiv",
author = "{Zhang}, Y. and {Liu}, K. and {He}, S. and {Ji}, G. and
{Liu}, Z. and {Wu}, H. and {Zhao}, J.",
eprint = "1606.00979",
journal = "ArXiv e-prints",
keywords = "Computer Science - Information Retrieval, Computer Science -
Artificial Intelligence, Computer Science - Computation and
Language, Computer Science - Neural and Evolutionary
Computing",
month = jun,
primaryClass = "cs.IR",
title = "{Question Answering Over Knowledge Base With Neural Attention
Combining Global Knowledge Information}",
year = 2016
}
@ARTICLE{nguyen-2016-ms-marco,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161109268N",
archivePrefix= "arXiv",
author = "{Nguyen}, T. and {Rosenberg}, M. and {Song}, X. and {Gao},
J. and {Tiwary}, S. and {Majumder}, R. and {Deng}, L.",
eprint = "1611.09268",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Information Retrieval",
month = nov,
primaryClass = "cs.CL",
title = "{MS Marco: A Human Generated Machine Reading Comprehension
Dataset}",
year = 2016
}
@ARTICLE{zhang-2017-inter-convol-neural-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171000935Z",
archivePrefix= "arXiv",
author = "{Zhang}, Q. and {Nian Wu}, Y. and {Zhu}, S.-C.",
eprint = "1710.00935",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = oct,
primaryClass = "cs.CV",
title = "{Interpretable Convolutional Neural Networks}",
year = 2017
}
@ARTICLE{mnih-2012-fast-simpl,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1206.6426M",
archivePrefix= "arXiv",
author = "{Mnih}, A. and {Whye Teh}, Y.",
eprint = "1206.6426",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jun,
primaryClass = "cs.CL",
title = "{A Fast and Simple Algorithm for Training Neural
Probabilistic Language Models}",
year = 2012
}
@ARTICLE{pagliardini-2017-unsup-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170302507P",
archivePrefix= "arXiv",
author = "{Pagliardini}, M. and {Gupta}, P. and {Jaggi}, M.",
eprint = "1703.02507",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Information Retrieval, I.2.7",
month = mar,
primaryClass = "cs.CL",
title = "{Unsupervised Learning of Sentence Embeddings Using
Compositional N-Gram Features}",
year = 2017
}
@ARTICLE{palangi-2015-deep-senten,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150206922P",
archivePrefix= "arXiv",
author = "{Palangi}, H. and {Deng}, L. and {Shen}, Y. and {Gao}, J. and
{He}, X. and {Chen}, J. and {Song}, X. and {Ward}, R.",
eprint = "1502.06922",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Information Retrieval, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing",
month = feb,
primaryClass = "cs.CL",
title = "{Deep Sentence Embedding Using Long Short-Term Memory
Networks: Analysis and Application To Information Retrieval}",
year = 2015
}
@ARTICLE{maillard-2017-joint-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170509189M",
archivePrefix= "arXiv",
author = "{Maillard}, J. and {Clark}, S. and {Yogatama}, D.",
eprint = "1705.09189",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = may,
primaryClass = "cs.CL",
title = "{Jointly Learning Sentence Embeddings and Syntax With
Unsupervised Tree-LSTMs}",
year = 2017
}
@ARTICLE{dai-2015-semi-super-sequen-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151101432D",
archivePrefix= "arXiv",
author = "{Dai}, A.~M. and {Le}, Q.~V.",
eprint = "1511.01432",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = nov,
primaryClass = "cs.LG",
title = "{Semi-Supervised Sequence Learning}",
year = 2015
}
@ARTICLE{luong-2015-multi-task-seq2seq,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106114L",
archivePrefix= "arXiv",
author = "{Luong}, M.-T. and {Le}, Q.~V. and {Sutskever}, I. and
{Vinyals}, O. and {Kaiser}, L.",
eprint = "1511.06114",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Statistics - Machine Learning",
month = nov,
primaryClass = "cs.LG",
title = "{Multi-Task Sequence To Sequence Learning}",
year = 2015
}
@ARTICLE{li-2015-hierar-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150601057L",
archivePrefix= "arXiv",
author = "{Li}, J. and {Luong}, M.-T. and {Jurafsky}, D.",
eprint = "1506.01057",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL",
title = "{A Hierarchical Neural Autoencoder for Paragraphs and
Documents}",
year = 2015
}
@ARTICLE{hill-2016-learn-distr,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160203483H",
archivePrefix= "arXiv",
author = "{Hill}, F. and {Cho}, K. and {Korhonen}, A.",
eprint = "1602.03483",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = feb,
primaryClass = "cs.CL",
title = "{Learning Distributed Representations of Sentences From
Unlabelled Data}",
year = 2016
}
@ARTICLE{wieting-2015-towar-univer,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151108198W",
archivePrefix= "arXiv",
author = "{Wieting}, J. and {Bansal}, M. and {Gimpel}, K. and
{Livescu}, K.",
eprint = "1511.08198",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = nov,
primaryClass = "cs.CL",
title = "{Towards Universal Paraphrastic Sentence Embeddings}",
year = 2015
}
@ARTICLE{agrawal-2015-vqa,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150500468A",
archivePrefix= "arXiv",
author = "{Agrawal}, A. and {Lu}, J. and {Antol}, S. and {Mitchell},
M. and {Zitnick}, C.~L. and {Batra}, D. and {Parikh}, D.",
eprint = "1505.00468",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Computer Vision and Pattern Recognition",
month = may,
primaryClass = "cs.CL",
title = "{VQA: Visual Question Answering}",
year = 2015
}
@ARTICLE{zhang-2015-yin-yang,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151105099Z",
archivePrefix= "arXiv",
author = "{Zhang}, P. and {Goyal}, Y. and {Summers-Stay}, D. and
{Batra}, D. and {Parikh}, D.",
eprint = "1511.05099",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Computer Vision and Pattern Recognition, Computer Science - Learning",
month = nov,
primaryClass = "cs.CL",
title = "{Yin and Yang: Balancing and Answering Binary Visual
Questions}",
year = 2015
}
@ARTICLE{goyal-2016-makin-v-vqa-matter,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161200837G",
archivePrefix= "arXiv",
author = "{Goyal}, Y. and {Khot}, T. and {Summers-Stay}, D. and
{Batra}, D. and {Parikh}, D.",
eprint = "1612.00837",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Artificial Intelligence, Computer Science
- Computation and Language, Computer Science - Learning",
month = dec,
primaryClass = "cs.CV",
title = "{Making the V in Vqa Matter: Elevating the Role of Image
Understanding in Visual Question Answering}",
year = 2016
}
@ARTICLE{bowman-2015-gener-senten,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106349B",
archivePrefix= "arXiv",
author = "{Bowman}, S.~R. and {Vilnis}, L. and {Vinyals}, O. and {Dai},
A.~M. and {Jozefowicz}, R. and {Bengio}, S.",
eprint = "1511.06349",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = nov,
primaryClass = "cs.LG",
title = "{Generating Sentences From a Continuous Space}",
year = 2015
}
@inproceedings{maas-2011-learning-word,
title = "Learning word vectors for sentiment analysis",
author = "Maas, Andrew L and Daly, Raymond E and Pham, Peter T and
Huang, Dan and Ng, Andrew Y and Potts, Christopher",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies-Volume
1",
pages = "142-150",
year = 2011,
organization = "Association for Computational Linguistics"
}
@inproceedings{ganitkevitch-2013-ppdb,
title = "{PPDB}: The Paraphrase Database",
author = "Ganitkevitch, Juri and {Van Durme}, Benjamin and
Callison-Burch, Chris",
booktitle = "Proceedings of NAACL-HLT",
pages = "758-764",
month = "June",
year = 2013,
address = "Atlanta, Georgia",
publisher = "Association for Computational Linguistics",
url = "http://cs.jhu.edu/~ccb/publications/ppdb.pdf"
}
@ARTICLE{mrk-2016-count-fittin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160300892M",
archivePrefix= "arXiv",
author = "{Mrk{\v s}i{\'c}}, N. and {S{\'e}aghdha}, D.~{\'O} and
{Thomson}, B. and {Ga{\v s}i{\'c}}, M. and {Rojas-Barahona},
L. and {Su}, P.-H. and {Vandyke}, D. and {Wen}, T.-H. and
{Young}, S.",
eprint = "1603.00892",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = mar,
primaryClass = "cs.CL",
title = "{Counter-Fitting Word Vectors To Linguistic Constraints}",
year = 2016
}
@ARTICLE{hill-2014-simlex,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1408.3456H",
archivePrefix= "arXiv",
author = "{Hill}, F. and {Reichart}, R. and {Korhonen}, A.",
eprint = "1408.3456",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL",
title = "{SimLex-999: Evaluating Semantic Models With (Genuine)
Similarity Estimation}",
year = 2014
}
@inproceedings{agirre-2009-study-similarity,
title = "A study on similarity and relatedness using distributional
and wordnet-based approaches",
author = "Agirre, Eneko and Alfonseca, Enrique and Hall, Keith and
Kravalova, Jana and Pa{\c{s}}ca, Marius and Soroa, Aitor",
booktitle = "Proceedings of Human Language Technologies: The 2009 Annual
Conference of the North American Chapter of the Association
for Computational Linguistics",
pages = "19-27",
year = 2009,
organization = "Association for Computational Linguistics"
}
@article{marelli-2014-sick-cure,
title = {A SICK cure for the evaluation of compositional distributional
semantic models},
author = {Marelli, M and Menini, S and Baroni, M and Bentivogli, L and
Bernardi, R and Zamparelli, R},
year = 2014,
publisher    = {Citeseer}
}
@inproceedings{severyn-2015-learning-rank,
title = "Learning to rank short text pairs with convolutional deep
neural networks",
author = "Severyn, Aliaksei and Moschitti, Alessandro",
booktitle = "Proceedings of the 38th International ACM SIGIR Conference on
Research and Development in Information Retrieval",
pages = "373-382",
year = 2015,
organization = "ACM"
}
@inproceedings{huang-2016-supervised-word,
Author = "Huang, Gao and Guo, Chuan and Kusner, Matt J and Sun, Yu and
Sha, Fei and Weinberger, Kilian Q",
Booktitle = "Advances in Neural Information Processing Systems 29",
Editor = "D. D. Lee and M. Sugiyama and U. V. Luxburg and I. Guyon and
R. Garnett",
Pages = "4862-4870",
Publisher = "Curran Associates, Inc.",
Title = "Supervised Word Mover\textquotesingle s Distance",
Url =
"http://papers.nips.cc/paper/6139-supervised-word-movers-distance.pdf",
Year = 2016,
Bdsk-Url-1 =
"http://papers.nips.cc/paper/6139-supervised-word-movers-distance.pdf"
}
@ARTICLE{sennrich-2015-neural-machin,
author = "{Sennrich}, R. and {Haddow}, B. and {Birch}, A.",
title = "{Neural Machine Translation of Rare Words With Subword
Units}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150807909S",
archivePrefix= "arXiv",
eprint = "1508.07909",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL"
}
@ARTICLE{ling-2015-findin-funct-form,
author = "{Ling}, W. and {Lu{\'{\i}}s}, T. and {Marujo}, L. and
{Fernandez Astudillo}, R. and {Amir}, S. and {Dyer}, C. and
{Black}, A.~W. and {Trancoso}, I.",
title = "{Finding Function in Form: Compositional Character Models for
Open Vocabulary Word Representation}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150802096L",
archivePrefix= "arXiv",
eprint = "1508.02096",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL"
}
@inproceedings{kim-2016-char-aware,
title = "Character-Aware Neural Language Models.",
author = "Kim, Yoon and Jernite, Yacine and Sontag, David and Rush,
Alexander M",
booktitle = "AAAI",
pages = "2741-2749",
year = 2016
}
@article{achananuparp-2008-evaluation-sentence,
title = "The evaluation of sentence similarity measures",
author = "Achananuparp, Palakorn and Hu, Xiaohua and Shen, Xiajiong",
journal = "Data warehousing and knowledge discovery",
pages = "305-316",
year = 2008,
publisher = "Springer"
}
@ARTICLE{bradbury-2016-quasi-recur-neural-networ,
author = "{Bradbury}, J. and {Merity}, S. and {Xiong}, C. and {Socher},
R.",
title = "{Quasi-Recurrent Neural Networks}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161101576B",
archivePrefix= "arXiv",
eprint = "1611.01576",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Artificial Intelligence, Computer Science
- Computation and Language, Computer Science - Learning",
month = nov
}
@ARTICLE{ballesteros-2015-improv-trans,
author = "{Ballesteros}, M. and {Dyer}, C. and {Smith}, N.~A.",
title = "{Improved Transition-Based Parsing By Modeling Characters
Instead of Words With LSTMs}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150800657B",
archivePrefix= "arXiv",
eprint = "1508.00657",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL"
}
@ARTICLE{wiseman-2016-sequen-to,
author = "{Wiseman}, S. and {Rush}, A.~M.",
title = "{Sequence-To-Sequence Learning As Beam-Search Optimization}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160602960W",
archivePrefix= "arXiv",
eprint = "1606.02960",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing, Statistics
- Machine Learning",
month = jun,
primaryClass = "cs.CL"
}
@ARTICLE{snoek-2012-pract-bayes,
author = "{Snoek}, J. and {Larochelle}, H. and {Adams}, R.~P.",
title = "{Practical Bayesian Optimization of Machine Learning
Algorithms}",
journal = "ArXiv e-prints",
year = 2012,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1206.2944S",
archivePrefix= "arXiv",
eprint = "1206.2944",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = jun,
primaryClass = "stat.ML"
}
@article{hashimoto-2016-word-embed,
title = "Word embeddings as metric recovery in semantic spaces",
author = "Hashimoto, Tatsunori B and Alvarez-Melis, David and Jaakkola,
Tommi S",
journal = "Transactions of the Association for Computational
Linguistics",
volume = 4,
pages = "273-286",
year = 2016
}
@inproceedings{mnih-2007-three-graph,
title = "Three new graphical models for statistical language
modelling",
author = "Mnih, Andriy and Hinton, Geoffrey",
booktitle = "Proceedings of the 24th international conference on Machine
learning",
pages = "641-648",
year = 2007,
organization = "ACM"
}
@ARTICLE{chawla-2011-smote,
author = "{Chawla}, N.~V. and {Bowyer}, K.~W. and {Hall}, L.~O. and
{Kegelmeyer}, W.~P.",
title = "{SMOTE: Synthetic Minority Over-Sampling Technique}",
journal = "ArXiv e-prints",
year = 2011,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2011arXiv1106.1813C",
archivePrefix= "arXiv",
eprint = "1106.1813",
keywords = "Computer Science - Artificial Intelligence",
month = jun,
primaryClass = "cs.AI"
}
@inproceedings{klein-2001-parsing-treebank,
title = "Parsing with treebank grammars: Empirical bounds, theoretical
models, and the structure of the Penn treebank",
author = "Klein, Dan and Manning, Christopher D",
booktitle = "Proceedings of the 39th Annual Meeting on Association for
Computational Linguistics",
pages = "338-345",
year = 2001,
organization = "Association for Computational Linguistics"
}
@article{collins-2003-head-driven,
title = "Head-driven statistical models for natural language parsing",
author = "Collins, Michael",
journal = "Computational linguistics",
volume = 29,
number = 4,
pages = "589-637",
year = 2003,
publisher = "MIT Press"
}
@inproceedings{collins-1997-three-generative,
title = "Three generative, lexicalised models for statistical parsing",
author = "Collins, Michael",
booktitle = "Proceedings of the eighth conference on European chapter of
the Association for Computational Linguistics",
pages = "16-23",
year = 1997,
organization = "Association for Computational Linguistics"
}
@inproceedings{bikel-2004-distributional-analysis,
title = "A Distributional Analysis of a Lexicalized Statistical
Parsing Mode.",
author = "Bikel, Daniel M",
booktitle = "EMNLP",
pages = "182-189",
year = 2004
}
@inproceedings{chen-2014-fast-acc,
title = "A fast and accurate dependency parser using neural networks",
author = "Chen, Danqi and Manning, Christopher",
booktitle = "Proceedings of the 2014 conference on empirical methods in
natural language processing (EMNLP)",
pages = "740-750",
year = 2014
}
@inproceedings{socher-2013-su-rnn,
title = "Parsing with compositional vector grammars",
author = "Socher, Richard and Bauer, John and Manning, Christopher D
and others",
booktitle = "Proceedings of the 51st Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "455-465",
year = 2013
}
@inproceedings{levy-2003-harder-parse,
title = "Is it harder to parse Chinese, or the Chinese Treebank?",
author = "Levy, Roger and Manning, Christopher",
booktitle = "Proceedings of the 41st Annual Meeting on Association for
Computational Linguistics-Volume 1",
pages = "439-446",
year = 2003,
organization = "Association for Computational Linguistics"
}
@inproceedings{chang-2009-discriminative-reorder,
title = "Discriminative reordering with Chinese grammatical relations
features",
author = "Chang, Pi-Chuan and Tseng, Huihsin and Jurafsky, Dan and
Manning, Christopher D",
booktitle = "Proceedings of the Third Workshop on Syntax and Structure in
Statistical Translation",
pages = "51-59",
year = 2009,
organization = "Association for Computational Linguistics"
}
@inproceedings{zhu-2013-fast-acc,
title = "Fast and Accurate Shift-Reduce Constituent Parsing.",
author = "Zhu, Muhua and Zhang, Yue and Chen, Wenliang and Zhang, Min
and Zhu, Jingbo",
booktitle = "ACL (1)",
pages = "434-443",
year = 2013
}
@inproceedings{klein-2003-accurate-unlex,
title = "Accurate unlexicalized parsing",
author = "Klein, Dan and Manning, Christopher D",
booktitle = "Proceedings of the 41st Annual Meeting on Association for
Computational Linguistics-Volume 1",
pages = "423-430",
year = 2003,
organization = "Association for Computational Linguistics"
}
@inproceedings{klein-2003-fast-exact,
title = "Fast exact inference with a factored model for natural
language parsing",
author = "Klein, Dan and Manning, Christopher D",
booktitle = "Advances in neural information processing systems",
pages = "3-10",
year = 2003
}
@inproceedings{nivre-2016-universal-depend,
title = "Universal Dependencies v1: A Multilingual Treebank
Collection.",
author = "Nivre, Joakim and de Marneffe, Marie-Catherine and Ginter,
Filip and Goldberg, Yoav and Hajic, Jan and Manning,
Christopher D and McDonald, Ryan T and Petrov, Slav and
Pyysalo, Sampo and Silveira, Natalia and others",
booktitle = "LREC",
year = 2016
}
@inproceedings{de-2006-generating-typed,
title = "Generating typed dependency parses from phrase structure
parses",
author = "De Marneffe, Marie-Catherine and MacCartney, Bill and
Manning, Christopher D and others",
booktitle = "Proceedings of LREC",
volume = 6,
number = 2006,
pages = "449-454",
year = 2006,
organization = "Genoa Italy"
}
@ARTICLE{Krotov-1999-compact-penn,
author = "{Krotov}, A. and {Hepple}, M. and {Gaizauskas}, R. and
{Wilks}, Y.",
title = "{Compacting the Penn Treebank Grammar}",
journal = "eprint arXiv:cs/9902001",
eprint = "cs/9902001",
keywords = "Computer Science - Computation and Language, I.2.7",
year = 1999,
month = jan,
adsurl = "http://adsabs.harvard.edu/abs/1999cs........2001K",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{toutanova-2000-enriching-knowledge,
title = "Enriching the knowledge sources used in a maximum entropy
part-of-speech tagger",
author = "Toutanova, Kristina and Manning, Christopher D",
booktitle = "Proceedings of the 2000 Joint SIGDAT conference on Empirical
methods in natural language processing and very large
corpora: held in conjunction with the 38th Annual Meeting of
the Association for Computational Linguistics-Volume 13",
pages = "63-70",
year = 2000,
organization = "Association for Computational Linguistics"
}
@inproceedings{toutanova-2003-feature-rich,
title = "Feature-rich part-of-speech tagging with a cyclic dependency
network",
author = "Toutanova, Kristina and Klein, Dan and Manning, Christopher D
and Singer, Yoram",
booktitle = "Proceedings of the 2003 Conference of the North American
Chapter of the Association for Computational Linguistics on
Human Language Technology-Volume 1",
pages = "173-180",
year = 2003,
organization = "Association for Computational Linguistics"
}
@ARTICLE{chen-2016-thoroug-examin,
author = "{Chen}, D. and {Bolton}, J. and {Manning}, C.~D.",
title = "{A Thorough Examination of the Cnn/daily Mail Reading
Comprehension Task}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160602858C",
archivePrefix= "arXiv",
eprint = "1606.02858",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence",
month = jun,
primaryClass = "cs.CL"
}
@ARTICLE{dhingra-2016-gated-atten,
author = "{Dhingra}, B. and {Liu}, H. and {Yang}, Z. and {Cohen},
W.~W. and {Salakhutdinov}, R.",
title = "{Gated-Attention Readers for Text Comprehension}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160601549D",
archivePrefix= "arXiv",
eprint = "1606.01549",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jun,
primaryClass = "cs.CL"
}
@ARTICLE{kadlec-2016-text-under,
author = "{Kadlec}, R. and {Schmid}, M. and {Bajgar}, O. and
{Kleindienst}, J.",
title = "{Text Understanding With the Attention Sum Reader Network}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160301547K",
archivePrefix= "arXiv",
eprint = "1603.01547",
keywords = "Computer Science - Computation and Language",
month = mar,
primaryClass = "cs.CL"
}
@ARTICLE{tseng-2016-towar-machin,
author = "{Tseng}, B.-H. and {Shen}, S.-S. and {Lee}, H.-Y. and {Lee},
L.-S.",
title = "{Towards Machine Comprehension of Spoken Content: Initial
Toefl Listening Comprehension Test By Machine}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160806378T",
archivePrefix= "arXiv",
eprint = "1608.06378",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL"
}
@ARTICLE{cui-2016-consen-atten,
author = "{Cui}, Y. and {Liu}, T. and {Chen}, Z. and {Wang}, S. and
{Hu}, G.",
title = "{Consensus Attention-Based Neural Networks for Chinese
Reading Comprehension}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160702250C",
archivePrefix= "arXiv",
eprint = "1607.02250",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = jul,
primaryClass = "cs.CL"
}
@ARTICLE{cui-2016-atten-over,
author = "{Cui}, Y. and {Chen}, Z. and {Wei}, S. and {Wang}, S. and
{Liu}, T. and {Hu}, G.",
title = "{Attention-Over-Attention Neural Networks for Reading
Comprehension}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160704423C",
archivePrefix= "arXiv",
eprint = "1607.04423",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = jul,
primaryClass = "cs.CL"
}
@ARTICLE{wang-2016-machin-compr,
author = "{Wang}, S. and {Jiang}, J.",
title = "{Machine Comprehension Using Match-Lstm and Answer Pointer}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160807905W",
archivePrefix= "arXiv",
eprint = "1608.07905",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence",
month = aug,
primaryClass = "cs.CL"
}
@ARTICLE{sordoni-2016-iterat-alter,
author = "{Sordoni}, A. and {Bachman}, P. and {Trischler}, A. and
{Bengio}, Y.",
title = "{Iterative Alternating Neural Attention for Machine Reading}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160602245S",
archivePrefix= "arXiv",
eprint = "1606.02245",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = jun,
primaryClass = "cs.CL"
}
@inproceedings{kobayashi-2016-dynamic-entity,
title = "Dynamic Entity Representation with Max-pooling Improves
Machine Reading",
author = "Kobayashi, Sosuke and Tian, Ran and Okazaki, Naoaki and Inui,
Kentaro",
booktitle = "Proceedings of NAACL-HLT",
pages = "850-855",
year = 2016
}
@ARTICLE{trischler-2016-natur-languag,
author = "{Trischler}, A. and {Ye}, Z. and {Yuan}, X. and {Suleman},
K.",
title = "{Natural Language Comprehension With the EpiReader}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160602270T",
archivePrefix= "arXiv",
eprint = "1606.02270",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL"
}
@ARTICLE{creswell-2017-gener-adver-networ,
author = "{Creswell}, A. and {White}, T. and {Dumoulin}, V. and
{Arulkumaran}, K. and {Sengupta}, B. and {Bharath}, A.~A",
title = "{Generative Adversarial Networks: An Overview}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171007035C",
archivePrefix= "arXiv",
eprint = "1710.07035",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = oct,
primaryClass = "cs.CV"
}
@ARTICLE{weston-2014-memor-networ,
author = "{Weston}, J. and {Chopra}, S. and {Bordes}, A.",
title = "{Memory Networks}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.3916W",
archivePrefix= "arXiv",
eprint = "1410.3916",
keywords = "Computer Science - Artificial Intelligence, Computer Science
- Computation and Language, Statistics - Machine Learning",
month = oct,
primaryClass = "cs.AI"
}
@ARTICLE{munkhdalai-2016-neural-seman-encod,
author = "{Munkhdalai}, T. and {Yu}, H.",
title = "{Neural Semantic Encoders}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160704315M",
archivePrefix= "arXiv",
eprint = "1607.04315",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Statistics - Machine Learning",
month = jul,
primaryClass = "cs.LG"
}
@ARTICLE{nickel-2017-poinc-embed,
author = "{Nickel}, M. and {Kiela}, D.",
title = "{Poincar$\backslash$'e Embeddings for Learning Hierarchical
Representations}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170508039N",
archivePrefix= "arXiv",
eprint = "1705.08039",
keywords = "Computer Science - Artificial Intelligence, Computer Science
- Learning, Statistics - Machine Learning",
month = may,
primaryClass = "cs.AI"
}
@ARTICLE{weston-2015-towar-ai,
author = "{Weston}, J. and {Bordes}, A. and {Chopra}, S. and {Rush},
A.~M. and {van Merri{\"e}nboer}, B. and {Joulin}, A. and
{Mikolov}, T.",
title = "{Towards Ai-Complete Question Answering: A Set of
Prerequisite Toy Tasks}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150205698W",
archivePrefix= "arXiv",
eprint = "1502.05698",
keywords = "Computer Science - Artificial Intelligence, Computer Science
- Computation and Language, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.AI"
}
@ARTICLE{sabour-2017-dynam-routin-between-capsul,
author = "{Sabour}, S. and {Frosst}, N. and {E Hinton}, G.",
title = "{Dynamic Routing Between Capsules}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171009829S",
archivePrefix= "arXiv",
eprint = "1710.09829",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = oct,
primaryClass = "cs.CV"
}
@ARTICLE{lu-2017-depth-creat,
author = "{Lu}, H. and {Kawaguchi}, K.",
title = "{Depth Creates No Bad Local Minima}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170208580L",
archivePrefix= "arXiv",
eprint = "1702.08580",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing, Mathematics - Optimization and
Control, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.LG"
}
@ARTICLE{kawaguchi-2017-gener-deep-learn,
author = "{Kawaguchi}, K. and {Pack Kaelbling}, L. and {Bengio}, Y.",
title = "{Generalization in Deep Learning}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171005468K",
archivePrefix= "arXiv",
eprint = "1710.05468",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing",
month = oct,
primaryClass = "stat.ML"
}
@article{wolpert-1997-no-free-lunch,
title = "No free lunch theorems for optimization",
author = "Wolpert, David H and Macready, William G",
journal = "IEEE transactions on evolutionary computation",
volume = 1,
number = 1,
pages = "67-82",
year = 1997,
publisher = "IEEE"
}
@inproceedings{recasens-2013-life-death,
title = "The Life and Death of Discourse Entities: Identifying
Singleton Mentions.",
author = "Recasens, Marta and de Marneffe, Marie-Catherine and Potts,
Christopher",
year = 2013,
booktitle = "''"
}
@inproceedings{lee-2011-stanford-multi-pass,
title = "Stanford's multi-pass sieve coreference resolution system at
the CoNLL-2011 shared task",
author = "Lee, Heeyoung and Peirsman, Yves and Chang, Angel and
Chambers, Nathanael and Surdeanu, Mihai and Jurafsky, Dan",
booktitle = "Proceedings of the fifteenth conference on computational
natural language learning: Shared task",
pages = "28-34",
year = 2011,
organization = "Association for Computational Linguistics"
}
@inproceedings{raghunathan-2010-multi-pass-sieve,
title = "A multi-pass sieve for coreference resolution",
author = "Raghunathan, Karthik and Lee, Heeyoung and Rangarajan,
Sudarshan and Chambers, Nathanael and Surdeanu, Mihai and
Jurafsky, Dan and Manning, Christopher",
booktitle = "Proceedings of the 2010 Conference on Empirical Methods in
Natural Language Processing",
pages = "492-501",
year = 2010,
organization = "Association for Computational Linguistics"
}
@article{lee-2013-deterministic-coreference,
title = "Deterministic coreference resolution based on entity-centric,
precision-ranked rules",
author = "Lee, Heeyoung and Chang, Angel and Peirsman, Yves and
Chambers, Nathanael and Surdeanu, Mihai and Jurafsky, Dan",
journal = "Computational Linguistics",
volume = 39,
number = 4,
pages = "885-916",
year = 2013,
publisher = "MIT Press"
}
@inproceedings{clark-2015-entity-centric,
title = "Entity-Centric Coreference Resolution with Model Stacking.",
author = "Clark, Kevin and Manning, Christopher D",
booktitle = "ACL (1)",
pages = "1405-1415",
year = 2015
}
@ARTICLE{clark-2016-rl-for-cr,
author = "{Clark}, K. and {Manning}, C.~D.",
title = "{Deep Reinforcement Learning for Mention-Ranking Coreference
Models}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160908667C",
archivePrefix= "arXiv",
eprint = "1609.08667",
keywords = "Computer Science - Computation and Language",
month = sep,
primaryClass = "cs.CL"
}
@ARTICLE{clark-2016-improv-coref,
author = "{Clark}, K. and {Manning}, C.~D.",
title = "{Improving Coreference Resolution By Learning Entity-Level
Distributed Representations}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160601323C",
archivePrefix= "arXiv",
eprint = "1606.01323",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL"
}
@InProceedings{recasens-2013-same-referent,
author = "Recasens, Marta and Can, Matthew and Jurafsky, Daniel",
title = "Same Referent, Different Words: Unsupervised Mining of Opaque
Coreferent Mentions",
booktitle = "Proceedings of the 2013 Conference of the North American
Chapter of the Association for Computational Linguistics:
Human Language Technologies",
year = 2013,
publisher = "Association for Computational Linguistics",
pages = "897-906",
location = "Atlanta, Georgia",
url =
"http://aclanthology.coli.uni-saarland.de/pdf/N/N13/N13-1110.pdf"
}
@inproceedings{lee-2012-joint-entity,
title = "Joint entity and event coreference resolution across
documents",
author = "Lee, Heeyoung and Recasens, Marta and Chang, Angel and
Surdeanu, Mihai and Jurafsky, Dan",
booktitle = "Proceedings of the 2012 Joint Conference on Empirical Methods
in Natural Language Processing and Computational Natural
Language Learning",
pages = "489-500",
year = 2012,
organization = "Association for Computational Linguistics"
}
@ARTICLE{lee-2017-end-to,
author = "{Lee}, K. and {He}, L. and {Lewis}, M. and {Zettlemoyer}, L.",
title = "{End-To-End Neural Coreference Resolution}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170707045L",
archivePrefix= "arXiv",
eprint = "1707.07045",
keywords = "Computer Science - Computation and Language",
month = jul,
primaryClass = "cs.CL"
}
@ARTICLE{radford-2017-learn-to,
author = "{Radford}, A. and {Jozefowicz}, R. and {Sutskever}, I.",
title = "{Learning To Generate Reviews and Discovering Sentiment}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170401444R",
archivePrefix= "arXiv",
eprint = "1704.01444",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Computer Science - Neural and Evolutionary
Computing",
month = apr,
primaryClass = "cs.LG"
}
@ARTICLE{felbo-2017-using-million,
author = "{Felbo}, B. and {Mislove}, A. and {S{\o}gaard}, A. and
{Rahwan}, I. and {Lehmann}, S.",
title = "{Using Millions of Emoji Occurrences To Learn Any-Domain
Representations for Detecting Sentiment, Emotion and
sarcasm}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170800524F",
archivePrefix= "arXiv",
eprint = "1708.00524",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = aug,
primaryClass = "stat.ML"
}
@PHDTHESIS{hamdan-2016-under-coupl,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016PhDT........44H",
author = "{Hamdan}, L.",
school = "West Virginia University",
title = "{Understanding Coupling of Global and Diffuse Solar Radiation
with Climatic Variability}",
year = 2016
}
@ARTICLE{zhang-2017-which-encod,
author = "{Zhang}, X. and {LeCun}, Y.",
title = "{Which Encoding Is the Best for Text Classification in
Chinese, English, Japanese and Korean?}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170802657Z",
archivePrefix= "arXiv",
eprint = "1708.02657",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = aug,
primaryClass = "cs.CL"
}
@article{liu-2012-sentiment-analysis,
title = "Sentiment analysis and opinion mining",
author = "Liu, Bing",
journal = "Synthesis lectures on human language technologies",
volume = 5,
number = 1,
pages = "1-167",
year = 2012,
publisher = "Morgan \& Claypool Publishers"
}
@article{pang-2008-opinion-mining,
title = "Opinion mining and sentiment analysis",
author = "Pang, Bo and Lee, Lillian and others",
journal = "Foundations and Trends{\textregistered} in Information
Retrieval",
volume = 2,
number = "1--2",
pages = "1-135",
year = 2008,
publisher = "Now Publishers, Inc."
}
@ARTICLE{rajpurkar-2016-squad,
author = "{Rajpurkar}, P. and {Zhang}, J. and {Lopyrev}, K. and
{Liang}, P.",
title = "{SQuAD: 100,000+ Questions for Machine Comprehension of
Text}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160605250R",
archivePrefix= "arXiv",
eprint = "1606.05250",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL"
}
@ARTICLE{miwa-2016-end-to,
author = "{Miwa}, M. and {Bansal}, M.",
title = "{End-To-End Relation Extraction Using Lstms on Sequences and
Tree Structures}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160100770M",
archivePrefix= "arXiv",
eprint = "1601.00770",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jan,
primaryClass = "cs.CL"
}
@ARTICLE{kumar-2017-survey-deep,
author = "{Kumar}, S.",
title = "{A Survey of Deep Learning Methods for Relation Extraction}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170503645K",
archivePrefix= "arXiv",
eprint = "1705.03645",
keywords = "Computer Science - Computation and Language",
month = may,
primaryClass = "cs.CL"
}
@inproceedings{lin-2016-neural-relation,
title = "Neural Relation Extraction with Selective Attention over
Instances.",
author = "Lin, Yankai and Shen, Shiqi and Liu, Zhiyuan and Luan, Huanbo
and Sun, Maosong",
booktitle = "ACL (1)",
year = 2016
}
@inproceedings{wu-2017-adversarial-train,
title = "Adversarial Training for Relation Extraction",
author = "Wu, Yi and Bamman, David and Russell, Stuart",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
pages = "1779-1784",
year = 2017
}
@ARTICLE{lei-2017-train-rnns,
author = "{Lei}, T. and {Zhang}, Y.",
title = "{Training Rnns As Fast As CNNs}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170902755L",
archivePrefix= "arXiv",
eprint = "1709.02755",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = sep,
primaryClass = "cs.CL"
}
@ARTICLE{rocktaeschel-2015-reason-about,
author = "{Rockt{\"a}schel}, T. and {Grefenstette}, E. and {Hermann},
K.~M. and {Ko{\v c}isk{\'y}}, T. and {Blunsom}, P.",
title = "{Reasoning About Entailment With Neural Attention}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150906664R",
archivePrefix= "arXiv",
eprint = "1509.06664",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing, 68T50, I.2.6, I.2.7",
month = sep,
primaryClass = "cs.CL"
}
@inproceedings{bowman-2015-large-annotated,
Author = "Bowman, Samuel R. and Angeli, Gabor and Potts, Christopher,
and Manning, Christopher D.",
Booktitle = "Proceedings of the 2015 Conference on Empirical Methods in
Natural Language Processing (EMNLP)",
Publisher = "Association for Computational Linguistics",
Title = "A large annotated corpus for learning natural language
inference",
Year = 2015
}
@ARTICLE{zolna-2017-fraternal-dropout,
author = "{Zolna}, K. and {Arpit}, D. and {Suhubdy}, D. and {Bengio},
Y.",
title = "{Fraternal Dropout}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171100066Z",
archivePrefix= "arXiv",
eprint = "1711.00066",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Learning",
month = oct,
primaryClass = "stat.ML"
}
@ARTICLE{vinyals-2015-order-matter,
author = "{Vinyals}, O. and {Bengio}, S. and {Kudlur}, M.",
title = "{Order Matters: Sequence To Sequence for sets}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106391V",
archivePrefix= "arXiv",
eprint = "1511.06391",
keywords = "Statistics - Machine Learning, Computer Science - Computation
and Language, Computer Science - Learning",
month = nov,
primaryClass = "stat.ML"
}
@ARTICLE{clauset-2007-power-law,
author = "{Clauset}, A. and {Rohilla Shalizi}, C. and {Newman},
M.~E.~J.",
title = "{Power-Law Distributions in Empirical data}",
journal = "ArXiv e-prints",
year = 2007,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2007arXiv0706.1062C",
archivePrefix= "arXiv",
eprint = "0706.1062",
keywords = "Physics - Data Analysis, Statistics and Probability,
Condensed Matter - Disordered Systems and Neural Networks,
Statistics - Applications, Statistics - Methodology",
month = jun,
primaryClass = "physics.data-an"
}
@ARTICLE{gu-2016-incor-copyin,
author = "{Gu}, J. and {Lu}, Z. and {Li}, H. and {Li}, V.~O.~K.",
title = "{Incorporating Copying Mechanism in Sequence-To-Sequence
Learning}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160306393G",
archivePrefix= "arXiv",
eprint = "1603.06393",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing",
month = mar,
primaryClass = "cs.CL"
}
@ARTICLE{see-2017-get-to-point,
author = "{See}, A. and {Liu}, P.~J. and {Manning}, C.~D.",
title = "{Get To The Point: Summarization With Pointer-Generator
Networks}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170404368S",
archivePrefix= "arXiv",
eprint = "1704.04368",
keywords = "Computer Science - Computation and Language",
month = apr,
primaryClass = "cs.CL"
}
@inproceedings{he-2017-generating-natural,
title = "Generating natural answers by incorporating copying and
retrieving mechanisms in sequence-to-sequence learning",
author = "He, Shizhu and Liu, Cao and Liu, Kang and Zhao, Jun",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "199-208",
year = 2017
}
@ARTICLE{wang-2015-learn-natur,
author = "{Wang}, S. and {Jiang}, J.",
title = "{Learning Natural Language Inference With LSTM}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151208849W",
archivePrefix= "arXiv",
eprint = "1512.08849",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Neural and Evolutionary
Computing",
month = dec,
primaryClass = "cs.CL"
}
@ARTICLE{yu-2016-seqgan,
author = "{Yu}, L. and {Zhang}, W. and {Wang}, J. and {Yu}, Y.",
title = "{SeqGAN: Sequence Generative Adversarial Nets With Policy
Gradient}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160905473Y",
archivePrefix= "arXiv",
eprint = "1609.05473",
keywords = "Computer Science - Learning, Computer Science - Artificial
Intelligence",
month = sep,
primaryClass = "cs.LG"
}
@ARTICLE{gulrajani-2017-improv-train-wasser-gans,
author = "{Gulrajani}, I. and {Ahmed}, F. and {Arjovsky}, M. and
{Dumoulin}, V. and {Courville}, A.",
title = "{Improved Training of Wasserstein GANs}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170400028G",
archivePrefix= "arXiv",
eprint = "1704.00028",
keywords = "Computer Science - Learning, Statistics - Machine Learning",
month = mar,
primaryClass = "cs.LG"
}
@ARTICLE{dauphin-2016-languag-model,
author = "{Dauphin}, Y.~N. and {Fan}, A. and {Auli}, M. and {Grangier},
D.",
title = "{Language Modeling With Gated Convolutional Networks}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161208083D",
archivePrefix= "arXiv",
eprint = "1612.08083",
keywords = "Computer Science - Computation and Language",
month = dec,
primaryClass = "cs.CL"
}
@ARTICLE{kuchaiev-2017-factor-trick-lstm,
author = "{Kuchaiev}, O. and {Ginsburg}, B.",
title = "{Factorization Tricks for Lstm networks}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170310722K",
archivePrefix= "arXiv",
eprint = "1703.10722",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing, Statistics - Machine Learning",
month = mar,
primaryClass = "cs.CL"
}
@ARTICLE{artetxe-2017-unsup-neural-machin-trans,
author = "{Artetxe}, M. and {Labaka}, G. and {Agirre}, E. and {Cho},
K.",
title = "{Unsupervised Neural Machine Translation}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171011041A",
archivePrefix= "arXiv",
eprint = "1710.11041",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning",
month = oct,
primaryClass = "cs.CL"
}
@inproceedings{artetxe-2016-learning-principled,
author = "Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko",
title = "Learning principled bilingual mappings of word embeddings
while preserving monolingual invariance",
booktitle = "Proceedings of the 2016 Conference on Empirical Methods in
Natural Language Processing",
year = 2016,
pages = "2289-2294"
}
@inproceedings{artetxe-2017-learning-bilingual,
author = "Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko",
title = "Learning bilingual word embeddings with (almost) no bilingual
data",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
year = 2017,
pages = "451-462"
}
@ARTICLE{sagun-2016-eigen-hessian-deep-learn,
author = "{Sagun}, L. and {Bottou}, L. and {LeCun}, Y.",
title = "{Eigenvalues of the Hessian in Deep Learning: Singularity and
Beyond}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161107476S",
archivePrefix= "arXiv",
eprint = "1611.07476",
keywords = "Computer Science - Learning",
month = nov,
primaryClass = "cs.LG"
}
@ARTICLE{zhou-2017-incep-score,
author = "{Zhou}, Z. and {Zhang}, W. and {Wang}, J.",
title = "{Inception Score, Label Smoothing, Gradient Vanishing and
-log(D(x)) Alternative}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170801729Z",
archivePrefix= "arXiv",
eprint = "1708.01729",
keywords = "Computer Science - Learning, Computer Science - Artificial
Intelligence, Computer Science - Computer Vision and Pattern
Recognition, Statistics - Machine Learning",
month = aug,
primaryClass = "cs.LG"
}
@ARTICLE{dauphin-2014-ident-attac,
author = "{Dauphin}, Y. and {Pascanu}, R. and {Gulcehre}, C. and {Cho},
K. and {Ganguli}, S. and {Bengio}, Y.",
title = "{Identifying and Attacking the Saddle Point Problem in
High-Dimensional Non-Convex optimization}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1406.2572D",
archivePrefix= "arXiv",
eprint = "1406.2572",
keywords = "Computer Science - Learning, Mathematics - Optimization and
Control, Statistics - Machine Learning",
month = jun,
primaryClass = "cs.LG"
}
@ARTICLE{bottou-2016-optim-method,
author = "{Bottou}, L. and {Curtis}, F.~E. and {Nocedal}, J.",
title = "{Optimization Methods for Large-Scale Machine Learning}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160604838B",
archivePrefix= "arXiv",
eprint = "1606.04838",
keywords = "Statistics - Machine Learning, Computer Science - Learning,
Mathematics - Optimization and Control",
month = jun,
primaryClass = "stat.ML"
}
@ARTICLE{berahas-2016-multi-batch,
author = "{Berahas}, A.~S. and {Nocedal}, J. and {Tak{\'a}{\v c}}, M.",
title = "{A Multi-Batch L-Bfgs Method for Machine Learning}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160506049B",
archivePrefix= "arXiv",
eprint = "1605.06049",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning, Statistics - Machine Learning",
month = may,
primaryClass = "math.OC"
}
@phdthesis{martens-2016-second-order,
title = "Second-order optimization for neural networks",
author = "Martens, James",
year = 2016,
school = "University of Toronto (Canada)"
}
@ARTICLE{mahsereci-2015-probab-line,
author = "{Mahsereci}, M. and {Hennig}, P.",
title = "{Probabilistic Line Searches for Stochastic Optimization}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150202846M",
archivePrefix= "arXiv",
eprint = "1502.02846",
keywords = "Computer Science - Learning, Mathematics - Optimization and
Control, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.LG"
}
@ARTICLE{tan-2016-barzilai-borwein,
author = "{Tan}, C. and {Ma}, S. and {Dai}, Y.-H. and {Qian}, Y.",
title = "{Barzilai-Borwein Step Size for Stochastic Gradient Descent}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1605.04131",
primaryClass = "math.OC",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning, Statistics - Machine Learning",
year = 2016,
month = may,
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160504131T",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{mass-2015-speed-learn,
author = "{Mass{\'e}}, P.-Y. and {Ollivier}, Y.",
title = "{Speed learning on the fly}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1511.02540",
primaryClass = "math.OC",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning, Statistics - Machine Learning",
year = 2015,
month = nov,
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151102540M",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{moritz-2015-linear-conver,
author = "{Moritz}, P. and {Nishihara}, R. and {Jordan}, M.~I.",
title = "{A Linearly-Convergent Stochastic L-Bfgs Algorithm}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150802087M",
archivePrefix= "arXiv",
eprint = "1508.02087",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning, Mathematics - Numerical Analysis, Statistics -
Computation, Statistics - Machine Learning",
month = aug,
primaryClass = "math.OC"
}
@ARTICLE{byrd-2014-stoch-quasi,
author = "{Byrd}, R.~H. and {Hansen}, S.~L. and {Nocedal}, J. and
{Singer}, Y.",
title = "{A Stochastic Quasi-Newton Method for Large-Scale
Optimization}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1401.7020B",
archivePrefix= "arXiv",
eprint = "1401.7020",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning, Statistics - Machine Learning",
month = jan,
primaryClass = "math.OC"
}
@article{pearlmutter-1994-fast-exact,
title = "Fast exact multiplication by the Hessian",
author = "Pearlmutter, Barak A",
journal = "Neural computation",
volume = 6,
number = 1,
pages = "147-160",
year = 1994,
publisher = "MIT Press"
}
@ARTICLE{agarwal-2016-second-order,
author = "{Agarwal}, N. and {Bullins}, B. and {Hazan}, E.",
title = "{Second Order Stochastic Optimization in Linear Time}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1602.03943",
primaryClass = "stat.ML",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
year = 2016,
month = feb,
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160203943A",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{pascanu-2014-saddl-point,
author = "{Pascanu}, R. and {Dauphin}, Y.~N. and {Ganguli}, S. and
{Bengio}, Y.",
title = "{On the Saddle Point Problem for Non-Convex optimization}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1405.4604P",
archivePrefix= "arXiv",
eprint = "1405.4604",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = may,
primaryClass = "cs.LG"
}
@ARTICLE{looks-2017-deep-learn,
author = "{Looks}, M. and {Herreshoff}, M. and {Hutchins}, D. and
{Norvig}, P.",
title = "{Deep Learning With Dynamic Computation Graphs}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170202181L",
archivePrefix= "arXiv",
eprint = "1702.02181",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Learning, Statistics - Machine Learning",
month = feb
}
@ARTICLE{neubig-2017-fly-operat,
author = "{Neubig}, G. and {Goldberg}, Y. and {Dyer}, C.",
title = "{On-The-Fly Operation Batching in Dynamic Computation
Graphs}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170507860N",
archivePrefix= "arXiv",
eprint = "1705.07860",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Statistics - Machine Learning",
month = may,
primaryClass = "cs.LG"
}
@ARTICLE{klein-2017-openn,
author = "{Klein}, G. and {Kim}, Y. and {Deng}, Y. and {Senellart},
J. and {Rush}, A.~M.",
title = "{OpenNMT: Open-Source Toolkit for Neural Machine
Translation}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170102810K",
archivePrefix= "arXiv",
eprint = "1701.02810",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Neural and Evolutionary
Computing",
month = jan,
primaryClass = "cs.CL"
}
@inproceedings{gatys-2016-image-style,
title = "Image style transfer using convolutional neural networks",
author = "Gatys, Leon A and Ecker, Alexander S and Bethge, Matthias",
booktitle = "Proceedings of the IEEE Conference on Computer Vision and
Pattern Recognition",
pages = "2414-2423",
year = 2016
}
@ARTICLE{kingma-2014-adam,
author = "{Kingma}, D.~P. and {Ba}, J.",
title = "{Adam: A Method for Stochastic Optimization}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.6980K",
archivePrefix= "arXiv",
eprint = "1412.6980",
keywords = "Computer Science - Learning",
month = dec,
primaryClass = "cs.LG"
}
@ARTICLE{zhang-2016-under-deep,
author = "{Zhang}, C. and {Bengio}, S. and {Hardt}, M. and {Recht},
B. and {Vinyals}, O.",
title = "{Understanding Deep Learning Requires Rethinking
generalization}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161103530Z",
archivePrefix= "arXiv",
eprint = "1611.03530",
keywords = "Computer Science - Learning",
month = nov,
primaryClass = "cs.LG"
}
@article{duchi-2011-adaptive-subgrad,
title = "Adaptive subgradient methods for online learning and
stochastic optimization",
author = "Duchi, John and Hazan, Elad and Singer, Yoram",
journal = "Journal of Machine Learning Research",
volume = 12,
number = "Jul",
pages = "2121-2159",
year = 2011
}
@inproceedings{roth-2004-feature-selection,
title = "Feature selection in clustering problems",
author = "Roth, Volker and Lange, Tilman",
booktitle = "Advances in neural information processing systems",
pages = "473-480",
year = 2004
}
@ARTICLE{liu-2017-gener-adver,
author = "{Liu}, L. and {Lu}, Y. and {Yang}, M. and {Qu}, Q. and {Zhu},
J. and {Li}, H.",
title = "{Generative Adversarial Network for Abstractive Text
Summarization}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171109357L",
archivePrefix= "arXiv",
eprint = "1711.09357",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence",
month = nov,
primaryClass = "cs.CL"
}
@ARTICLE{moussallem-2017-machin-trans,
author = "{Moussallem}, D. and {Wauer}, M. and {Ngonga Ngomo}, A.-C.",
title = "{Machine Translation Using Semantic Web Technologies: A
Survey}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171109476M",
archivePrefix= "arXiv",
eprint = "1711.09476",
keywords = "Computer Science - Computation and Language",
month = nov,
primaryClass = "cs.CL"
}
@ARTICLE{smith-2015-cyclic-lr,
author = "{Smith}, L.~N.",
title = "{Cyclical Learning Rates for Training Neural Networks}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150601186S",
archivePrefix= "arXiv",
eprint = "1506.01186",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = jun,
primaryClass = "cs.CV"
}
@ARTICLE{bubeck-2014-convex-optim,
author = "{Bubeck}, S.",
title = "{Convex Optimization: Algorithms and Complexity}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1405.4980B",
archivePrefix= "arXiv",
eprint = "1405.4980",
keywords = "Mathematics - Optimization and Control, Computer Science -
Computational Complexity, Computer Science - Learning,
Computer Science - Numerical Analysis, Statistics - Machine
Learning",
month = may,
primaryClass = "math.OC"
}
@ARTICLE{gu-2017-non-autor,
author = "{Gu}, J. and {Bradbury}, J. and {Xiong}, C. and {Li},
V.~O.~K. and {Socher}, R.",
title = "{Non-Autoregressive Neural Machine Translation}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171102281G",
archivePrefix= "arXiv",
eprint = "1711.02281",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = nov,
primaryClass = "cs.CL"
}
@article{kalman-1996-singularly-valuable,
title = {A singularly valuable decomposition: the SVD of a matrix},
author = {Kalman, Dan},
journal = "",
year = 1996
}
@ARTICLE{niu-2011-hogwil,
author = "{Niu}, F. and {Recht}, B. and {Re}, C. and {Wright}, S.~J.",
title = "{HOGWILD!: A Lock-Free Approach To Parallelizing Stochastic
Gradient Descent}",
journal = "ArXiv e-prints",
year = 2011,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2011arXiv1106.5730N",
archivePrefix= "arXiv",
eprint = "1106.5730",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning",
month = jun,
primaryClass = "math.OC"
}
@ARTICLE{theis-2015-note-evaluat-gener,
author = "{Theis}, L. and {van den Oord}, A. and {Bethge}, M.",
title = "{A Note on the Evaluation of Generative models}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151101844T",
archivePrefix= "arXiv",
eprint = "1511.01844",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = nov,
primaryClass = "stat.ML"
}
@ARTICLE{sutherland-2016-gener-model,
author = "{Sutherland}, D.~J. and {Tung}, H.-Y. and {Strathmann},
H. and {De}, S. and {Ramdas}, A. and {Smola}, A. and
{Gretton}, A.",
title = "{Generative Models and Model Criticism Via Optimized Maximum
Mean Discrepancy}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161104488S",
archivePrefix= "arXiv",
eprint = "1611.04488",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing, Statistics - Methodology",
month = nov,
primaryClass = "stat.ML"
}
@ARTICLE{yang-2016-multi-task,
author = "{Yang}, Z. and {Salakhutdinov}, R. and {Cohen}, W.",
title = "{Multi-Task Cross-Lingual Sequence Tagging From Scratch}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160306270Y",
archivePrefix= "arXiv",
eprint = "1603.06270",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = mar,
primaryClass = "cs.CL"
}
@ARTICLE{dhingra-2016-tweet,
author = "{Dhingra}, B. and {Zhou}, Z. and {Fitzpatrick}, D. and
{Muehl}, M. and {Cohen}, W.~W.",
title = "{Tweet2Vec: Character-Based Distributed Representations for
Social Media}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160503481D",
archivePrefix= "arXiv",
eprint = "1605.03481",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = may,
primaryClass = "cs.LG"
}
@inproceedings{coates-2011-text-detection,
title = "Text detection and character recognition in scene images with
unsupervised feature learning",
author = "Coates, Adam and Carpenter, Blake and Case, Carl and
Satheesh, Sanjeev and Suresh, Bipin and Wang, Tao and Wu,
David J and Ng, Andrew Y",
booktitle = "Document Analysis and Recognition (ICDAR), 2011 International
Conference on",
pages = "440-445",
year = 2011,
organization = "IEEE"
}
@inproceedings{zhang-2013-hmsearch,
title = "Hmsearch: An efficient hamming distance query processing
algorithm",
author = "Zhang, Xiaoyang and Qin, Jianbin and Wang, Wei and Sun,
Yifang and Lu, Jiaheng",
booktitle = "Proceedings of the 25th International Conference on
Scientific and Statistical Database Management",
pages = 19,
year = 2013,
organization = "ACM"
}
@misc{mueen-2017-fastest-similarity,
title = "The Fastest Similarity Search Algorithm for Time Series
Subsequences under Euclidean Distance",
author = "Mueen, Abdullah and Zhu, Yan and Yeh, Michael and Kamgar,
Kaveh and Viswanathan, Krishnamurthy and Gupta, Chetan and
Keogh, Eamonn",
year = 2017,
month = "August",
note =
"\url{http://www.cs.unm.edu/~mueen/FastestSimilaritySearch.html}"
}
@article{hyyro-2001-explaining-extending,
title = {Explaining and extending the bit-parallel approximate string
matching algorithm of Myers},
author = {Hyyr{\"o}, Heikki},
year = 2001,
publisher = {Citeseer},
journal = ""
}
@inproceedings{askitis-2007-hat-trie,
title = "HAT-trie: a cache-conscious trie-based data structure for
strings",
author = "Askitis, Nikolas and Sinha, Ranjan",
booktitle = "Proceedings of the thirtieth Australasian conference on
Computer science-Volume 62",
pages = "97-105",
year = 2007,
organization = "Australian Computer Society, Inc."
}
@techreport{bagwell-2001-ideal-hash-trees,
title = {Ideal hash trees},
author = {Bagwell, Phil},
year = 2001,
institution = ""
}
@article{van-2014-accelerating-t-sne,
title = "Accelerating t-SNE using tree-based algorithms.",
author = "Van Der Maaten, Laurens",
journal = "Journal of machine learning research",
volume = 15,
number = 1,
pages = "3221-3245",
year = 2014
}
@article{tibshirani-2001-estimating-number,
title = "Estimating the number of clusters in a data set via the gap
statistic",
author = "Tibshirani, Robert and Walther, Guenther and Hastie, Trevor",
journal = "Journal of the Royal Statistical Society: Series B
(Statistical Methodology)",
volume = 63,
number = 2,
pages = "411-423",
year = 2001,
publisher = "Wiley Online Library"
}
@article{schmidhuber-1992-learning-factorial-codes,
title = "Learning factorial codes by predictability minimization",
author = "Schmidhuber, J{\"u}rgen",
journal = "Neural Computation",
volume = 4,
number = 6,
pages = "863-879",
year = 1992,
publisher = "MIT Press"
}
@article{maaten-2008-visualizing-data,
title = "Visualizing data using t-SNE",
author = "Maaten, Laurens van der and Hinton, Geoffrey",
journal = "Journal of machine learning research",
volume = 9,
number = "Nov",
pages = "2579-2605",
year = 2008
}
@ARTICLE{kingma-2013-auto-encoding,
author = "{Kingma}, D.~P and {Welling}, M.",
title = "{Auto-Encoding Variational Bayes}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1312.6114",
primaryClass = "stat.ML",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
year = 2013,
month = dec,
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.6114K",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{rezende-2014-stochastic-backpropagation,
author = "{Jimenez Rezende}, D. and {Mohamed}, S. and {Wierstra}, D.",
title = "{Stochastic Backpropagation and Approximate Inference in Deep
Generative Models}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1401.4082",
primaryClass = "stat.ML",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Learning, Statistics -
Computation, Statistics - Methodology",
year = 2014,
month = jan,
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1401.4082J",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{roberts-2017-cross-validation,
title = "Cross-validation strategies for data with temporal, spatial,
hierarchical, or phylogenetic structure",
author = "Roberts, David R and Bahn, Volker and Ciuti, Simone and
Boyce, Mark S and Elith, Jane and Guillera-Arroita, Gurutzeta
and Hauenstein, Severin and Lahoz-Monfort, Jos{\'e} J and
Schr{\"o}der, Boris and Thuiller, Wilfried and others",
journal = "Ecography",
volume = 40,
number = 8,
pages = "913-929",
year = 2017,
publisher = "Wiley Online Library"
}
@inproceedings{zhang-2004-optimality-navie,
  title = "The optimality of naive Bayes",
  author = "Zhang, Harry",
  booktitle = "Proceedings of the Seventeenth International Florida
               Artificial Intelligence Research Society Conference
               (FLAIRS)",
  year = 2004
}
@inproceedings{zheng-2013-deep-learning,
title = "Deep learning for Chinese word segmentation and POS tagging",
author = "Zheng, Xiaoqing and Chen, Hanyang and Xu, Tianyu",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in
Natural Language Processing",
pages = "647-657",
year = 2013
}
@article{黄昌宁-2007-中文分词十年回顾,
title = "中文分词十年回顾",
author = "黄昌宁 and 赵海 and others",
journal = "中文信息学报",
volume = 21,
number = 3,
pages = "8-19",
year = 2007
}
@article{张博-2006-对互联网环境下中文分词系统的一种架构改进,
title = "对互联网环境下中文分词系统的一种架构改进",
author = "张博 and 姜建国 and 万平国",
journal = "计算机应用研究",
volume = 11,
pages = "176-178",
year = 2006
}
@phdthesis{孙茂松-2001-汉语自动分词研究评述,
title = {汉语自动分词研究评述},
author = {孙茂松 and 邹嘉彦},
school = "",
year = 2001
}
@article{赵伟-2004-一种规则与统计相结合的汉语分词方法,
title = "一种规则与统计相结合的汉语分词方法",
author = "赵伟 and 戴新宇 and 尹存燕 and 陈家骏",
journal = "计算机应用研究",
volume = 21,
number = 3,
pages = "23-25",
year = 2004
}
@article{张华平-2004-基于角色标注的中国人名自动识别研究,
title = "基于角色标注的中国人名自动识别研究",
author = "张华平 and 刘群",
journal = "计算机学报",
volume = 27,
number = 1,
year = 2004
}
@article{孙宾-2003-现代汉语文本的词语切分技术,
title = "现代汉语文本的词语切分技术",
author = "孙宾",
journal = "技术报告, 北京大学计算语言学研究所",
year = 2003
}
@article{foo-2004-chinese-word,
title = "Chinese word segmentation and its effect on information
retrieval",
author = "Foo, Schubert and Li, Hui",
journal = "Information processing \& management",
volume = 40,
number = 1,
pages = "161-190",
year = 2004,
publisher = "Elsevier"
}
@inproceedings{peng-2004-chinese-segmentation,
title = "Chinese segmentation and new word detection using conditional
random fields",
author = "Peng, Fuchun and Feng, Fangfang and McCallum, Andrew",
booktitle = "Proceedings of the 20th international conference on
Computational Linguistics",
pages = 562,
year = 2004,
organization = "Association for Computational Linguistics"
}
@article{huang-2003-applying-machine,
title = "Applying machine learning to text segmentation for
information retrieval",
author = "Huang, Xiangji and Peng, Fuchun and Schuurmans, Dale and
Cercone, Nick and Robertson, Stephen E",
journal = "Information Retrieval",
volume = 6,
number = "3-4",
pages = "333-362",
year = 2003,
publisher = "Springer"
}
@inproceedings{jiang-2009-automatic-adaptation,
title = "Automatic adaptation of annotation standards: Chinese word
segmentation and POS tagging: a case study",
author = "Jiang, Wenbin and Huang, Liang and Liu, Qun",
booktitle = "Proceedings of the Joint Conference of the 47th Annual
Meeting of the ACL and the 4th International Joint Conference
on Natural Language Processing of the AFNLP: Volume 1-Volume
1",
pages = "522-530",
year = 2009,
organization = "Association for Computational Linguistics"
}
@inproceedings{sun-1998-chinese-word,
title = "Chinese word segmentation without using lexicon and
hand-crafted training data",
author = "Maosong, Sun and Dayang, Shen and Tsou, Benjamin K",
booktitle = "Proceedings of the 36th Annual Meeting of the Association for
Computational Linguistics and 17th International Conference
on Computational Linguistics-Volume 2",
pages = "1265-1271",
year = 1998,
organization = "Association for Computational Linguistics"
}
@article{俞士汶-2002-北京大学现代汉语语料库基本加工规范,
title = "北京大学现代汉语语料库基本加工规范 (续)",
author = "俞士汶 and 段慧明 and 朱学锋 and 孙斌",
journal = "中文信息学报",
volume = 16,
number = 6,
pages = "59-65",
year = 2002
}
@article{宋柔-1997-关于分词规范的探讨,
title = "关于分词规范的探讨",
author = "宋柔",
journal = "语言文字应用",
number = 3,
pages = "113-114",
year = 1997
}
@article{孙茂松-2001-信息处理用词汇研究,
title = "信息处理用现代汉语分词词表",
author = "孙茂松 and 王洪君 and 李行健 and 富丽 and 黄昌宁 and 陈松岑
and谢自立 and 张卫国",
journal = "语言文字应用",
number = 4,
pages = "84-89",
year = 2001
}
@article{李玉梅-2007-分词规范亟需补充的三方面内容,
title = "分词规范亟需补充的三方面内容",
author = "李玉梅 and 陈晓 and 姜自霞 and 易江燕 and 靳光瑾 and 黄昌宁",
journal = "中文信息学报",
volume = 21,
number = 5,
pages = "1-7",
year = 2007
}
@article{刘荣-2011-利用统计量和语言学规则提取多字词表达,
title = "利用统计量和语言学规则提取多字词表达",
author = "刘荣 and 王奕凯",
journal = "太原理工大學學報",
volume = 42,
number = 2,
pages = "133-137",
year = 2011,
publisher = "太原理工大學學報編輯部"
}
@inproceedings{zhao-2017-ngram2vec,
title = "Ngram2vec: Learning Improved Word Representations from Ngram
Co-occurrence Statistics",
author = "Zhao, Zhe and Liu, Tao and Li, Shen and Li, Bofang and Du,
Xiaoyong",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
pages = "244-253",
year = 2017
}
@ARTICLE{pawar-2017-relation-extraction,
author = "{Pawar}, S. and {Palshikar}, G.~K. and {Bhattacharyya}, P.",
title = "{Relation Extraction : A Survey}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1712.05191",
primaryClass = "cs.CL",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Information Retrieval",
year = 2017,
month = dec,
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171205191P",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{mintz-2009-distant-supervision,
title = "Distant supervision for relation extraction without labeled
data",
author = "Mintz, Mike and Bills, Steven and Snow, Rion and Jurafsky,
Dan",
booktitle = "Proceedings of the Joint Conference of the 47th Annual
Meeting of the ACL and the 4th International Joint Conference
on Natural Language Processing of the AFNLP: Volume 2-Volume
2",
pages = "1003-1011",
year = 2009,
organization = "Association for Computational Linguistics"
}
@article{王丽杰-2009-基于SVMTool的中文词性标注,
author = "王丽杰 and 车万翔 and 刘挺",
title = "基于SVMTool的中文词性标注",
publisher = "中文信息学报",
year = 2009,
journal = "中文信息学报",
volume = 23,
number = 4,
eid = 16,
numpages = 6,
pages = 16,
keywords = "计算机应用;中文信息处理;词性标注;SVMTool;未登录词;偏旁部首",
url = "http://jcip.cipsc.org.cn/CN/abstract/article_1212.shtml"
}
@ARTICLE{sutton-2010-intro-cond,
author = "{Sutton}, C. and {McCallum}, A.",
title = "{An Introduction to Conditional Random Fields}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1011.4088",
primaryClass = "stat.ML",
keywords = "Statistics - Machine Learning",
year = 2010,
month = nov,
adsurl = "http://adsabs.harvard.edu/abs/2010arXiv1011.4088S",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{lafferty-2001-cond-rand,
author = "Lafferty, John D. and McCallum, Andrew and Pereira, Fernando
C. N.",
title = "Conditional Random Fields: Probabilistic Models for
Segmenting and Labeling Sequence Data",
booktitle = "Proceedings of the Eighteenth International Conference on
Machine Learning",
series = "ICML '01",
year = 2001,
isbn = "1-55860-778-1",
pages = "282-289",
numpages = 8,
url = "http://dl.acm.org/citation.cfm?id=645530.655813",
acmid = 655813,
publisher = "Morgan Kaufmann Publishers Inc.",
address = "San Francisco, CA, USA"
}
@inproceedings{sha-2003-shallow-parsing,
author = "Sha, Fei and Pereira, Fernando",
title = "Shallow Parsing with Conditional Random Fields",
booktitle = "Proceedings of the 2003 Conference of the North American
Chapter of the Association for Computational Linguistics on
Human Language Technology - Volume 1",
series = "NAACL '03",
year = 2003,
location = "Edmonton, Canada",
pages = "134-141",
numpages = 8,
url = "https://doi.org/10.3115/1073445.1073473",
doi = "10.3115/1073445.1073473",
acmid = 1073473,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@article{刘炜-2017-一种面向突发事件的文本语料自动标注方法,
author = "刘炜 and 王旭 and 张雨嘉 and 刘宗田",
title = "一种面向突发事件的文本语料自动标注方法",
publisher = "中文信息学报",
year = 2017,
journal = "中文信息学报",
volume = 31,
number = 2,
eid = 76,
numpages = 9,
pages = 76,
keywords = "突发事件;语料库;自动标注",
url = "http://jcip.cipsc.org.cn/CN/abstract/article_2360.shtml"
}
@ARTICLE{huang-2015-bidirect-lstm-crf,
author = "{Huang}, Z. and {Xu}, W. and {Yu}, K.",
title = "{Bidirectional LSTM-CRF Models for Sequence Tagging}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1508.01991",
primaryClass = "cs.CL",
keywords = "Computer Science - Computation and Language",
year = 2015,
month = aug,
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150801991H",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{askitis-2005-cache-conscious,
title = "Cache-conscious collision resolution in string hash tables",
author = "Askitis, Nikolas and Zobel, Justin",
booktitle = "International Symposium on String Processing and Information
Retrieval",
pages = "91-102",
year = 2005,
organization = "Springer"
}
@article{王厚峰-2002-指代消解的基本方法和实现技术,
title = "指代消解的基本方法和实现技术",
author = "王厚峰",
journal = "中文信息学报",
volume = 16,
number = 6,
pages = "10-18",
year = 2002
}
@inproceedings{ahn-2006-stage-event,
author = "Ahn, David",
title = "The Stages of Event Extraction",
booktitle = "Proceedings of the Workshop on Annotating and Reasoning About
Time and Events",
series = "ARTE '06",
year = 2006,
isbn = "1-932432-81-7",
location = "Sydney, Australia",
pages = "1-8",
numpages = 8,
url = "http://dl.acm.org/citation.cfm?id=1629235.1629236",
acmid = 1629236,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@article{赵妍妍-2008-中文事件抽取技术研究,
title = "中文事件抽取技术研究",
author = "赵妍妍 and 秦兵 and 车万翔 and 刘挺",
journal = "中文信息学报",
volume = 22,
number = 1,
pages = "3-8",
year = 2008
}
@article{李颖-2017-中文开放式多元实体关系抽取,
title = "中文开放式多元实体关系抽取",
author = "李颖 and 郝晓燕 and 王勇",
journal = "计算机科学",
number = "S1",
pages = "80-83",
year = 2017
}
@inproceedings{takamatsu-2012-reducing-wrong,
author = "Takamatsu, Shingo and Sato, Issei and Nakagawa, Hiroshi",
title = "Reducing Wrong Labels in Distant Supervision for Relation
Extraction",
booktitle = "Proceedings of the 50th Annual Meeting of the Association for
Computational Linguistics: Long Papers - Volume 1",
series = "ACL '12",
year = 2012,
location = "Jeju Island, Korea",
pages = "721-729",
numpages = 9,
url = "http://dl.acm.org/citation.cfm?id=2390524.2390626",
acmid = 2390626,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@inproceedings{yao-2010-collective-cross,
title = "Collective cross-document relation extraction without
labelled data",
author = "Yao, Limin and Riedel, Sebastian and McCallum, Andrew",
booktitle = "Proceedings of the 2010 Conference on Empirical Methods in
Natural Language Processing",
pages = "1013-1023",
year = 2010,
organization = "Association for Computational Linguistics"
}
@inproceedings{berant-2013-semantic-parsing,
title = "Semantic parsing on freebase from question-answer pairs",
author = "Berant, Jonathan and Chou, Andrew and Frostig, Roy and Liang,
Percy",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in
Natural Language Processing",
pages = "1533-1544",
year = 2013
}
@inproceedings{hoffmann-2011-knowledge-based,
title = "Knowledge-based weak supervision for information extraction
of overlapping relations",
author = "Hoffmann, Raphael and Zhang, Congle and Ling, Xiao and
Zettlemoyer, Luke and Weld, Daniel S",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies-Volume
1",
pages = "541-550",
year = 2011,
organization = "Association for Computational Linguistics"
}
@inproceedings{surdeanu-2012-multi-instance,
title = "Multi-instance multi-label learning for relation extraction",
author = "Surdeanu, Mihai and Tibshirani, Julie and Nallapati, Ramesh
and Manning, Christopher D",
booktitle = "Proceedings of the 2012 joint conference on empirical methods
in natural language processing and computational natural
language learning",
pages = "455-465",
year = 2012,
organization = "Association for Computational Linguistics"
}
@InProceedings{lin-2013-conv-neural,
author = "Liu, ChunYang and Sun, WenBo and Chao, WenHan and Che,
WanXiang",
editor = "Motoda, Hiroshi and Wu, Zhaohui and Cao, Longbing and Zaiane,
Osmar and Yao, Min and Wang, Wei",
title = "Convolution Neural Network for Relation Extraction",
booktitle = "Advanced Data Mining and Applications",
year = 2013,
publisher = "Springer Berlin Heidelberg",
address = "Berlin, Heidelberg",
pages = "231-242",
abstract = "Deep Neural Network has been applied to many Natural Language
Processing tasks. Instead of building hand-craft features,
DNN builds features by automatic learning, fitting different
domains well. In this paper, we propose a novel convolution
network, incorporating lexical features, applied to Relation
Extraction. Since many current deep neural networks use word
embedding by word table, which, however, neglects semantic
meaning among words, we import a new coding method, which
coding input words by synonym dictionary to integrate
semantic knowledge into the neural network. We compared our
Convolution Neural Network (CNN) on relation extraction with
the state-of-art tree kernel approach, including Typed
Dependency Path Kernel and Shortest Dependency Path Kernel
and Context-Sensitive tree kernel, resulting in a 9{\%}
improvement competitive performance on ACE2005 data
set. Also, we compared the synonym coding with the one-hot
coding, and our approach got 1.6{\%} improvement. Moreover,
we also tried other coding method, such as hypernym coding,
and give some discussion according the result.",
isbn = "978-3-642-53917-6"
}
@inproceedings{zeng-2014-relation-classification,
title = "Relation classification via convolutional deep neural
network",
author = "Zeng, Daojian and Liu, Kang and Lai, Siwei and Zhou, Guangyou
and Zhao, Jun",
booktitle = "Proceedings of COLING 2014, the 25th International Conference
on Computational Linguistics: Technical Papers",
pages = "2335-2344",
year = 2014
}
@inproceedings{nguyen-2015-relation-extraction,
title = "Relation extraction: Perspective from convolutional neural
networks",
author = "Nguyen, Thien Huu and Grishman, Ralph",
booktitle = "Proceedings of the 1st Workshop on Vector Space Modeling for
Natural Language Processing",
pages = "39-48",
year = 2015
}
@ARTICLE{nogueira-2015-class-relat,
author = "{Nogueira dos Santos}, C. and {Xiang}, B. and {Zhou}, B.",
title = "{Classifying Relations By Ranking With Convolutional Neural
Networks}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150406580N",
archivePrefix= "arXiv",
eprint = "1504.06580",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing",
month = apr,
primaryClass = "cs.CL"
}
@ARTICLE{zhang-2015-relat-class,
author = "{Zhang}, D. and {Wang}, D.",
title = "{Relation Classification Via Recurrent Neural Network}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150801006Z",
archivePrefix= "arXiv",
eprint = "1508.01006",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing",
month = aug,
primaryClass = "cs.CL"
}
@inproceedings{zhou-2016-attention-based,
title = "Attention-based bidirectional long short-term memory networks
for relation classification",
author = "Zhou, Peng and Shi, Wei and Tian, Jun and Qi, Zhenyu and Li,
Bingchen and Hao, Hongwei and Xu, Bo",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 2: Short Papers)",
volume = 2,
pages = "207-212",
year = 2016
}
@inproceedings{wang-2016-relation-classification,
title = "Relation classification via multi-level attention cnns",
author = "Wang, Linlin and Cao, Zhu and de Melo, Gerard and Liu,
Zhiyuan",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "1298-1307",
year = 2016
}
@inproceedings{zeng-2015-distant-supervision,
title = "Distant supervision for relation extraction via piecewise
convolutional neural networks",
author = "Zeng, Daojian and Liu, Kang and Chen, Yubo and Zhao, Jun",
booktitle = "Proceedings of the 2015 Conference on Empirical Methods in
Natural Language Processing",
pages = "1753-1762",
year = 2015
}
@inproceedings{jiang-2016-relation-extraction,
title = "Relation extraction with multi-instance multi-label
convolutional neural networks",
author = "Jiang, Xiaotian and Wang, Quan and Li, Peng and Wang, Bin",
booktitle = "Proceedings of COLING 2016, the 26th International Conference
on Computational Linguistics: Technical Papers",
pages = "1471-1480",
year = 2016
}
@inproceedings{ji-2017-distan-super,
title = "Distant Supervision for Relation Extraction with
Sentence-Level Attention and Entity Descriptions",
author = "Guoliang Ji and Kang Liu and Shizhu He and Jun Zhao",
booktitle = "AAAI",
year = 2017
}
@article{漆桂林-2017-知识图谱研究进展,
title = "知识图谱研究进展",
author = "漆桂林 and 高桓 and 吴天星",
journal = "情报工程",
volume = 3,
number = 1,
pages = "4-25",
year = 2017
}
@inproceedings{brin-1998-extrac,
title = "Extracting patterns and relations from the world wide web",
author = "Brin, Sergey",
booktitle = "International Workshop on The World Wide Web and Databases",
pages = "172-183",
year = 1998,
organization = "Springer"
}
@inproceedings{agichtein-2000-snowball,
title = "Snowball: Extracting relations from large plain-text
collections",
author = "Agichtein, Eugene and Gravano, Luis",
booktitle = "Proceedings of the fifth ACM conference on Digital libraries",
pages = "85-94",
year = 2000,
organization = "ACM"
}
@inproceedings{yates-2007-textrunner,
title = "Textrunner: open information extraction on the web",
author = "Yates, Alexander and Cafarella, Michael and Banko, Michele
and Etzioni, Oren and Broadhead, Matthew and Soderland,
Stephen",
booktitle = "Proceedings of Human Language Technologies: The Annual
Conference of the North American Chapter of the Association
for Computational Linguistics: Demonstrations",
pages = "25-26",
year = 2007,
organization = "Association for Computational Linguistics"
}
@inproceedings{bollegala-2009-measur-simil,
author = "Bollegala, Danushka T. and Matsuo, Yutaka and Ishizuka,
Mitsuru",
title = "Measuring the Similarity Between Implicit Semantic Relations
from the Web",
booktitle = "Proceedings of the 18th International Conference on World
Wide Web",
series = "WWW '09",
year = 2009,
isbn = "978-1-60558-487-4",
location = "Madrid, Spain",
pages = "651-660",
numpages = 10,
url = "http://doi.acm.org/10.1145/1526709.1526797",
doi = "10.1145/1526709.1526797",
acmid = 1526797,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "natural language processing, relational similarity, web
mining"
}
@inproceedings{bollegala-2010-relat-dualit,
author = "Bollegala, Danushka Tarupathi and Matsuo, Yutaka and
Ishizuka, Mitsuru",
title = "Relational Duality: Unsupervised Extraction of Semantic
Relations Between Entities on the Web",
booktitle = "Proceedings of the 19th International Conference on World
Wide Web",
series = "WWW '10",
year = 2010,
isbn = "978-1-60558-799-8",
location = "Raleigh, North Carolina, USA",
pages = "151-160",
numpages = 10,
url = "http://doi.acm.org/10.1145/1772690.1772707",
doi = "10.1145/1772690.1772707",
acmid = 1772707,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "relation extraction, relational duality, relational
similarity, web mining"
}
@inproceedings{batista-2015-semi-supervised,
title = "Semi-supervised bootstrapping of relationship extractors with
distributional semantics",
author = "Batista, David S and Martins, Bruno and Silva, M{\'a}rio J",
booktitle = "Proceedings of the 2015 Conference on Empirical Methods in
Natural Language Processing",
pages = "499-504",
year = 2015
}
@inproceedings{zhu-2009-statsnowball,
author = "Zhu, Jun and Nie, Zaiqing and Liu, Xiaojiang and Zhang, Bo
and Wen, Ji-Rong",
title = "StatSnowball: A Statistical Approach to Extracting Entity
Relationships",
booktitle = "Proceedings of the 18th International Conference on World
Wide Web",
series = "WWW '09",
year = 2009,
isbn = "978-1-60558-487-4",
location = "Madrid, Spain",
pages = "101-110",
numpages = 10,
url = "http://doi.acm.org/10.1145/1526709.1526724",
doi = "10.1145/1526709.1526724",
acmid = 1526724,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "Markov logic networks, relationship extraction, statistical
models"
}
@article{车万翔-2005-实体关系自动抽取,
title = "实体关系自动抽取",
author = "车万翔 and 刘挺 and 李生",
journal = "中文信息学报",
volume = 19,
number = 2,
pages = "2-7",
year = 2005
}
@article{田久乐-2010-基于同义词词林的词语相似度计算方法,
title = "基于同义词词林的词语相似度计算方法",
author = "田久乐 and 赵蔚",
journal = "吉林大学学报: 信息科学版",
number = 6,
pages = "602-608",
year = 2010
}
@ARTICLE{ratner-2017-snorkel,
author = "{Ratner}, A. and {Bach}, S.~H. and {Ehrenberg}, H. and
{Fries}, J. and {Wu}, S. and {R{\'e}}, C.",
title = "{Snorkel: Rapid Training Data Creation with Weak
Supervision}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1711.10160",
primaryClass = "cs.LG",
keywords = "Computer Science - Learning, Statistics - Machine Learning",
year = 2017,
month = nov,
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171110160R",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{bach-2017-learn-struc,
author = "{Bach}, S.~H. and {He}, B. and {Ratner}, A. and {R{\'e}}, C.",
title = "{Learning the Structure of Generative Models Without Labeled
Data}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170300854B",
archivePrefix= "arXiv",
eprint = "1703.00854",
keywords = "Computer Science - Learning, Statistics - Machine Learning",
month = mar,
primaryClass = "cs.LG"
}
@ARTICLE{ratner-2016-data-progr,
author = "{Ratner}, A. and {De Sa}, C. and {Wu}, S. and {Selsam},
D. and {R{\'e}}, C.",
title = "{Data Programming: Creating Large Training Sets, Quickly}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160507723R",
archivePrefix= "arXiv",
eprint = "1605.07723",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Learning",
month = may,
primaryClass = "stat.ML"
}
@article{刘宗田-2009-面向事件的本体研究,
title = "面向事件的本体研究",
author = "刘宗田 and 黄美丽 and 周文 and 仲兆满 and 付剑锋 and 单建芳
and 智慧来",
journal = "计算机科学",
volume = 36,
number = 11,
pages = "189-192",
year = 2009
}
@article{pei-2004-mining,
title = "Mining sequential patterns by pattern-growth: The prefixspan
approach",
author = "Pei, Jian and Han, Jiawei and Mortazavi-Asl, Behzad and Wang,
Jianyong and Pinto, Helen and Chen, Qiming and Dayal,
Umeshwar and Hsu, Mei-Chun",
journal = "IEEE Transactions on knowledge and data engineering",
volume = 16,
number = 11,
pages = "1424-1440",
year = 2004,
publisher = "IEEE"
}
@article{李明耀-2016-基于依存分析的开放式中文实体关系抽取方法,
title = "基于依存分析的开放式中文实体关系抽取方法",
author = "李明耀 and 杨静",
journal = "计算机工程",
volume = 42,
number = 6,
pages = "201-207",
year = 2016
}
@inproceedings{ratinov-2009-design-challenges,
title = "Design challenges and misconceptions in named entity
recognition",
author = "Ratinov, Lev and Roth, Dan",
booktitle = "Proceedings of the Thirteenth Conference on Computational
Natural Language Learning",
pages = "147-155",
year = 2009,
organization = "Association for Computational Linguistics"
}
@article{dai-2015-enhan,
  title = "Enhancing of chemical compound and drug name recognition
           using representative tag scheme and fine-grained
           tokenization",
  author = "Hong-Jie Dai and Po-Ting Lai and Yung-Chun Chang and Richard
            Tzong-Han Tsai",
  journal = "Journal of Cheminformatics",
  year = 2015
}
@ARTICLE{dyer-2015-stack-lstm,
author = "{Dyer}, C. and {Ballesteros}, M. and {Ling}, W. and
{Matthews}, A. and {Smith}, N.~A.",
title = "{Transition-Based Dependency Parsing With Stack Long
Short-Term Memory}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150508075D",
archivePrefix= "arXiv",
eprint = "1505.08075",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing",
month = may,
primaryClass = "cs.CL"
}
@inproceedings{ling-2015-not-all,
title = "Not all contexts are created equal: Better word
representations with variable attention",
author = "Ling, Wang and Tsvetkov, Yulia and Amir, Silvio and
Fermandez, Ramon and Dyer, Chris and Black, Alan W and
Trancoso, Isabel and Lin, Chu-Cheng",
booktitle = "Proceedings of the 2015 Conference on Empirical Methods in
Natural Language Processing",
pages = "1367-1372",
year = 2015
}
@article{koppel-2006-importance-neutral,
title = "The importance of neutral examples for learning sentiment",
author = "Koppel, Moshe and Schler, Jonathan",
journal = "Computational Intelligence",
volume = 22,
number = 2,
pages = "100-109",
year = 2006,
publisher = "Wiley Online Library"
}
@article{berger-1996-maximum-entropy,
title = "A maximum entropy approach to natural language processing",
author = "Berger, Adam L and Pietra, Vincent J Della and Pietra,
Stephen A Della",
journal = "Computational linguistics",
volume = 22,
number = 1,
pages = "39-71",
year = 1996,
publisher = "MIT Press"
}
@ARTICLE{Prescher-2004-tutoral,
author = "{Prescher}, D.",
title = "{A Tutorial on the Expectation-Maximization Algorithm
Including Maximum-Likelihood Estimation and EM Training of
Probabilistic Context-Free Grammars}",
journal = "eprint arXiv:cs/0412015",
eprint = "cs/0412015",
keywords = "Computer Science - Computation and Language",
year = 2004,
month = dec,
adsurl = "http://adsabs.harvard.edu/abs/2004cs.......12015P",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{berger-1997-improved-iterative,
title = "The improved iterative scaling algorithm: A gentle
introduction",
author = "Berger, Adam",
journal = "Unpublished manuscript",
year = 1997
}
@inproceedings{curran-2003-inves-gis,
author = "Curran, James R. and Clark, Stephen",
title = "Investigating GIS and Smoothing for Maximum Entropy Taggers",
booktitle = "Proceedings of the Tenth Conference on European Chapter of
the Association for Computational Linguistics - Volume 1",
series = "EACL '03",
year = 2003,
isbn = "1-333-56789-0",
location = "Budapest, Hungary",
pages = "91-98",
numpages = 8,
url = "https://doi.org/10.3115/1067807.1067821",
doi = "10.3115/1067807.1067821",
acmid = 1067821,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@article{张华平-2002-基于N-最短路径方法的中文词语粗分模型,
title = "基于N-最短路径方法的中文词语粗分模型",
author = "张华平 and 刘群",
journal = "中文信息学报",
volume = 16,
number = 5,
pages = "3-9",
year = 2002
}
@article{秦兵-2015-无指导的中文开放式实体关系抽取,
title = "无指导的中文开放式实体关系抽取",
author = "秦兵 and 刘安安 and 刘挺 and others",
journal = "计算机研究与发展",
volume = 52,
number = 5,
year = 2015,
pages = "1029-1035"
}
@inproceedings{li-2013-joint-event,
title = "Joint event extraction via structured prediction with global
features",
author = "Li, Qi and Ji, Heng and Huang, Liang",
booktitle = "Proceedings of the 51st Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "73-82",
year = 2013
}
@article{chen-2012-joint-modeling,
title = "Joint modeling for chinese event extraction with rich
linguistic features",
author = "Chen, Chen and Ng, Vincent",
journal = "Proceedings of COLING 2012",
pages = "529-544",
year = 2012
}
@inproceedings{singh-2013-joint-infer,
author = "Singh, Sameer and Riedel, Sebastian and Martin, Brian and
Zheng, Jiaping and McCallum, Andrew",
title = "Joint Inference of Entities, Relations, and Coreference",
booktitle = "Proceedings of the 2013 Workshop on Automated Knowledge Base
Construction",
series = "AKBC '13",
year = 2013,
isbn = "978-1-4503-2411-3",
location = "San Francisco, California, USA",
pages = "1-6",
numpages = 6,
url = "http://doi.acm.org/10.1145/2509558.2509559",
doi = "10.1145/2509558.2509559",
acmid = 2509559,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "coreference resolution, information extraction, joint
inference, named entity recognition, relation extraction"
}
@inproceedings{riedel-2011-fast-robust,
author = "Riedel, Sebastian and McCallum, Andrew",
title = "Fast and Robust Joint Models for Biomedical Event Extraction",
booktitle = "Proceedings of the Conference on Empirical Methods in Natural
Language Processing",
series = "EMNLP '11",
year = 2011,
isbn = "978-1-937284-11-4",
location = "Edinburgh, United Kingdom",
pages = "1-12",
numpages = 12,
url = "http://dl.acm.org/citation.cfm?id=2145432.2145434",
acmid = 2145434,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@article{何馨宇-2017-基于双向LSTM和两阶段方法的触发词识别,
author = "何馨宇 and 李丽双",
title = "基于双向LSTM和两阶段方法的触发词识别",
publisher = "中文信息学报",
year = 2017,
journal = "中文信息学报",
volume = 31,
number = 6,
eid = 147,
numpages = 7,
pages = 147,
keywords = "触发词识别;两阶段方法;双向LSTM;依存词向量",
url = "http://jcip.cipsc.org.cn/CN/abstract/article_2482.shtml"
}
@ARTICLE{cai-2017-fast-accur,
author = "{Cai}, D. and {Zhao}, H. and {Zhang}, Z. and {Xin}, Y. and
{Wu}, Y. and {Huang}, F.",
title = "{Fast and Accurate Neural Word Segmentation for Chinese}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170407047C",
archivePrefix= "arXiv",
eprint = "1704.07047",
keywords = "Computer Science - Computation and Language",
month = apr,
primaryClass = "cs.CL"
}
@article{陈自岩-2016-一种非监督的事件触发词检测和分类方法,
title = "一种非监督的事件触发词检测和分类方法",
author = "陈自岩 and 黄宇 and 王洋 and 傅兴玉 and 付琨",
journal = "国外电子测量技术",
number = 7,
pages = "91-95",
year = 2016
}
@inproceedings{li-2014-increm,
title = "Incremental joint extraction of entity mentions and
relations",
author = "Li, Qi and Ji, Heng",
booktitle = "Proceedings of the 52nd Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "402-412",
year = 2014
}
@inproceedings{liu-2016-lever,
title = "Leveraging framenet to improve automatic event detection",
author = "Liu, Shulin and Chen, Yubo and He, Shizhu and Liu, Kang and
Zhao, Jun",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "2134-2143",
year = 2016
}
@article{ji-2008-refin,
title = "Refining event extraction through cross-document inference",
author = "Ji, Heng and Grishman, Ralph",
journal = "Proceedings of ACL-08: HLT",
pages = "254-262",
year = 2008
}
@inproceedings{chen-2009-language-specific,
title = "Language specific issue and feature exploration in Chinese
event extraction",
author = "Chen, Zheng and Ji, Heng",
booktitle = "Proceedings of Human Language Technologies: The 2009 Annual
Conference of the North American Chapter of the Association
for Computational Linguistics, Companion Volume: Short
Papers",
pages = "209-212",
year = 2009,
organization = "Association for Computational Linguistics"
}
@inproceedings{li-2012-employ-compositional,
title = "Employing compositional semantics and discourse consistency
in Chinese event extraction",
author = "Li, Peifeng and Zhou, Guodong and Zhu, Qiaoming and Hou,
Libin",
booktitle = "Proceedings of the 2012 Joint Conference on Empirical Methods
in Natural Language Processing and Computational Natural
Language Learning",
pages = "1006-1016",
year = 2012,
organization = "Association for Computational Linguistics"
}
@inproceedings{liao-2010-using,
title = "Using document level cross-event inference to improve event
extraction",
author = "Liao, Shasha and Grishman, Ralph",
booktitle = "Proceedings of the 48th Annual Meeting of the Association for
Computational Linguistics",
pages = "789-797",
year = 2010,
organization = "Association for Computational Linguistics"
}
@inproceedings{hong-2011-using,
title = "Using cross-entity inference to improve event extraction",
author = "Hong, Yu and Zhang, Jianfeng and Ma, Bin and Yao, Jianmin and
Zhou, Guodong and Zhu, Qiaoming",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies-Volume
1",
pages = "1127-1136",
year = 2011,
organization = "Association for Computational Linguistics"
}
@inproceedings{liu-2016-probab-soft,
author = "Liu, Shulin and Liu, Kang and He, Shizhu and Zhao, Jun",
title = "A Probabilistic Soft Logic Based Approach to Exploiting
Latent and Global Information in Event Classification",
booktitle = "Proceedings of the Thirtieth AAAI Conference on Artificial
Intelligence",
series = "AAAI'16",
year = 2016,
location = "Phoenix, Arizona",
pages = "2993-2999",
numpages = 7,
url = "http://dl.acm.org/citation.cfm?id=3016100.3016321",
acmid = 3016321,
publisher = "AAAI Press"
}
@article{kim-2000-subject-object,
title = "Subject/object drop in the acquisition of Korean: A
cross-linguistic comparison",
author = "Kim, Young-Joo",
journal = "Journal of East Asian Linguistics",
volume = 9,
number = 4,
pages = "325-351",
year = 2000,
publisher = "Springer"
}
@inproceedings{tan-2008-ident-chines,
author = "Tan, Hongye and Zhao, Tiejun and Zheng, Jiaheng",
title = "Identification of Chinese Event and Their Argument Roles",
booktitle = "Proceedings of the 2008 IEEE 8th International Conference on
Computer and Information Technology Workshops",
series = "CITWORKSHOPS '08",
year = 2008,
isbn = "978-0-7695-3242-4",
pages = "14-19",
numpages = 6,
url = "http://dx.doi.org/10.1109/CIT.2008.Workshops.54",
doi = "10.1109/CIT.2008.Workshops.54",
acmid = 1381056,
publisher = "IEEE Computer Society",
address = "Washington, DC, USA"
}
@inproceedings{fader-2013-parap,
title = "Paraphrase-driven learning for open question answering",
author = "Fader, Anthony and Zettlemoyer, Luke and Etzioni, Oren",
booktitle = "Proceedings of the 51st Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "1608-1618",
year = 2013
}
@article{陈箫箫-2016-微博中的开放域事件抽取,
title = "微博中的开放域事件抽取",
author = "陈箫箫 and 刘波",
journal = "计算机应用与软件",
volume = 33,
number = 8,
pages = "18-22",
year = 2016
}
@article{李江龙-2017-金融领域的事件句抽取,
title = "金融领域的事件句抽取",
author = "李江龙 and 吕学强 and 周建设 and 刘秀磊",
journal = "计算机应用研究",
volume = 34,
number = 10,
pages = "2915-2918",
year = 2017
}
@article{马晨曦-2018-基于递归神经网络的中文事件检测,
title = "基于递归神经网络的中文事件检测",
author = "马晨曦 and 陈兴蜀 and 王文贤 and 王海舟",
journal = "信息网络安全",
number = 5,
pages = "75-81",
year = 2018
}
@inproceedings{mcclosky-2011-event-extract,
author = "McClosky, David and Surdeanu, Mihai and Manning, Christopher
D.",
title = "Event Extraction As Dependency Parsing",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies -
Volume 1",
series = "HLT '11",
year = 2011,
isbn = "978-1-932432-87-9",
location = "Portland, Oregon",
pages = "1626-1635",
numpages = 10,
url = "http://dl.acm.org/citation.cfm?id=2002472.2002667",
acmid = 2002667,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@inproceedings{reschke-2014-event-extrac,
  title = "Event Extraction Using Distant Supervision",
  author = "Reschke, Kevin and Jankowiak, Martin and Surdeanu, Mihai and
            Manning, Christopher D and Jurafsky, Daniel",
  booktitle = "Proceedings of the Ninth International Conference on
               Language Resources and Evaluation (LREC 2014)",
  year = 2014
}
@inproceedings{riedel-2013-relation-extraction,
title = "Relation extraction with matrix factorization and universal
schemas",
author = "Riedel, Sebastian and Yao, Limin and McCallum, Andrew and
Marlin, Benjamin M",
booktitle = "Proceedings of the 2013 Conference of the North American
Chapter of the Association for Computational Linguistics:
Human Language Technologies",
pages = "74-84",
year = 2013
}
@inproceedings{toutanova-2015-representing-text,
title = "Representing text for joint embedding of text and knowledge
bases",
author = "Toutanova, Kristina and Chen, Danqi and Pantel, Patrick and
Poon, Hoifung and Choudhury, Pallavi and Gamon, Michael",
booktitle = "Proceedings of the 2015 Conference on Empirical Methods in
Natural Language Processing",
pages = "1499-1509",
year = 2015
}
@inproceedings{tang-2005-email,
title = "Email data cleaning",
author = "Tang, Jie and Li, Hang and Cao, Yunbo and Tang, Zhaohui",
booktitle = "Proceedings of the eleventh ACM SIGKDD international
conference on Knowledge discovery in data mining",
pages = "489-498",
year = 2005,
organization = "ACM"
}
@inproceedings{smith-2007-tesser-ocr,
title = "An overview of the Tesseract OCR engine",
author = "Smith, Ray",
booktitle = "Document Analysis and Recognition, 2007. ICDAR 2007. Ninth
International Conference on",
volume = 2,
pages = "629-633",
year = 2007,
organization = "IEEE"
}
@inproceedings{smith-2009-hybrid-page,
title = "Hybrid Page Layout Analysis via Tab-Stop Detection",
author = "Ray Smith",
year = 2009,
URL = "http://www.cvc.uab.es/icdar2009/papers/3725a241.pdf",
booktitle = "Proceedings of the 10th international conference on document
analysis and recognition"
}
@inproceedings{epshtein-2010-detect-text,
title = "Detecting text in natural scenes with stroke width transform",
author = "Epshtein, Boris and Ofek, Eyal and Wexler, Yonatan",
booktitle = "Computer Vision and Pattern Recognition (CVPR), 2010 IEEE
Conference on",
pages = "2963-2970",
year = 2010,
organization = "IEEE"
}
@Article{ramakrishnan-2012-layout-pdf,
author = "Ramakrishnan, Cartic and Patnia, Abhishek and Hovy, Eduard
and Burns, Gully APC",
title = "Layout-aware text extraction from full-text PDF of scientific
articles",
journal = "Source Code for Biology and Medicine",
year = 2012,
month = "May",
day = 28,
volume = 7,
number = 1,
pages = 7,
abstract = "The Portable Document Format (PDF) is the most commonly used
file format for online scientific publications. The absence
of effective means to extract text from these PDF files in a
layout-aware manner presents a significant challenge for
developers of biomedical text mining or biocuration
informatics systems that use published literature as an
information source. In this paper we introduce the
`Layout-Aware PDF Text Extraction' (LA-PDFText) system to
facilitate accurate extraction of text from PDF files of
research articles for use in text mining applications.",
issn = "1751-0473",
doi = "10.1186/1751-0473-7-7",
url = "https://doi.org/10.1186/1751-0473-7-7"
}
@ARTICLE{niklaus-2018-survey-open-infor-extrac,
author = "{Niklaus}, C. and {Cetto}, M. and {Freitas}, A. and
{Handschuh}, S.",
title = "{A Survey on Open Information Extraction}",
journal = "ArXiv e-prints",
year = 2018,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2018arXiv180605599N",
archivePrefix= "arXiv",
eprint = "1806.05599",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL"
}
@Article{nesterov-2015-univer,
author = "Nesterov, Yu",
title = "Universal gradient methods for convex optimization problems",
journal = "Mathematical Programming",
year = 2015,
month = "Aug",
day = 01,
volume = 152,
number = 1,
pages = "381-404",
abstract = "In this paper, we present new methods for black-box convex
minimization. They do not need to know in advance the actual
level of smoothness of the objective function. Their only
essential input parameter is the required accuracy of the
solution. At the same time, for each particular problem class
they automatically ensure the best possible rate of
convergence. We confirm our theoretical results by
encouraging numerical experiments, which demonstrate that the
fast rate of convergence, typical for the smooth optimization
problems, sometimes can be achieved even on nonsmooth problem
instances.",
issn = "1436-4646",
doi = "10.1007/s10107-014-0790-0",
url = "https://doi.org/10.1007/s10107-014-0790-0"
}
@inproceedings{nothman-2018-stop-word,
title = "Stop Word Lists in Free Open-source Software Packages",
author = "Nothman, Joel and Qin, Hanmin and Yurchak, Roman",
booktitle = "Proceedings of Workshop for NLP Open Source Software
(NLP-OSS)",
pages = "7-12",
year = 2018
}
@inproceedings{shi-2009-hash,
title = "Hash kernels",
author = "Shi, Qinfeng and Petterson, James and Dror, Gideon and
Langford, John and Smola, Alex and Strehl, Alex and
Vishwanathan, Vishy",
booktitle = "Artificial intelligence and statistics",
pages = "496-503",
year = 2009
}
@article{weinberger-2009-featur-hashin,
author = "Kilian Q. Weinberger and Anirban Dasgupta and Josh Attenberg
and John Langford and Alexander J. Smola",
title = "Feature Hashing for Large Scale Multitask Learning",
journal = "CoRR",
volume = "abs/0902.2206",
year = 2009,
url = "http://arxiv.org/abs/0902.2206",
archivePrefix= "arXiv",
eprint = "0902.2206",
timestamp = "Mon, 13 Aug 2018 16:48:03 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-0902-2206",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{freksen-2018-fully-under-hashin-trick,
author = "Casper Benjamin Freksen and Lior Kamma and Kasper Green
Larsen",
title = "Fully Understanding the Hashing Trick",
journal = "CoRR",
volume = "abs/1805.08539",
year = 2018,
url = "http://arxiv.org/abs/1805.08539",
archivePrefix= "arXiv",
eprint = "1805.08539",
timestamp = "Mon, 13 Aug 2018 16:49:00 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1805-08539",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{peters-2018-deep,
author = "Matthew E. Peters and Mark Neumann and Mohit Iyyer and Matt
Gardner and Christopher Clark and Kenton Lee and Luke
Zettlemoyer",
title = "Deep contextualized word representations",
journal = "CoRR",
volume = "abs/1802.05365",
year = 2018,
url = "http://arxiv.org/abs/1802.05365",
archivePrefix= "arXiv",
eprint = "1802.05365",
timestamp = "Mon, 13 Aug 2018 16:48:54 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1802-05365",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{bengio-2003-neural-probab-languag-model,
author = "Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal
and Janvin, Christian",
title = "A Neural Probabilistic Language Model",
journal = "J. Mach. Learn. Res.",
issue_date = "3/1/2003",
volume = 3,
month = mar,
year = 2003,
issn = "1532-4435",
pages = "1137-1155",
numpages = 19,
url = "http://dl.acm.org/citation.cfm?id=944919.944966",
acmid = 944966,
publisher = "JMLR.org"
}
@article{devlin-2018-bert,
author = "Jacob Devlin and Ming{-}Wei Chang and Kenton Lee and Kristina
Toutanova",
title = "{BERT:} Pre-training of Deep Bidirectional Transformers for
Language Understanding",
journal = "CoRR",
volume = "abs/1810.04805",
year = 2018,
url = "http://arxiv.org/abs/1810.04805",
archivePrefix= "arXiv",
eprint = "1810.04805",
timestamp = "Tue, 30 Oct 2018 20:39:56 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1810-04805",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{jozefowicz-2016-explor-limit-languag-model,
author = "Rafal J{\'{o}}zefowicz and Oriol Vinyals and Mike Schuster
and Noam Shazeer and Yonghui Wu",
title = "Exploring the Limits of Language Modeling",
journal = "CoRR",
volume = "abs/1602.02410",
year = 2016,
url = "http://arxiv.org/abs/1602.02410",
archivePrefix= "arXiv",
eprint = "1602.02410",
timestamp = "Mon, 13 Aug 2018 16:48:43 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/JozefowiczVSSW16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{vania-2017-from-charac-words-between,
author = "Clara Vania and Adam Lopez",
title = "From Characters to Words to in Between: Do We Capture
Morphology?",
journal = "CoRR",
volume = "abs/1704.08352",
year = 2017,
url = "http://arxiv.org/abs/1704.08352",
archivePrefix= "arXiv",
eprint = "1704.08352",
timestamp = "Mon, 13 Aug 2018 16:46:32 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/VaniaL17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{lei-2017-train-rnns-fast-cnns,
author = "Tao Lei and Yu Zhang and Yoav Artzi",
title = "Training RNNs as Fast as CNNs",
journal = "CoRR",
volume = "abs/1709.02755",
year = 2017,
url = "http://arxiv.org/abs/1709.02755",
archivePrefix= "arXiv",
eprint = "1709.02755",
timestamp = "Mon, 13 Aug 2018 16:46:29 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1709-02755",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{tang-2018-why-self-atten,
author = "Gongbo Tang and Matthias M{\"{u}}ller and Annette Rios and
Rico Sennrich",
title = "Why Self-Attention? {A} Targeted Evaluation of Neural Machine
Translation Architectures",
journal = "CoRR",
volume = "abs/1808.08946",
year = 2018,
url = "http://arxiv.org/abs/1808.08946",
archivePrefix= "arXiv",
eprint = "1808.08946",
timestamp = "Mon, 03 Sep 2018 07:29:38 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1808-08946",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@InProceedings{domhan-2018-how-much,
author = "Domhan, Tobias",
title = "How Much Attention Do You Need? A Granular Analysis of Neural
Machine Translation Architectures",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
year = 2018,
publisher = "Association for Computational Linguistics",
pages = "1799-1808",
location = "Melbourne, Australia",
url = "http://aclweb.org/anthology/P18-1167"
}
@inproceedings{li-2015-word-embed-revis,
author = "Li, Yitan and Xu, Linli and Tian, Fei and Jiang, Liang and
Zhong, Xiaowei and Chen, Enhong",
title = "Word Embedding Revisited: A New Representation Learning and
Explicit Matrix Factorization Perspective",
booktitle = "Proceedings of the 24th International Conference on
Artificial Intelligence",
series = "IJCAI'15",
year = 2015,
isbn = "978-1-57735-738-4",
location = "Buenos Aires, Argentina",
pages = "3650-3656",
numpages = 7,
url = "http://dl.acm.org/citation.cfm?id=2832747.2832758",
acmid = 2832758,
publisher = "AAAI Press"
}
@article{evans-2000-frequen,
title = "Frequency versus probability formats in statistical word
problems",
journal = "Cognition",
volume = 77,
number = 3,
pages = "197-213",
year = 2000,
issn = "0010-0277",
doi = "https://doi.org/10.1016/S0010-0277(00)00098-6",
url =
"http://www.sciencedirect.com/science/article/pii/S0010027700000986",
author = "Jonathan St.B.T Evans and Simon J Handley and Nick Perham and
David E Over and Valerie A Thompson",
keywords = "Frequency, Probability, Statistical word problems",
abstract = "Three experiments examined people's ability to incorporate
base rate information when judging posterior
probabilities. Specifically, we tested the (Cosmides, L., \&
Tooby, J. (1996). Are humans good intuitive statisticians
after all? Rethinking some conclusions from the literature on
judgement under uncertainty. Cognition, 58, 1–73) conclusion
that people's reasoning appears to follow Bayesian principles
when they are presented with information in a frequency
format, but not when information is presented as one case
probabilities. First, we found that frequency formats were
not generally associated with better performance than
probability formats unless they were presented in a manner
which facilitated construction of a set inclusion mental
model. Second, we demonstrated that the use of frequency
information may promote biases in the weighting of
information. When participants are asked to express their
judgements in frequency rather than probability format, they
were more likely to produce the base rate as their answer,
ignoring diagnostic evidence."
}
@article{griffin-1999-frequen-probab-predic,
title = "Frequency, Probability, and Prediction: Easy Solutions to
Cognitive Illusions?",
journal = "Cognitive Psychology",
volume = 38,
number = 1,
pages = "48-78",
year = 1999,
issn = "0010-0285",
doi = "https://doi.org/10.1006/cogp.1998.0707",
url =
"http://www.sciencedirect.com/science/article/pii/S0010028598907071",
author = "Dale Griffin and Roger Buehler",
abstract = "Many errors in probabilistic judgment have been attributed to
people's inability to think in statistical terms when faced
with information about a single case. Prior theoretical
analyses and empirical results imply that the errors
associated with case-specific reasoning may be reduced when
people make frequentistic predictions about a set of
cases. In studies of three previously identified cognitive
biases, we find that frequency-based predictions are
different from—but no better than—case-specific judgments of
probability. First, in studies of the “planning fallacy,” we
compare the accuracy of aggregate frequency and case-specific
probability judgments in predictions of students' real-life
projects. When aggregate and single-case predictions are
collected from different respondents, there is little
difference between the two: Both are overly optimistic and
show little predictive validity. However, in within-subject
comparisons, the aggregate judgments are significantly more
conservative than the single-case predictions, though still
optimistically biased. Results from studies of overconfidence
in general knowledge and base rate neglect in categorical
prediction underline a general conclusion. Frequentistic
predictions made for sets of events are no more statistically
sophisticated, nor more accurate, than predictions made for
individual events using subjective probability."
}
@article{xie-2017-data-noisin,
author = "Ziang Xie and Sida I. Wang and Jiwei Li and Daniel L{\'{e}}vy
and Aiming Nie and Dan Jurafsky and Andrew Y. Ng",
title = "Data Noising as Smoothing in Neural Network Language Models",
journal = "CoRR",
volume = "abs/1703.02573",
year = 2017,
url = "http://arxiv.org/abs/1703.02573",
archivePrefix= "arXiv",
eprint = "1703.02573",
timestamp = "Mon, 13 Aug 2018 16:47:17 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/XieWLLNJN17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{do-2008-what,
title = "What is the expectation maximization algorithm?",
author = "Do, Chuong B and Batzoglou, Serafim",
journal = "Nature biotechnology",
volume = 26,
number = 8,
pages = "897-899",
year = 2008,
publisher = "Nature Publishing Group"
}
@article{wei-2019-eda,
author = "Jason W. Wei and Kai Zou",
title = "{EDA:} Easy Data Augmentation Techniques for Boosting
Performance on Text Classification Tasks",
journal = "CoRR",
volume = "abs/1901.11196",
year = 2019,
url = "http://arxiv.org/abs/1901.11196",
archivePrefix= "arXiv",
eprint = "1901.11196",
timestamp = "Mon, 04 Feb 2019 08:11:03 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1901-11196",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{sennrich-2015-improv,
title = "Improving neural machine translation models with monolingual
data",
author = "Sennrich, Rico and Haddow, Barry and Birch, Alexandra",
journal = "arXiv preprint arXiv:1511.06709",
year = 2015
}
@article{smith-2017-dont-decay,
author = "Samuel L. Smith and Pieter{-}Jan Kindermans and Quoc V. Le",
title = "Don't Decay the Learning Rate, Increase the Batch Size",
journal = "CoRR",
volume = "abs/1711.00489",
year = 2017,
url = "http://arxiv.org/abs/1711.00489",
archivePrefix= "arXiv",
eprint = "1711.00489",
timestamp = "Mon, 13 Aug 2018 16:46:33 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1711-00489",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{su-2016-differ-equat,
author = "Weijie Su and Stephen Boyd and Emmanuel J. C and {{\`e}}s",
title = "A Differential Equation for Modeling Nesterov's Accelerated
Gradient Method: Theory and Insights",
journal = "Journal of Machine Learning Research",
year = 2016,
volume = 17,
number = 153,
pages = "1-43",
url = "http://jmlr.org/papers/v17/15-084.html"
}
@article{arora-2012-mwum,
title = "The multiplicative weights update method: a meta-algorithm
and applications",
author = "Arora, Sanjeev and Hazan, Elad and Kale, Satyen",
journal = "Theory of Computing",
volume = 8,
number = 1,
pages = "121-164",
year = 2012,
publisher = "Theory of Computing Exchange"
}
@article{li-2018-deep-reinf-learn,
author = "Yuxi Li",
title = "Deep Reinforcement Learning",
journal = "CoRR",
volume = "abs/1810.06339",
year = 2018,
url = "http://arxiv.org/abs/1810.06339",
archivePrefix= "arXiv",
eprint = "1810.06339",
timestamp = "Tue, 30 Oct 2018 20:39:56 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1810-06339",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{balles-2017-follow-signs,
author = "Lukas Balles and Philipp Hennig",
title = "Follow the Signs for Robust Stochastic Optimization",
journal = "CoRR",
volume = "abs/1705.07774",
year = 2017,
url = "http://arxiv.org/abs/1705.07774",
archivePrefix= "arXiv",
eprint = "1705.07774",
timestamp = "Mon, 13 Aug 2018 16:48:00 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/BallesH17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{sutskever-2013-impor-initial,
author = "Sutskever, Ilya and Martens, James and Dahl, George and
Hinton, Geoffrey",
title = "On the Importance of Initialization and Momentum in Deep
Learning",
booktitle = "Proceedings of the 30th International Conference on
International Conference on Machine Learning - Volume 28",
series = "ICML'13",
year = 2013,
location = "Atlanta, GA, USA",
pages = "III-1139--III-1147",
url = "http://dl.acm.org/citation.cfm?id=3042817.3043064",
acmid = 3043064,
publisher = "JMLR.org"
}
@article{inoue-2018-data-augmen,
author = "Hiroshi Inoue",
title = "Data Augmentation by Pairing Samples for Images
Classification",
journal = "CoRR",
volume = "abs/1801.02929",
year = 2018,
url = "http://arxiv.org/abs/1801.02929",
archivePrefix= "arXiv",
eprint = "1801.02929",
timestamp = "Mon, 13 Aug 2018 16:46:20 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1801-02929",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{zhang-2017-mixup,
author = "Hongyi Zhang and Moustapha Ciss{\'{e}} and Yann N. Dauphin
and David Lopez{-}Paz",
title = "mixup: Beyond Empirical Risk Minimization",
journal = "CoRR",
volume = "abs/1710.09412",
year = 2017,
url = "http://arxiv.org/abs/1710.09412",
archivePrefix= "arXiv",
eprint = "1710.09412",
timestamp = "Mon, 13 Aug 2018 16:47:14 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1710-09412",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{cubuk-2018-autoaugment,
author = "Ekin Dogus Cubuk and Barret Zoph and Dandelion Man{\'{e}} and
Vijay Vasudevan and Quoc V. Le",
title = "AutoAugment: Learning Augmentation Policies from Data",
journal = "CoRR",
volume = "abs/1805.09501",
year = 2018,
url = "http://arxiv.org/abs/1805.09501",
archivePrefix= "arXiv",
eprint = "1805.09501",
timestamp = "Mon, 13 Aug 2018 16:48:44 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1805-09501",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{li-2018-under-dishar,
author = "Xiang Li and Shuo Chen and Xiaolin Hu and Jian Yang",
title = "Understanding the Disharmony between Dropout and Batch
Normalization by Variance Shift",
journal = "CoRR",
volume = "abs/1801.05134",
year = 2018,
url = "http://arxiv.org/abs/1801.05134",
archivePrefix= "arXiv",
eprint = "1801.05134",
timestamp = "Fri, 21 Dec 2018 14:34:10 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1801-05134",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{bergstra-2012-random-searc-hyper-optim,
author = "Bergstra, James and Bengio, Yoshua",
title = "Random Search for Hyper-parameter Optimization",
journal = "J. Mach. Learn. Res.",
issue_date = "January 2012",
volume = 13,
number = 1,
month = feb,
year = 2012,
issn = "1532-4435",
pages = "281-305",
numpages = 25,
url = "http://dl.acm.org/citation.cfm?id=2503308.2188395",
acmid = 2188395,
publisher = "JMLR.org",
keywords = "deep learning, global optimization, model selection, neural
networks, response surface modeling"
}
@article{masters-2018-revis-small,
author = "Dominic Masters and Carlo Luschi",
title = "Revisiting Small Batch Training for Deep Neural Networks",
journal = "CoRR",
volume = "abs/1804.07612",
year = 2018,
url = "http://arxiv.org/abs/1804.07612",
archivePrefix= "arXiv",
eprint = "1804.07612",
timestamp = "Mon, 13 Aug 2018 16:48:13 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-07612",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{tsuruoka-2009-sgd-l1,
title = "Stochastic gradient descent training for l1-regularized
log-linear models with cumulative penalty",
author = "Tsuruoka, Yoshimasa and Tsujii, Jun'ichi and Ananiadou,
Sophia",
booktitle = "Proceedings of the Joint Conference of the 47th Annual
Meeting of the ACL and the 4th International Joint Conference
on Natural Language Processing of the AFNLP: Volume 1-Volume
1",
pages = "477-485",
year = 2009,
organization = "Association for Computational Linguistics"
}
@inproceedings{wilson-2017-margin-value,
title = "The Marginal Value of Adaptive Gradient Methods in Machine
Learning",
author = "Wilson, Ashia C and Roelofs, Rebecca and Stern, Mitchell and
Srebro, Nati and Recht, Benjamin",
booktitle = "Advances in Neural Information Processing Systems 30",
editor = "I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett",
pages = "4148-4158",
year = 2017,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/7003-the-marginal-value-of-adaptive-gradient-methods-in-machine-learning.pdf"
}
@inproceedings{hoffer-2017-train-longer,
title = "Train longer, generalize better: closing the generalization
gap in large batch training of neural networks",
author = "Hoffer, Elad and Hubara, Itay and Soudry, Daniel",
booktitle = "Advances in Neural Information Processing Systems",
pages = "1731-1741",
year = 2017
}
@inproceedings{santurkar-2018-how,
title = "How does batch normalization help optimization?",
author = "Santurkar, Shibani and Tsipras, Dimitris and Ilyas, Andrew
and Madry, Aleksander",
booktitle = "Advances in Neural Information Processing Systems",
pages = "2483-2493",
year = 2018
}
@article{breiman-2001-statistical-modeling,
title = "Statistical modeling: The two cultures (with comments and a
rejoinder by the author)",
author = "Breiman, Leo and others",
journal = "Statistical science",
volume = 16,
number = 3,
pages = "199-231",
year = 2001,
publisher = "Institute of Mathematical Statistics"
}
@article{howard-2018-fine-lang,
author = "Jeremy Howard and Sebastian Ruder",
title = "Fine-tuned Language Models for Text Classification",
journal = "CoRR",
volume = "abs/1801.06146",
year = 2018,
url = "http://arxiv.org/abs/1801.06146",
archivePrefix= "arXiv",
eprint = "1801.06146",
timestamp = "Mon, 13 Aug 2018 16:46:54 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1801-06146",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{bai-2018-tcn,
author = "Shaojie Bai and J. Zico Kolter and Vladlen Koltun",
title = "An Empirical Evaluation of Generic Convolutional and
Recurrent Networks for Sequence Modeling",
journal = "CoRR",
volume = "abs/1803.01271",
year = 2018,
url = "http://arxiv.org/abs/1803.01271",
archivePrefix= "arXiv",
eprint = "1803.01271",
timestamp = "Mon, 13 Aug 2018 16:47:39 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1803-01271",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{nemirovski-2009-robust,
title = "Robust stochastic approximation approach to stochastic
programming",
author = "Nemirovski, Arkadi and Juditsky, Anatoli and Lan, Guanghui
and Shapiro, Alexander",
journal = "SIAM Journal on optimization",
volume = 19,
number = 4,
pages = "1574-1609",
year = 2009,
publisher = "SIAM"
}
@article{todorov-2016-optim,
year = 2006,
title = "Optimal control theory",
author = "Todorov, Emanuel",
journal = "Bayesian brain: probabilistic approaches to neural coding",
pages = "269-298",
publisher = "MIT Press Cambridge (Massachusetts)"
}
@article{zhao-2019-chines-word-segmen,
author = "Hai Zhao and Deng Cai and Changning Huang and Chunyu Kit",
title = "Chinese Word Segmentation: Another Decade Review
{(2007-2017)}",
journal = "CoRR",
volume = "abs/1901.06079",
year = 2019,
url = "http://arxiv.org/abs/1901.06079",
archivePrefix= "arXiv",
eprint = "1901.06079",
timestamp = "Fri, 01 Feb 2019 13:39:59 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1901-06079",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{johnson-2013-svrg,
title = "Accelerating stochastic gradient descent using predictive
variance reduction",
author = "Johnson, Rie and Zhang, Tong",
booktitle = "Advances in neural information processing systems",
pages = "315-323",
year = 2013
}
@article{defazio-2014-saga,
author = "Aaron Defazio and Francis R. Bach and Simon Lacoste{-}Julien",
title = "{SAGA:} {A} Fast Incremental Gradient Method With Support for
Non-Strongly Convex Composite Objectives",
journal = "CoRR",
volume = "abs/1407.0202",
year = 2014,
url = "http://arxiv.org/abs/1407.0202",
archivePrefix= "arXiv",
eprint = "1407.0202",
timestamp = "Mon, 13 Aug 2018 16:46:52 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/DefazioBL14",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{pinter-2017-mimic-word,
author = "Yuval Pinter and Robert Guthrie and Jacob Eisenstein",
title = "Mimicking Word Embeddings using Subword RNNs",
journal = "CoRR",
volume = "abs/1707.06961",
year = 2017,
url = "http://arxiv.org/abs/1707.06961",
archivePrefix= "arXiv",
eprint = "1707.06961",
timestamp = "Mon, 13 Aug 2018 16:46:53 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/PinterGE17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{kiperwasser-2016-simpl-accur,
title = "Simple and Accurate Dependency Parsing Using Bidirectional
LSTM Feature Representations",
author = "Kiperwasser, Eliyahu and Goldberg, Yoav",
journal = "Transactions of the Association for Computational
Linguistics",
volume = 4,
year = 2016,
url = "https://www.aclweb.org/anthology/Q16-1023",
pages = "313-327",
abstract = "We present a simple and effective scheme for dependency
parsing which is based on bidirectional-LSTMs (BiLSTMs). Each
sentence token is associated with a BiLSTM vector
representing the token in its sentential context, and feature
vectors are constructed by concatenating a few BiLSTM
vectors. The BiLSTM is trained jointly with the parser
objective, resulting in very effective feature extractors for
parsing. We demonstrate the effectiveness of the approach by
applying it to a greedy transition-based parser as well as to
a globally optimized graph-based parser. The resulting
parsers have very simple architectures, and match or surpass
the state-of-the-art accuracies on English and Chinese."
}
@article{baltescu-2014-pragm,
title = "Pragmatic neural language modelling in machine translation",
author = "Baltescu, Paul and Blunsom, Phil",
journal = "arXiv preprint arXiv:1412.7119",
year = 2014
}
@article{cooijmans-2016-recur-batch-normal,
author = "Tim Cooijmans and Nicolas Ballas and C{\'{e}}sar Laurent and
Aaron C. Courville",
title = "Recurrent Batch Normalization",
journal = "CoRR",
volume = "abs/1603.09025",
year = 2016,
url = "http://arxiv.org/abs/1603.09025",
archivePrefix= "arXiv",
eprint = "1603.09025",
timestamp = "Mon, 13 Aug 2018 16:48:30 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/CooijmansBLC16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{dehghani-2018-univer-trans,
author = "Mostafa Dehghani and Stephan Gouws and Oriol Vinyals and
Jakob Uszkoreit and Lukasz Kaiser",
title = "Universal Transformers",
journal = "CoRR",
volume = "abs/1807.03819",
year = 2018,
url = "http://arxiv.org/abs/1807.03819",
archivePrefix= "arXiv",
eprint = "1807.03819",
timestamp = "Mon, 13 Aug 2018 16:49:11 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1807-03819",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{dai-2019-trans-xl,
author = "Zihang Dai and Zhilin Yang and Yiming Yang and Jaime
G. Carbonell and Quoc V. Le and Ruslan Salakhutdinov",
title = "Transformer-XL: Attentive Language Models Beyond a
Fixed-Length Context",
journal = "CoRR",
volume = "abs/1901.02860",
year = 2019,
url = "http://arxiv.org/abs/1901.02860",
archivePrefix= "arXiv",
eprint = "1901.02860",
timestamp = "Fri, 01 Feb 2019 13:39:59 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1901-02860",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{popel-2018-train-tips-trans-model,
author = "Martin Popel and Ondrej Bojar",
title = "Training Tips for the Transformer Model",
journal = "CoRR",
volume = "abs/1804.00247",
year = 2018,
url = "http://arxiv.org/abs/1804.00247",
archivePrefix= "arXiv",
eprint = "1804.00247",
timestamp = "Mon, 13 Aug 2018 16:47:13 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-00247",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{liu-2019-linguis-knowl,
author = "Nelson F. Liu and Matt Gardner and Yonatan Belinkov and
Matthew Peters and Noah A. Smith",
title = "Linguistic Knowledge and Transferability of Contextual
Representations",
journal = "CoRR",
volume = "abs/1903.08855",
year = 2019,
url = "http://arxiv.org/abs/1903.08855",
archivePrefix= "arXiv",
eprint = "1903.08855",
timestamp = "Mon, 01 Apr 2019 14:07:37 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1903-08855",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{peters-2019-to-tune-not-tune,
author = "Matthew Peters and Sebastian Ruder and Noah A. Smith",
title = "To Tune or Not to Tune? Adapting Pretrained Representations
to Diverse Tasks",
journal = "CoRR",
volume = "abs/1903.05987",
year = 2019,
url = "http://arxiv.org/abs/1903.05987",
archivePrefix= "arXiv",
eprint = "1903.05987",
timestamp = "Sun, 31 Mar 2019 19:01:24 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1903-05987",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{li-2011-learn-to-rank,
title = "A short introduction to learning to rank",
author = "Li, Hang",
journal = "IEICE TRANSACTIONS on Information and Systems",
volume = 94,
number = 10,
pages = "1854-1862",
year = 2011,
publisher = "The Institute of Electronics, Information and Communication
Engineers"
}
@techreport{burges-2010-from-ranknet,
title = "From RankNet to LambdaRank to LambdaMART: An Overview",
author = "Burges, Christopher J. C.",
institution = "Microsoft Research",
number = "MSR-TR-2010-82",
year = 2010
}
@Article{geurts-2006-extreme,
author = "Geurts, Pierre and Ernst, Damien and Wehenkel, Louis",
title = "Extremely randomized trees",
journal = "Machine Learning",
year = 2006,
month = "Apr",
day = 01,
volume = 63,
number = 1,
pages = "3-42",
abstract = "This paper proposes a new tree-based ensemble method for
supervised classification and regression problems. It
essentially consists of randomizing strongly both attribute
and cut-point choice while splitting a tree node. In the
extreme case, it builds totally randomized trees whose
structures are independent of the output values of the
learning sample. The strength of the randomization can be
tuned to problem specifics by the appropriate choice of a
parameter. We evaluate the robustness of the default choice
of this parameter, and we also provide insight on how to
adjust it in particular situations. Besides accuracy, the
main strength of the resulting algorithm is computational
efficiency. A bias/variance analysis of the Extra-Trees
algorithm is also provided as well as a geometrical and a
kernel characterization of the models induced.",
issn = "1573-0565",
doi = "10.1007/s10994-006-6226-1",
url = "https://doi.org/10.1007/s10994-006-6226-1"
}
@ARTICLE{chase-2014-thres-class,
author = "{Chase Lipton}, Zachary and {Elkan}, Charles and
{Narayanaswamy}, Balakrishnan",
title = "{Thresholding Classifiers to Maximize F1 Score}",
journal = "arXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Information
Retrieval, Computer Science - Machine Learning",
year = 2014,
month = "Feb",
eid = "arXiv:1402.1892",
pages = "arXiv:1402.1892",
archivePrefix= "arXiv",
eprint = "1402.1892",
primaryClass = "stat.ML",
adsurl = "https://ui.adsabs.harvard.edu/abs/2014arXiv1402.1892C",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{baak-2018-phik,
author = "{Baak}, M. and {Koopman}, R. and {Snoek}, H. and {Klous}, S.",
title = "{A new correlation coefficient between categorical, ordinal
and interval variables with Pearson characteristics}",
journal = "arXiv e-prints",
keywords = "Statistics - Methodology",
year = 2018,
month = "Nov",
eid = "arXiv:1811.11440",
pages = "arXiv:1811.11440",
archivePrefix= "arXiv",
eprint = "1811.11440",
primaryClass = "stat.ME",
adsurl = "https://ui.adsabs.harvard.edu/abs/2018arXiv181111440B",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{li-2008-learn-rank,
title = "Learning to Rank Using Classification and Gradient Boosting",
author = "Ping Li",
booktitle = "NIPS 2008",
year = 2008
}
@inproceedings{li-2007-mcrank,
title = "McRank: Learning to Rank Using Multiple Classification and
Gradient Boosting",
author = "P. H. W. Li and Christopher J. C. Burges and Qiang Wu",
booktitle = "NIPS",
year = 2007
}
@inproceedings{ke-2017-lightgbm,
title = "Lightgbm: A highly efficient gradient boosting decision tree",
author = "Ke, Guolin and Meng, Qi and Finley, Thomas and Wang, Taifeng
and Chen, Wei and Ma, Weidong and Ye, Qiwei and Liu, Tie-Yan",
booktitle = "Advances in Neural Information Processing Systems",
pages = "3146-3154",
year = 2017
}
@incollection{schapire-2013-explain-adaboost,
title = "Explaining adaboost",
author = "Schapire, Robert E",
booktitle = "Empirical inference",
pages = "37-52",
year = 2013,
publisher = "Springer"
}
@inproceedings{pardoe-2010-boost-regres-trans,
author = "Pardoe, David and Stone, Peter",
title = "Boosting for Regression Transfer",
booktitle = "Proceedings of the 27th International Conference on
International Conference on Machine Learning",
series = "ICML'10",
year = 2010,
isbn = "978-1-60558-907-7",
location = "Haifa, Israel",
pages = "863-870",
numpages = 8,
url = "http://dl.acm.org/citation.cfm?id=3104322.3104432",
acmid = 3104432,
publisher = "Omnipress",
address = "USA"
}
@article{dorogush-2018-catboost,
author = "Anna Veronika Dorogush and Vasily Ershov and Andrey Gulin",
title = "CatBoost: gradient boosting with categorical features
support",
journal = "CoRR",
volume = "abs/1810.11363",
year = 2018,
url = "http://arxiv.org/abs/1810.11363",
archivePrefix= "arXiv",
eprint = "1810.11363",
timestamp = "Wed, 31 Oct 2018 14:24:29 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1810-11363",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{freund-1997-decis-theor,
title = "A Decision-Theoretic Generalization of On-Line Learning and
an Application to Boosting",
journal = "Journal of Computer and System Sciences",
volume = 55,
number = 1,
pages = "119-139",
year = 1997,
issn = "0022-0000",
doi = "https://doi.org/10.1006/jcss.1997.1504",
url =
"http://www.sciencedirect.com/science/article/pii/S002200009791504X",
author = "Yoav Freund and Robert E Schapire",
abstract = "In the first part of the paper we consider the problem of
dynamically apportioning resources among a set of options in
a worst-case on-line framework. The model we study can be
interpreted as a broad, abstract extension of the
well-studied on-line prediction model to a general
decision-theoretic setting. We show that the multiplicative
weight-update Littlestone–Warmuth rule can be adapted to this
model, yielding bounds that are slightly weaker in some
cases, but applicable to a considerably more general class of
learning problems. We show how the resulting learning
algorithm can be applied to a variety of problems, including
gambling, multiple-outcome prediction, repeated games, and
prediction of points in Rn. In the second part of the paper
we apply the multiplicative weight-update technique to derive
a new boosting algorithm. This boosting algorithm does not
require any prior knowledge about the performance of the weak
learning algorithm. We also study generalizations of the new
boosting algorithm to the problem of learning functions whose
range, rather than being binary, is an arbitrary finite set
or a bounded segment of the real line."
}
@inproceedings{niculescu-mizil-2005-predic,
title = "Predicting good probabilities with supervised learning",
author = "Niculescu-Mizil, Alexandru and Caruana, Rich",
booktitle = "Proceedings of the 22nd international conference on Machine
learning",
pages = "625-632",
year = 2005,
organization = "ACM"
}
@article{kaufman-2012-leakage,
title = "Leakage in data mining: Formulation, detection, and
avoidance",
author = "Kaufman, Shachar and Rosset, Saharon and Perlich, Claudia and
Stitelman, Ori",
journal = "ACM Transactions on Knowledge Discovery from Data (TKDD)",
volume = 6,
number = 4,
pages = 15,
year = 2012,
publisher = "ACM"
}
@article{micci-barreca-2001-target-encoding,
author = "Micci-Barreca, Daniele",
title = "A Preprocessing Scheme for High-cardinality Categorical
Attributes in Classification and Prediction Problems",
journal = "SIGKDD Explor. Newsl.",
issue_date = "July 2001",
volume = 3,
number = 1,
month = jul,
year = 2001,
issn = "1931-0145",
pages = "27-32",
numpages = 6,
url = "http://doi.acm.org/10.1145/507533.507538",
doi = "10.1145/507533.507538",
acmid = 507538,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "categorical attributes, empirical bayes, hierarchical
attributes, neural networks, predictive models"
}
@phdthesis{shi-2007-best,
title = "Best-first decision tree learning",
author = "Shi, Haijian",
year = 2007,
school = "The University of Waikato"
}
@article{fisher-1958-group-maxim-homog,
author = "Fisher, Walter D",
title = "On Grouping for Maximum Homogeneity",
journal = "Journal of the American statistical Association",
volume = 53,
number = 284,
pages = "789-798",
year = 1958,
publisher = "Taylor \\& Francis"
}
@article{friedman-2010-regul-paths,
author = "Friedman, Jerome and Hastie, Trevor and Tibshirani, Rob",
title = "Regularization Paths for Generalized Linear Models Via
Coordinate Descent",
journal = "Journal of statistical software",
volume = 33,
number = 1,
pages = 1,
year = 2010,
publisher = "NIH Public Access"
}
@inproceedings{kerber-1992-chimerge,
author = "Kerber, Randy",
title = "Chimerge: Discretization of numeric attributes",
booktitle = "Proceedings of the tenth national conference on Artificial
intelligence",
year = 1992,
pages = "123-128",
organization = "Aaai Press"
}
@article{harrell-2017-regres-model-strat,
author = "Harrell Jr, Frank E",
title = "Regression Modeling Strategies",
journal = "BIOS",
volume = 330,
year = 2017
}
@article{ribeiro-2016-lime,
author = "Marco T{\'{u}}lio Ribeiro and Sameer Singh and Carlos
Guestrin",
title = "``Why Should {I} Trust You?'': Explaining the Predictions of
Any Classifier",
journal = "CoRR",
volume = "abs/1602.04938",
year = 2016,
url = "http://arxiv.org/abs/1602.04938",
archivePrefix= "arXiv",
eprint = "1602.04938",
timestamp = "Mon, 13 Aug 2018 16:49:09 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/RibeiroSG16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{lei-2016-dist-free,
author = "{Lei}, Jing and {G'Sell}, Max and {Rinaldo}, Alessandro and
{Tibshirani}, Ryan J. and {Wasserman}, Larry",
title = "{Distribution-Free Predictive Inference For Regression}",
journal = "arXiv e-prints",
keywords = "Statistics - Methodology, Mathematics - Statistics Theory,
Statistics - Machine Learning",
year = 2016,
month = "Apr",
eid = "arXiv:1604.04173",
pages = "arXiv:1604.04173",
archivePrefix= "arXiv",
eprint = "1604.04173",
primaryClass = "stat.ME",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160404173L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@incollection{lundberg-2017-unified-approac,
author = "Lundberg, Scott M and Lee, Su-In",
booktitle = "Advances in Neural Information Processing Systems 30",
editor = "I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett",
pages = "4765-4774",
publisher = "Curran Associates, Inc.",
title = "A Unified Approach to Interpreting Model Predictions",
url =
"http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf",
year = 2017
}
@inproceedings{kohavi-1995-study-cross,
author = "Kohavi, Ron",
title = "A Study of Cross-validation and Bootstrap for Accuracy
Estimation and Model Selection",
booktitle = "Proceedings of the 14th International Joint Conference on
Artificial Intelligence - Volume 2",
series = "IJCAI'95",
year = 1995,
isbn = "1-55860-363-8",
location = "Montreal, Quebec, Canada",
pages = "1137-1143",
numpages = 7,
url = "http://dl.acm.org/citation.cfm?id=1643031.1643047",
acmid = 1643047,
publisher = "Morgan Kaufmann Publishers Inc.",
address = "San Francisco, CA, USA"
}
@inproceedings{kanter-2015-deep,
author = "Kanter, James Max and Veeramachaneni, Kalyan",
title = "Deep feature synthesis: Towards automating data science
endeavors",
booktitle = "2015 IEEE International Conference on Data Science and
Advanced Analytics (DSAA)",
year = 2015,
pages = "1-10",
organization = "IEEE"
}
@article{yuan-2006-group-lasso,
author = "Yuan, Ming and Lin, Yi",
title = "Model Selection and Estimation in Regression With Grouped
Variables",
journal = "Journal of the Royal Statistical Society: Series B
(Statistical Methodology)",
volume = 68,
number = 1,
pages = "49-67",
year = 2006,
publisher = "Wiley Online Library"
}
@article{tibshirani-2005-fused-lasso,
author = "Tibshirani, Robert and Saunders, Michael and Rosset, Saharon
and Zhu, Ji and Knight, Keith",
title = "Sparsity and Smoothness Via the Fused Lasso",
journal = "Journal of the Royal Statistical Society: Series B
(Statistical Methodology)",
volume = 67,
number = 1,
pages = "91-108",
year = 2005,
publisher = "Wiley Online Library"
}
@ARTICLE{gregorutti-2013-correl,
author = "{Gregorutti}, Baptiste and {Michel}, Bertrand and
{Saint-Pierre}, Philippe",
title = "{Correlation and variable importance in random forests}",
journal = "arXiv e-prints",
keywords = "Statistics - Methodology",
year = 2013,
month = "Oct",
eid = "arXiv:1310.5726",
pages = "arXiv:1310.5726",
archivePrefix= "arXiv",
eprint = "1310.5726",
primaryClass = "stat.ME",
adsurl = "https://ui.adsabs.harvard.edu/abs/2013arXiv1310.5726G",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{he-2008-adasyn,
author = "He, Haibo and Bai, Yang and Garcia, Edwardo A and Li, Shutao",
title = "ADASYN: Adaptive synthetic sampling approach for imbalanced
learning",
booktitle = "2008 IEEE International Joint Conference on Neural Networks
(IEEE World Congress on Computational Intelligence)",
year = 2008,
pages = "1322-1328",
organization = "IEEE"
}
@inproceedings{han-2005-border-smote,
author = "Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan",
title = "Borderline-SMOTE: a new over-sampling method in imbalanced
data sets learning",
booktitle = "International conference on intelligent computing",
year = 2005,
pages = "878-887",
organization = "Springer"
}
@inproceedings{nguyen-2009-border,
author = "Nguyen, Hien M and Cooper, Eric W and Kamei, Katsuari",
title = "Borderline over-sampling for imbalanced data classification",
booktitle = "Proceedings: Fifth International Workshop on Computational
             Intelligence \& Applications",
year = 2009,
volume = 2009,
number = 1,
pages = "24-29",
organization = "IEEE SMC Hiroshima Chapter"
}
@article{last-2017-overs-imbal,
author = "Felix Last and Georgios Douzas and Fernando
Ba{\c{c}}{\~{a}}o",
title = "Oversampling for Imbalanced Learning Based on K-Means and
{SMOTE}",
journal = "CoRR",
volume = "abs/1711.00837",
year = 2017,
url = "http://arxiv.org/abs/1711.00837",
archivePrefix= "arXiv",
eprint = "1711.00837",
timestamp = "Wed, 10 Oct 2018 15:58:34 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1711-00837",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{mani-2003-knn,
author = "Mani, Inderjeet and Zhang, I",
title = "kNN approach to unbalanced data distributions: a case study
involving information extraction",
booktitle = "Proceedings of workshop on learning from imbalanced datasets",
year = 2003,
volume = 126
}
@article{tomek-1976-two-modif-cnn,
added-at = "2007-08-22T12:37:55.000+0200",
author = "Tomek, I.",
biburl =
"https://www.bibsonomy.org/bibtex/2523c1d70243d3fe9035269af8f6f5ecd/bsmyth",
description = "AI 2001 Elizabeth McKenna Barry Smyth",
interhash = "379fe276cf4a77f8fba21a949b2d72d6",
intrahash = "523c1d70243d3fe9035269af8f6f5ecd",
journal = "{IEEE Transactions on Systems, Man, and Cybernetics}",
keywords = "imported",
pages = "679-772",
timestamp = "2007-08-22T12:37:55.000+0200",
title = "{Two Modifications of CNN}",
volume = "7(2)",
year = 1976
}
@ARTICLE{wilson-1972-asymp-proper,
author = "D. L. {Wilson}",
journal = "IEEE Transactions on Systems, Man, and Cybernetics",
title = "Asymptotic Properties of Nearest Neighbor Rules Using Edited
Data",
year = 1972,
volume = "SMC-2",
number = 3,
pages = "408-421",
keywords = "Nearest neighbor searches;Random
variables;Convergence;Character recognition;Decoding;Pattern
recognition",
doi = "10.1109/TSMC.1972.4309137",
ISSN = "0018-9472",
month = "July"
}
@article{hand-1978-exper,
title = "Experiments on the edited condensed nearest neighbor rule",
journal = "Information Sciences",
volume = 14,
number = 3,
pages = "171-180",
year = 1978,
issn = "0020-0255",
doi = "https://doi.org/10.1016/0020-0255(78)90040-3",
url =
"http://www.sciencedirect.com/science/article/pii/0020025578900403",
author = "D.J. Hand and B.G. Batchelor",
abstract = "Tomek's preprocessing scheme is discussed for editing the
training set prior to analyzing it by Hart's condensed
nearest neighbor technique. Preprocessing was performed by a
κ-nearest-neighbor pdf estimation scheme, although other
methods are suggested in this paper. The procedure was
studied experimentally and was found to achieve a significant
reduction in the storage requirements of the CNN method while
maintaining approximately the same error rate, or even
improving it."
}
@article{hart-2006-conden-neares,
author = "Hart, P.",
title = "The Condensed Nearest Neighbor Rule (Corresp.)",
journal = "IEEE Trans. Inf. Theor.",
issue_date = "May 1968",
volume = 14,
number = 3,
month = may,
year = 1968,
issn = "0018-9448",
pages = "515-516",
numpages = 2,
url = "https://doi.org/10.1109/TIT.1968.1054155",
doi = "10.1109/TIT.1968.1054155",
acmid = 2267647,
publisher = "IEEE Press",
address = "Piscataway, NJ, USA"
}
@inproceedings{kubat-1997-addres-curse,
title = "Addressing the Curse of Imbalanced Training Sets: One-Sided
Selection",
author = "Miroslav Kubat and Stan Matwin",
booktitle = "ICML",
year = 1997
}
@inproceedings{laurikkala-2001-improv,
title = "Improving identification of difficult small classes by
balancing class distribution",
author = "Laurikkala, Jorma",
booktitle = "Conference on Artificial Intelligence in Medicine in Europe",
pages = "63-66",
year = 2001,
organization = "Springer"
}
@article{smith-2014-instan-level,
author = "Smith, Michael R. and Martinez, Tony and Giraud-Carrier,
Christophe",
title = "An Instance Level Analysis of Data Complexity",
journal = "Mach. Learn.",
issue_date = "May 2014",
volume = 95,
number = 2,
month = may,
year = 2014,
issn = "0885-6125",
pages = "225-256",
numpages = 32,
url = "https://doi.org/10.1007/s10994-013-5422-z",
doi = "10.1007/s10994-013-5422-z",
acmid = 2843686,
publisher = "Kluwer Academic Publishers",
address = "Norwell, MA, USA",
keywords = "Data complexity, Dataset hardness, Instance hardness"
}
@article{batista-2004-study-behav,
author = "Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria
Carolina",
title = "A Study of the Behavior of Several Methods for Balancing
Machine Learning Training Data",
journal = "ACM SIGKDD explorations newsletter",
volume = 6,
number = 1,
pages = "20-29",
year = 2004,
publisher = "ACM"
}
@MISC{batista-2003-balan-train,
author = "Gustavo E. A. P. A. Batista and Ana L. C. Bazzan and Maria
Carolina Monard",
title = "Balancing Training Data for Automated Annotation of Keywords:
a Case Study",
year = 2003
}
@Article{andrieu-2003-introd-mcmc-machin-learn,
author = "Andrieu, Christophe and de Freitas, Nando and Doucet, Arnaud
and Jordan, Michael I.",
title = "An Introduction to MCMC for Machine Learning",
journal = "Machine Learning",
year = 2003,
month = "Jan",
day = 01,
volume = 50,
number = 1,
pages = "5-43",
abstract = "This purpose of this introductory paper is threefold. First,
it introduces the Monte Carlo method with emphasis on
probabilistic machine learning. Second, it reviews the main
building blocks of modern Markov chain Monte Carlo
simulation, thereby providing and introduction to the
remaining papers of this special issue. Lastly, it discusses
new interesting research horizons.",
issn = "1573-0565",
doi = "10.1023/A:1020281327116",
url = "https://doi.org/10.1023/A:1020281327116"
}
@article{scholkopf-2000-new-suppor-vector-algor,
title = "New Support Vector Algorithms",
author = "Sch{\\\"o}lkopf, Bernhard and Smola, Alex J and Williamson,
Robert C and Bartlett, Peter L",
journal = "Neural computation",
volume = 12,
number = 5,
pages = "1207-1245",
year = 2000,
publisher = "MIT Press"
}
@article{scholkopf-2001-estim-suppor,
title = "Estimating the Support of a High-Dimensional Distribution",
author = "Sch{\\\"o}lkopf, Bernhard and Platt, John C and Shawe-Taylor,
John and Smola, Alex J and Williamson, Robert C",
journal = "Neural computation",
volume = 13,
number = 7,
pages = "1443-1471",
year = 2001,
publisher = "MIT Press"
}
@article{lampert-2009-kernel-method-comput-vision,
author = "Lampert, Christoph H and others",
title = "Kernel Methods in Computer Vision",
journal = "Foundations and Trends{\\textregistered} in Computer Graphics
and Vision",
volume = 4,
number = 3,
pages = "193-285",
year = 2009,
publisher = "Now Publishers, Inc."
}
@article{tax-2004-suppor-vector-data-descr,
author = "Tax, David MJ and Duin, Robert PW",
title = "Support Vector Data Description",
journal = "Machine learning",
volume = 54,
number = 1,
pages = "45-66",
year = 2004,
publisher = "Springer"
}
@inproceedings{liu-2008-isolat,
title = "Isolation forest",
author = "Liu, Fei Tony and Ting, Kai Ming and Zhou, Zhi-Hua",
booktitle = "2008 Eighth IEEE International Conference on Data Mining",
pages = "413-422",
year = 2008,
organization = "IEEE"
}
@inproceedings{breunig-2000-lof,
title = "LOF: identifying density-based local outliers",
author = "Breunig, Markus M and Kriegel, Hans-Peter and Ng, Raymond T
and Sander, J{\"o}rg",
booktitle = "ACM sigmod record",
volume = 29,
number = 2,
pages = "93-104",
year = 2000,
organization = "ACM"
}
@article{goyal-2017-accur-large-minib-sgd,
author = "Priya Goyal and Piotr Doll{\'{a}}r and Ross B. Girshick and
Pieter Noordhuis and Lukasz Wesolowski and Aapo Kyrola and
Andrew Tulloch and Yangqing Jia and Kaiming He",
title = "Accurate, Large Minibatch {SGD:} Training ImageNet in 1 Hour",
journal = "CoRR",
volume = "abs/1706.02677",
year = 2017,
url = "http://arxiv.org/abs/1706.02677",
archivePrefix= "arXiv",
eprint = "1706.02677",
timestamp = "Mon, 13 Aug 2018 16:49:10 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/GoyalDGNWKTJH17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{howard-2018-univer-languag,
author = "{Howard}, Jeremy and {Ruder}, Sebastian",
title = "{Universal Language Model Fine-tuning for Text
Classification}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Machine Learning, Statistics - Machine Learning",
year = 2018,
month = "Jan",
eid = "arXiv:1801.06146",
pages = "arXiv:1801.06146",
archivePrefix= "arXiv",
eprint = "1801.06146",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2018arXiv180106146H",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{griffiths-2004-finding,
author = "Griffiths, Thomas L. and Steyvers, Mark",
title = "Finding scientific topics",
volume = 101,
number = "suppl 1",
pages = "5228-5235",
year = 2004,
doi = "10.1073/pnas.0307752101",
publisher = "National Academy of Sciences",
abstract = "A first step in identifying the content of a document is
determining which topics that document addresses. We describe
a generative model for documents, introduced by Blei, Ng, and
Jordan [Blei, D. M., Ng, A. Y. \& Jordan, M. I. (2003)
J. Machine Learn. Res. 3, 993-1022], in which each document
is generated by choosing a distribution over topics and then
choosing each word in the document from a topic selected
according to this distribution. We then present a Markov
chain Monte Carlo algorithm for inference in this model. We
use this algorithm to analyze abstracts from PNAS by using
Bayesian model selection to establish the number of
topics. We show that the extracted topics capture meaningful
structure in the data, consistent with the class designations
provided by the authors of the articles, and outline further
applications of this analysis, including identifying
{\textquotedblleft}hot topics{\textquotedblright} by
examining temporal dynamics and tagging abstracts to
illustrate semantic content.",
issn = "0027-8424",
URL = "https://www.pnas.org/content/101/suppl_1/5228",
eprint = "https://www.pnas.org/content/101/suppl_1/5228.full.pdf",
journal = "Proceedings of the National Academy of Sciences"
}
@article{cui-2016-multi-scale,
author = "Zhicheng Cui and Wenlin Chen and Yixin Chen",
title = "Multi-Scale Convolutional Neural Networks for Time Series
Classification",
journal = "CoRR",
volume = "abs/1603.06995",
year = 2016,
url = "http://arxiv.org/abs/1603.06995",
archivePrefix= "arXiv",
eprint = "1603.06995",
timestamp = "Mon, 13 Aug 2018 16:47:13 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/CuiCC16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{liu-2019-roberta,
author = "Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and
Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and
Luke Zettlemoyer and Veselin Stoyanov",
title = "RoBERTa: {A} Robustly Optimized {BERT} Pretraining Approach",
journal = "CoRR",
volume = "abs/1907.11692",
year = 2019,
url = "http://arxiv.org/abs/1907.11692",
archivePrefix= "arXiv",
eprint = "1907.11692",
timestamp = "Thu, 01 Aug 2019 08:59:33 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1907-11692",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{zhao-2012-moodlens,
author = "Zhao, Jichang and Dong, Li and Wu, Junjie and Xu, Ke",
title = "Moodlens: an emoticon-based sentiment analysis system for
chinese tweets",
booktitle = "Proceedings of the 18th ACM SIGKDD international conference
on Knowledge discovery and data mining",
year = 2012,
pages = "1528-1531",
organization = "ACM"
}
@article{hsu-2002-comparison-multi-svm,
title = "A comparison of methods for multiclass support vector
machines",
author = "Hsu, Chih-Wei and Lin, Chih-Jen",
journal = "IEEE transactions on Neural Networks",
volume = 13,
number = 2,
pages = "415-425",
year = 2002,
publisher = "IEEE"
}
@article{conneau-2018-what,
author = "Alexis Conneau and Germ{\'{a}}n Kruszewski and Guillaume
Lample and Lo{\"{\i}}c Barrault and Marco Baroni",
title = "What you can cram into a single vector: Probing sentence
embeddings for linguistic properties",
journal = "CoRR",
volume = "abs/1805.01070",
year = 2018,
url = "http://arxiv.org/abs/1805.01070",
archivePrefix= "arXiv",
eprint = "1805.01070",
timestamp = "Mon, 13 Aug 2018 16:48:39 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1805-01070",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@techreport{ester-1996-dbscan,
author = "Ester, M and Kriegel, HP and Sander, J and Xiaowei, Xu",
institution = "AAAI Press, Menlo Park, CA (United States)",
title = "A density-based algorithm for discovering clusters in large
spatial databases with noise",
year = 1996
}
@article{schubert-2017-dbscan-revisit,
author = "Schubert, Erich and Sander, J``{o}rg and Ester, Martin and
Kriegel, Hans Peter and Xu, Xiaowei",
title = "DBSCAN Revisited, Revisited: Why and How You Should (Still)
Use DBSCAN",
journal = "ACM Trans. Database Syst.",
issue_date = "August 2017",
volume = 42,
number = 3,
month = jul,
year = 2017,
issn = "0362-5915",
pages = "19:1--19:21",
articleno = 19,
numpages = 21,
url = "http://doi.acm.org/10.1145/3068335",
doi = "10.1145/3068335",
acmid = 3068335,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "DBSCAN, density-based clustering, range-search complexity"
}
@inproceedings{ng-2002-dis-vs-gen,
author = "Ng, Andrew Y and Jordan, Michael I",
title = "On discriminative vs. generative classifiers: A comparison of
logistic regression and naive bayes",
booktitle = "Advances in neural information processing systems",
year = 2002,
pages = "841-848"
}
@article{joulin-2016-fasttext-zip,
author = "Armand Joulin and Edouard Grave and Piotr Bojanowski and
Matthijs Douze and Herv{\'{e}} J{\'{e}}gou and Tomas Mikolov",
title = "FastText.zip: Compressing text classification models",
journal = "CoRR",
volume = "abs/1612.03651",
year = 2016,
url = "http://arxiv.org/abs/1612.03651",
archivePrefix= "arXiv",
eprint = "1612.03651",
timestamp = "Mon, 13 Aug 2018 16:48:53 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/JoulinGBDJM16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{weston-2014-tagspace,
title = "{\#}{T}ag{S}pace: Semantic Embeddings from Hashtags",
author = "Weston, Jason and Chopra, Sumit and Adams, Keith",
booktitle = "Proceedings of the 2014 Conference on Empirical Methods in
Natural Language Processing ({EMNLP})",
month = oct,
year = 2014,
address = "Doha, Qatar",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D14-1194",
doi = "10.3115/v1/D14-1194",
pages = "1822-1827"
}
@article{li-2015-compon-enhan,
author = "Yanran Li and Wenjie Li and Fei Sun and Sujian Li",
title = "Component-Enhanced Chinese Character Embeddings",
journal = "CoRR",
volume = "abs/1508.06669",
year = 2015,
url = "http://arxiv.org/abs/1508.06669",
archivePrefix= "arXiv",
eprint = "1508.06669",
timestamp = "Mon, 13 Aug 2018 16:47:49 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/LiLSL15",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{chen-2015-joint-learn,
author = "Chen, Xinxiong and Xu, Lei and Liu, Zhiyuan and Sun, Maosong
and Luan, Huanbo",
title = "Joint Learning of Character and Word Embeddings",
booktitle = "Proceedings of the 24th International Conference on
Artificial Intelligence",
series = "IJCAI'15",
year = 2015,
isbn = "978-1-57735-738-4",
location = "Buenos Aires, Argentina",
pages = "1236-1242",
numpages = 7,
url = "http://dl.acm.org/citation.cfm?id=2832415.2832421",
acmid = 2832421,
publisher = "AAAI Press"
}
@article{kudo-2018-subword-regularization,
author = "Taku Kudo",
title = "Subword Regularization: Improving Neural Network Translation
Models with Multiple Subword Candidates",
journal = "CoRR",
volume = "abs/1804.10959",
year = 2018,
url = "http://arxiv.org/abs/1804.10959",
archivePrefix= "arXiv",
eprint = "1804.10959",
timestamp = "Mon, 13 Aug 2018 16:48:57 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-10959",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{wang-2016-atae-lstm,
author = "Wang, Yequan and Huang, Minlie and Zhao, Li and others",
title = "Attention-based LSTM for aspect-level sentiment
classification",
booktitle = "Proceedings of the 2016 conference on empirical methods in
natural language processing",
year = 2016,
pages = "606-615"
}
@article{tang-2015-td-lstm,
author = "Duyu Tang and Bing Qin and Xiaocheng Feng and Ting Liu",
title = "Target-Dependent Sentiment Classification with Long Short
Term Memory",
journal = "CoRR",
volume = "abs/1512.01100",
year = 2015,
url = "http://arxiv.org/abs/1512.01100",
archivePrefix= "arXiv",
eprint = "1512.01100",
timestamp = "Mon, 13 Aug 2018 16:46:55 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/TangQFL15",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{pang-2018-learn-repres,
author = "Guansong Pang and Longbing Cao and Ling Chen and Huan Liu",
title = "Learning Representations of Ultrahigh-dimensional Data for
Random Distance-based Outlier Detection",
journal = "CoRR",
volume = "abs/1806.04808",
year = 2018,
url = "http://arxiv.org/abs/1806.04808",
archivePrefix= "arXiv",
eprint = "1806.04808",
timestamp = "Mon, 13 Aug 2018 16:46:25 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1806-04808",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{liu-2003-build,
title = "Building text classifiers using positive and unlabeled
examples",
author = "Bing Liu and Yang Dai and Xiaoli Li and Wee Sun Lee and
Philip S. Yu",
journal = "Third IEEE International Conference on Data Mining",
year = 2003,
pages = "179-186"
}
@InProceedings{li-2005-pu-learning,
author = "Li, Xiao-Li and Liu, Bing",
editor = "Gama, Jo{\~a}o and Camacho, Rui and Brazdil, Pavel B. and
Jorge, Al{\'i}pio M{\'a}rio and Torgo, Lu{\'i}s",
title = "Learning from Positive and Unlabeled Examples with Different
Data Distributions",
booktitle = "Machine Learning: ECML 2005",
year = 2005,
publisher = "Springer Berlin Heidelberg",
address = "Berlin, Heidelberg",
pages = "218-229",
abstract = "We study the problem of learning from positive and unlabeled
examples. Although several techniques exist for dealing with
this problem, they all assume that positive examples in the
positive set P and the positive examples in the unlabeled set
U are generated from the same distribution. This assumption
may be violated in practice. For example, one wants to
collect all printer pages from the Web. One can use the
printer pages from one site as the set P of positive pages
and use product pages from another site as U. One wants to
classify the pages in U into printer pages and non-printer
pages. Although printer pages from the two sites have many
similarities, they can also be quite different because
different sites often present similar products in different
styles and have different focuses. In such cases, existing
methods perform poorly. This paper proposes a novel technique
A-EM to deal with the problem. Experiment results with
product page classification demonstrate the effectiveness of
the proposed technique.",
isbn = "978-3-540-31692-3"
}
@inproceedings{liu-2002-partial-super,
author = "Liu, Bing and Lee, Wee Sun and Yu, Philip S. and Li, Xiaoli",
title = "Partially Supervised Classification of Text Documents",
booktitle = "Proceedings of the Nineteenth International Conference on
Machine Learning",
series = "ICML '02",
year = 2002,
isbn = "1-55860-873-7",
pages = "387-394",
numpages = 8,
url = "http://dl.acm.org/citation.cfm?id=645531.656022",
acmid = 656022,
publisher = "Morgan Kaufmann Publishers Inc.",
address = "San Francisco, CA, USA"
}
@inproceedings{wilson-2005-recog-contex,
author = "Wilson, Theresa and Wiebe, Janyce and Hoffmann, Paul",
title = "Recognizing Contextual Polarity in Phrase-level Sentiment
Analysis",
booktitle = "Proceedings of the Conference on Human Language Technology
and Empirical Methods in Natural Language Processing",
series = "HLT '05",
year = 2005,
location = "Vancouver, British Columbia, Canada",
pages = "347-354",
numpages = 8,
url = "https://doi.org/10.3115/1220575.1220619",
doi = "10.3115/1220575.1220619",
acmid = 1220619,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@incollection{liu-2010-sentim,
author = "Bing Liu",
title = "Sentiment analysis and subjectivity",
booktitle = "Handbook of Natural Language Processing, Second
Edition. Taylor and Francis Group, Boca",
year = 2010
}
@Inbook{liu-2012-survey-opinion,
author = "Liu, Bing and Zhang, Lei",
title = "A Survey of Opinion Mining and Sentiment Analysis",
bookTitle = "Mining Text Data",
year = 2012,
publisher = "Springer US",
address = "Boston, MA",
pages = "415-463",
chapter = 1,
abstract = "Sentiment analysis or opinion mining is the computational
study of people's opinions, appraisals, attitudes, and
emotions toward entities, individuals, issues, events, topics
and their attributes. The task is technically challenging and
practically very useful. For example, businesses always want
to find public or consumer opinions about their products and
services. Potential customers also want to know the opinions
of existing users before they use a service or purchase a
product.",
isbn = "978-1-4614-3223-4",
doi = "10.1007/978-1-4614-3223-4_13",
url = "https://doi.org/10.1007/978-1-4614-3223-4_13"
}
@InProceedings{conneau-2018-xnli,
author = "Conneau, Alexis and Rinott, Ruty and Lample, Guillaume and
Williams, Adina and Bowman, Samuel R. and Schwenk, Holger
and Stoyanov, Veselin",
title = "XNLI: Evaluating Cross-lingual Sentence Representations",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
year = 2018,
publisher = "Association for Computational Linguistics",
location = "Brussels, Belgium"
}
@article{lample-2019-xlms,
author = "Guillaume Lample and Alexis Conneau",
title = "Cross-lingual Language Model Pretraining",
journal = "CoRR",
volume = "abs/1901.07291",
year = 2019,
url = "http://arxiv.org/abs/1901.07291",
archivePrefix= "arXiv",
eprint = "1901.07291",
timestamp = "Fri, 01 Feb 2019 13:39:59 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1901-07291",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{schuster-2012-japan-korean,
title = "Japanese and korean voice search",
author = "Schuster, Mike and Nakajima, Kaisuke",
booktitle = "2012 IEEE International Conference on Acoustics, Speech and
Signal Processing (ICASSP)",
pages = "5149-5152",
year = 2012,
organization = "IEEE"
}
@article{shaw-2018-self-atten,
author = "Peter Shaw and Jakob Uszkoreit and Ashish Vaswani",
title = "Self-Attention with Relative Position Representations",
journal = "CoRR",
volume = "abs/1803.02155",
year = 2018,
url = "http://arxiv.org/abs/1803.02155",
archivePrefix= "arXiv",
eprint = "1803.02155",
timestamp = "Mon, 13 Aug 2018 16:46:37 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1803-02155",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{al-rfou-2018-charac-level,
author = "Rami Al{-}Rfou and Dokook Choe and Noah Constant and Mandy
Guo and Llion Jones",
title = "Character-Level Language Modeling with Deeper Self-Attention",
journal = "CoRR",
volume = "abs/1808.04444",
year = 2018,
url = "http://arxiv.org/abs/1808.04444",
archivePrefix= "arXiv",
eprint = "1808.04444",
timestamp = "Sun, 02 Sep 2018 15:01:55 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1808-04444",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{fan-2017-matchzoo,
author = "Yixing Fan and Liang Pang and Jianpeng Hou and Jiafeng Guo
and Yanyan Lan and Xueqi Cheng",
title = "MatchZoo: {A} Toolkit for Deep Text Matching",
journal = "CoRR",
volume = "abs/1707.07270",
year = 2017,
url = "http://arxiv.org/abs/1707.07270",
archivePrefix= "arXiv",
eprint = "1707.07270",
timestamp = "Mon, 13 Aug 2018 16:48:14 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/FanPHGLC17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{wang-2018-ripplenet,
author = "Hongwei Wang and Fuzheng Zhang and Jialin Wang and Miao Zhao
and Wenjie Li and Xing Xie and Minyi Guo",
title = "Ripple Network: Propagating User Preferences on the Knowledge
Graph for Recommender Systems",
journal = "CoRR",
volume = "abs/1803.03467",
year = 2018,
url = "http://arxiv.org/abs/1803.03467",
archivePrefix= "arXiv",
eprint = "1803.03467",
timestamp = "Mon, 13 Aug 2018 16:48:19 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1803-03467",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{song-2019-mass,
author = "Kaitao Song and Xu Tan and Tao Qin and Jianfeng Lu and
Tie{-}Yan Liu",
title = "{MASS:} Masked Sequence to Sequence Pre-training for Language
Generation",
journal = "CoRR",
volume = "abs/1905.02450",
year = 2019,
url = "http://arxiv.org/abs/1905.02450",
archivePrefix= "arXiv",
eprint = "1905.02450",
timestamp = "Mon, 27 May 2019 13:15:00 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1905-02450",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{huang-2013-dssm,
title = "Learning deep structured semantic models for web search using
clickthrough data",
author = "Huang, Po-Sen and He, Xiaodong and Gao, Jianfeng and Deng, Li
and Acero, Alex and Heck, Larry",
booktitle = "Proceedings of the 22nd ACM international conference on
Information \& Knowledge Management",
pages = "2333-2338",
year = 2013,
organization = "ACM"
}
@InProceedings{shen-2014-cnn-dssm,
author = "Shen, Yelong and He, Xiaodong and Gao, Jianfeng and Deng, Li
and Mesnil, Gregoire",
title = "A Latent Semantic Model with Convolutional-Pooling Structure
for Information Retrieval",
booktitle = "CIKM",
year = 2014,
month = "November",
abstract = "In this paper, we propose a new latent semantic model that
incorporates a convolutional-pooling structure over word
sequences to learn low-dimensional, semantic vector
representations for search queries and Web documents. In
order to capture the rich contextual structures in a query or
a document, we start with each word within a temporal context
window in a word sequence to directly capture contextual
features at the word n-gram level. Next, the salient word
n-gram features in the word sequence are discovered by the
model and are then aggregated to form a sentence-level
feature vector. Finally, a non-linear transformation is
applied to extract high-level semantic information to
generate a continuous vector representation for the full text
string. The proposed convolutional latent semantic model
(CLSM) is trained on clickthrough data and is evaluated on a
Web document ranking task using a large-scale, real-world
data set. Results show that the proposed model effectively
captures salient semantic information in queries and
documents for the task while significantly outperforming
previous state-of-the-art semantic models.",
url =
"https://www.microsoft.com/en-us/research/publication/a-latent-semantic-model-with-convolutional-pooling-structure-for-information-retrieval/"
}
@article{palangi-2014-lstm-dssm,
title = "Semantic modelling with long-short-term memory for
information retrieval",
author = "Palangi, Hamid and Deng, Li and Shen, Yelong and Gao,
Jianfeng and He, Xiaodong and Chen, Jianshu and Song, Xinying
and Ward, R",
journal = "arXiv preprint arXiv:1412.6629",
year = 2014
}
@inproceedings{elkahky-2015-mv-dssm,
title = "A multi-view deep learning approach for cross domain user
modeling in recommendation systems",
author = "Elkahky, Ali Mamdouh and Song, Yang and He, Xiaodong",
booktitle = "Proceedings of the 24th International Conference on World
Wide Web",
pages = "278-288",
year = 2015,
organization = "International World Wide Web Conferences Steering Committee"
}
@inproceedings{qiu-2015-cntn,
title = "Convolutional neural tensor network architecture for
community-based question answering",
author = "Qiu, Xipeng and Huang, Xuanjing",
booktitle = "Twenty-Fourth International Joint Conference on Artificial
Intelligence",
year = 2015
}
@article{庞亮-2017-深度文本匹配综述,
title = "深度文本匹配综述",
author = "庞亮 and 兰艳艳 and 徐君 and 郭嘉丰 and 万圣贤 and 程学旗",
journal = "计算机学报",
volume = 40,
number = 4,
pages = "985-1003",
year = 2017
}
@INPROCEEDINGS{chopra-2005-siamese,
author = "S. {Chopra} and R. {Hadsell} and Y. {LeCun}",
booktitle = "2005 IEEE Computer Society Conference on Computer Vision and
Pattern Recognition (CVPR'05)",
title = "Learning a similarity metric discriminatively, with
application to face verification",
year = 2005,
volume = 1,
pages = "539-546 vol. 1",
keywords = "face recognition;learning (artificial
intelligence);similarity metric learning;face
verification;face recognition;L1 norm;semantic distance
approximation;discriminative loss function;geometric
distortion;Character generation;Drives;Robustness;System
testing;Spatial databases;Glass;Artificial neural
networks;Support vector machines;Support vector machine
classification;Face recognition",
doi = "10.1109/CVPR.2005.202",
month = "June"
}
@inproceedings{zhai-2016-deepintent,
author = "Zhai, Shuangfei and Chang, Keng-hao and Zhang, Ruofei and
Zhang, Zhongfei Mark",
title = "Deepintent: Learning attentions for online advertising with
recurrent neural networks",
booktitle = "Proceedings of the 22nd ACM SIGKDD international conference
on knowledge discovery and data mining",
year = 2016,
pages = "1295-1304",
organization = "ACM"
}
@inproceedings{mitra-2017-learn-to-match,
author = "Mitra, Bhaskar and Diaz, Fernando and Craswell, Nick",
title = "Learning to match using local and distributed representations
of text for web search",
booktitle = "Proceedings of the 26th International Conference on World
Wide Web",
year = 2017,
pages = "1291-1299",
organization = "International World Wide Web Conferences Steering Committee"
}
@inproceedings{tan-2016-improve,
title = "Improved representation learning for question answer
matching",
author = "Tan, Ming and Dos Santos, Cicero and Xiang, Bing and Zhou,
Bowen",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
pages = "464-473",
year = 2016
}
@incollection{hu-2014-arc-i,
title = "Convolutional Neural Network Architectures for Matching
Natural Language Sentences",
author = "Hu, Baotian and Lu, Zhengdong and Li, Hang and Chen, Qingcai",
booktitle = "Advances in Neural Information Processing Systems 27",
editor = "Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence
and K. Q. Weinberger",
pages = "2042-2050",
year = 2014,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/5550-convolutional-neural-network-architectures-for-matching-natural-language-sentences.pdf"
}
@inproceedings{yin-2015-multigrancnn,
title = "Multigrancnn: An architecture for general matching of text
chunks on multiple levels of granularity",
author = "Yin, Wenpeng and Sch{\"u}tze, Hinrich",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics and the 7th International Joint
Conference on Natural Language Processing (Volume 1: Long
Papers)",
pages = "63-73",
year = 2015
}
@article{pang-2016-matchpyramid,
author = "Liang Pang and Yanyan Lan and Jiafeng Guo and Jun Xu and
Shengxian Wan and Xueqi Cheng",
title = "Text Matching as Image Recognition",
journal = "CoRR",
volume = "abs/1602.06359",
year = 2016,
url = "http://arxiv.org/abs/1602.06359",
archivePrefix= "arXiv",
eprint = "1602.06359",
timestamp = "Mon, 13 Aug 2018 16:47:25 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/PangLGXWC16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@incollection{lu-2013-deepmatch,
title = "A Deep Architecture for Matching Short Texts",
author = "Lu, Zhengdong and Li, Hang",
booktitle = "Advances in Neural Information Processing Systems 26",
editor = "C. J. C. Burges and L. Bottou and M. Welling and
Z. Ghahramani and K. Q. Weinberger",
pages = "1367-1375",
year = 2013,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/5019-a-deep-architecture-for-matching-short-texts.pdf"
}
@inproceedings{zhang-2017-aicnn,
title = "Attentive interactive neural networks for answer selection in
community question answering",
author = "Zhang, Xiaodong and Li, Sujian and Sha, Lei and Wang,
Houfeng",
booktitle = "Thirty-First AAAI Conference on Artificial Intelligence",
year = 2017
}
@inproceedings{sha-2018-mvfnn,
title = "A multi-view fusion neural network for answer selection",
author = "Sha, Lei and Zhang, Xiaodong and Qian, Feng and Chang, Baobao
and Sui, Zhifang",
booktitle = "Thirty-Second AAAI Conference on Artificial Intelligence",
year = 2018
}
@inproceedings{zhang-2018-dqi,
title = "Duplicate question identification by integrating framenet
with neural networks",
author = "Zhang, Xiaodong and Sun, Xu and Wang, Houfeng",
booktitle = "Thirty-Second AAAI Conference on Artificial Intelligence",
year = 2018
}
@article{wan-2016-match-srnn,
author = "Shengxian Wan and Yanyan Lan and Jun Xu and Jiafeng Guo and
Liang Pang and Xueqi Cheng",
title = "Match-SRNN: Modeling the Recursive Matching Structure with
Spatial {RNN}",
journal = "CoRR",
volume = "abs/1604.04378",
year = 2016,
url = "http://arxiv.org/abs/1604.04378",
archivePrefix= "arXiv",
eprint = "1604.04378",
timestamp = "Mon, 13 Aug 2018 16:47:12 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/WanLXGPC16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{tan-2015-qa-lstm,
author = "Ming Tan and Bing Xiang and Bowen Zhou",
title = "LSTM-based Deep Learning Models for non-factoid answer
selection",
journal = "CoRR",
volume = "abs/1511.04108",
year = 2015,
url = "http://arxiv.org/abs/1511.04108",
archivePrefix= "arXiv",
eprint = "1511.04108",
timestamp = "Mon, 13 Aug 2018 16:46:33 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/TanXZ15",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{xiong-2017-k-nrm,
author = "Chenyan Xiong and Zhuyun Dai and Jamie Callan and Zhiyuan Liu
and Russell Power",
title = "End-to-End Neural Ad-hoc Ranking with Kernel Pooling",
journal = "CoRR",
volume = "abs/1706.06613",
year = 2017,
url = "http://arxiv.org/abs/1706.06613",
archivePrefix= "arXiv",
eprint = "1706.06613",
timestamp = "Mon, 13 Aug 2018 16:49:10 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/XiongDCLP17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{chen-2018-mix,
title = "Mix: Multi-channel information crossing for text matching",
author = "Chen, Haolan and Han, Fred X and Niu, Di and Liu, Dong and
Lai, Kunfeng and Wu, Chenglin and Xu, Yu",
booktitle = "Proceedings of the 24th ACM SIGKDD International Conference
on Knowledge Discovery \& Data Mining",
pages = "110-119",
year = 2018,
organization = "ACM"
}
@inproceedings{zhang-2003-quest-class,
author = "Zhang, Dell and Lee, Wee Sun",
title = "Question Classification Using Support Vector Machines",
booktitle = "Proceedings of the 26th Annual International ACM SIGIR
Conference on Research and Development in Information
Retrieval",
series = "SIGIR '03",
year = 2003,
isbn = "1-58113-646-3",
location = "Toronto, Canada",
pages = "26-32",
numpages = 7,
url = "http://doi.acm.org/10.1145/860435.860443",
doi = "10.1145/860435.860443",
acmid = 860443,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "kernel method, machine learning, question answering, support
vector machine, text classification"
}
@inproceedings{li-2002-learn-quest-class,
author = "Li, Xin and Roth, Dan",
title = "Learning Question Classifiers",
booktitle = "Proceedings of the 19th International Conference on
Computational Linguistics - Volume 1",
series = "COLING '02",
year = 2002,
location = "Taipei, Taiwan",
pages = "1-7",
numpages = 7,
url = "https://doi.org/10.3115/1072228.1072378",
doi = "10.3115/1072228.1072378",
acmid = 1072378,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@inproceedings{cui-2004-unsup,
title = "Unsupervised learning of soft patterns for generating
definitions from online news",
author = "Cui, Hang and Kan, Min-Yen and Chua, Tat-Seng",
booktitle = "Proceedings of the 13th international conference on World
Wide Web",
pages = "90-99",
year = 2004,
organization = "ACM"
}
@inproceedings{unger-2012-template-based,
title = "Template-based question answering over RDF data",
author = "Unger, Christina and B{\"u}hmann, Lorenz and Lehmann, Jens
and Ngonga Ngomo, Axel-Cyrille and Gerber, Daniel and
Cimiano, Philipp",
booktitle = "Proceedings of the 21st international conference on World
Wide Web",
pages = "639-648",
year = 2012,
organization = "ACM"
}
@inproceedings{abujabal-2017-autom-templ,
author = "Abujabal, Abdalghani and Yahya, Mohamed and Riedewald, Mirek
and Weikum, Gerhard",
title = "Automated Template Generation for Question Answering over
Knowledge Graphs",
booktitle = "Proceedings of the 26th International Conference on World
Wide Web",
series = "WWW '17",
year = 2017,
isbn = "978-1-4503-4913-0",
location = "Perth, Australia",
pages = "1191-1200",
numpages = 10,
url = "https://doi.org/10.1145/3038912.3052583",
doi = "10.1145/3038912.3052583",
acmid = 3052583,
publisher = "International World Wide Web Conferences Steering Committee",
address = "Republic and Canton of Geneva, Switzerland",
keywords = "knowledge graphs, question answering, semantic parsing"
}
@inproceedings{riedel-2010-model,
title = "Modeling relations and their mentions without labeled text",
author = "Riedel, Sebastian and Yao, Limin and McCallum, Andrew",
booktitle = "Joint European Conference on Machine Learning and Knowledge
Discovery in Databases",
pages = "148-163",
year = 2010,
organization = "Springer"
}
@inproceedings{liu-2017-soft-label,
title = "A soft-label method for noise-tolerant distantly supervised
relation extraction",
author = "Liu, Tianyu and Wang, Kexiang and Chang, Baobao and Sui,
Zhifang",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
pages = "1790-1795",
year = 2017
}
@article{feng-2018-reinf-learn,
author = "Jun Feng and Minlie Huang and Li Zhao and Yang Yang and
Xiaoyan Zhu",
title = "Reinforcement Learning for Relation Classification from Noisy
Data",
journal = "CoRR",
volume = "abs/1808.08013",
year = 2018,
url = "http://arxiv.org/abs/1808.08013",
archivePrefix= "arXiv",
eprint = "1808.08013",
timestamp = "Tue, 03 Sep 2019 20:11:19 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1808-08013",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@phdthesis{zhang-2015-deepdive,
title = "DeepDive: A Data Management System for Automatic Knowledge
Base Construction",
author = "Zhang, Ce",
year = 2015,
school = "UW-Madison"
}
@inproceedings{yao-2014-infor-extrac-struc-data,
title = "Information Extraction over Structured Data: Question
Answering with {F}reebase",
author = "Yao, Xuchen and Van Durme, Benjamin",
booktitle = "Proceedings of the 52nd Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jun,
year = 2014,
address = "Baltimore, Maryland",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P14-1090",
doi = "10.3115/v1/P14-1090",
pages = "956-966"
}
@article{bordes-2014-quest-answer-subgr-embed,
author = "Antoine Bordes and Sumit Chopra and Jason Weston",
title = "Question Answering with Subgraph Embeddings",
journal = "CoRR",
volume = "abs/1406.3676",
year = 2014,
url = "http://arxiv.org/abs/1406.3676",
archivePrefix= "arXiv",
eprint = "1406.3676",
timestamp = "Mon, 13 Aug 2018 16:46:20 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/BordesCW14",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{dong-2015-quest,
title = "Question answering over freebase with multi-column
convolutional neural networks",
author = "Dong, Li and Wei, Furu and Zhou, Ming and Xu, Ke",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics and the 7th International Joint
Conference on Natural Language Processing (Volume 1: Long
Papers)",
pages = "260-269",
year = 2015
}
@inproceedings{yih-2015-query-graph,
title = "Semantic Parsing via Staged Query Graph Generation: Question
Answering with Knowledge Base",
author = "Yih, Wen-tau and Chang, Ming-Wei and He, Xiaodong and Gao,
Jianfeng",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics and the 7th International Joint
Conference on Natural Language Processing (Volume 1: Long
Papers)",
pages = "1321-1331",
year = 2015
}
@article{chen-2017-drqa,
author = "Danqi Chen and Adam Fisch and Jason Weston and Antoine
Bordes",
title = "Reading Wikipedia to Answer Open-Domain Questions",
journal = "CoRR",
volume = "abs/1704.00051",
year = 2017,
url = "http://arxiv.org/abs/1704.00051",
archivePrefix= "arXiv",
eprint = "1704.00051",
timestamp = "Mon, 13 Aug 2018 16:47:17 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ChenFWB17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{miwa-2014-model,
title = "Modeling joint entity and relation extraction with table
representation",
author = "Miwa, Makoto and Sasaki, Yutaka",
booktitle = "Proceedings of the 2014 Conference on Empirical Methods in
Natural Language Processing (EMNLP)",
pages = "1858-1869",
year = 2014
}
@article{zheng-2017-joint-extrac,
author = "Suncong Zheng and Feng Wang and Hongyun Bao and Yuexing Hao
and Peng Zhou and Bo Xu",
title = "Joint Extraction of Entities and Relations Based on a Novel
Tagging Scheme",
journal = "CoRR",
volume = "abs/1706.05075",
year = 2017,
url = "http://arxiv.org/abs/1706.05075",
archivePrefix= "arXiv",
eprint = "1706.05075",
timestamp = "Tue, 25 Jun 2019 17:27:14 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ZhengWBHZX17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{luo-2017-learn-noise,
author = "Bingfeng Luo and Yansong Feng and Zheng Wang and Zhanxing Zhu
and Songfang Huang and Rui Yan and Dongyan Zhao",
title = "Learning with Noise: Enhance Distantly Supervised Relation
Extraction with Dynamic Transition Matrix",
journal = "CoRR",
volume = "abs/1705.03995",
year = 2017,
url = "http://arxiv.org/abs/1705.03995",
archivePrefix= "arXiv",
eprint = "1705.03995",
timestamp = "Sat, 31 Aug 2019 16:23:05 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/LuoFWZHYZ17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{feng-2017-effec-deep,
author = "Xiaocheng Feng and Jiang Guo and Bing Qin and Ting Liu and
Yongjie Liu",
title = "Effective Deep Memory Networks for Distant Supervised
Relation Extraction",
booktitle = "Proceedings of the Twenty-Sixth International Joint
Conference on Artificial Intelligence, {IJCAI-17}",
pages = "4002-4008",
year = 2017,
doi = "10.24963/ijcai.2017/559",
url = "https://doi.org/10.24963/ijcai.2017/559"
}
@inproceedings{bordes-2013-transe,
title = "Translating embeddings for modeling multi-relational data",
author = "Bordes, Antoine and Usunier, Nicolas and Garcia-Duran,
Alberto and Weston, Jason and Yakhnenko, Oksana",
booktitle = "Advances in neural information processing systems",
pages = "2787-2795",
year = 2013
}
@inproceedings{wang-2014-transh,
title = "Knowledge graph embedding by translating on hyperplanes",
author = "Wang, Zhen and Zhang, Jianwen and Feng, Jianlin and Chen,
Zheng",
booktitle = "Twenty-Eighth AAAI conference on artificial intelligence",
year = 2014
}
@inproceedings{lin-2015-transr,
title = "Learning entity and relation embeddings for knowledge graph
completion",
author = "Lin, Yankai and Liu, Zhiyuan and Sun, Maosong and Liu, Yang
and Zhu, Xuan",
booktitle = "Proceedings of the Twenty-Ninth AAAI Conference on Artificial
Intelligence",
pages = "2181-2187",
year = 2015,
organization = "AAAI Press"
}
@inproceedings{ji-2015-transd,
title = "Knowledge graph embedding via dynamic mapping matrix",
author = "Ji, Guoliang and He, Shizhu and Xu, Liheng and Liu, Kang and
Zhao, Jun",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics and the 7th International Joint
Conference on Natural Language Processing (Volume 1: Long
Papers)",
pages = "687-696",
year = 2015
}
@article{xiao-2015-transa,
author = "Han Xiao and Minlie Huang and Yu Hao and Xiaoyan Zhu",
title = "TransA: An Adaptive Approach for Knowledge Graph Embedding",
journal = "CoRR",
volume = "abs/1509.05490",
year = 2015,
url = "http://arxiv.org/abs/1509.05490",
archivePrefix= "arXiv",
eprint = "1509.05490",
timestamp = "Tue, 03 Sep 2019 20:11:19 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/0005HHZ15a",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{ji-2016-transparse,
title = "Knowledge graph completion with adaptive sparse transfer
matrix",
author = "Ji, Guoliang and Liu, Kang and He, Shizhu and Zhao, Jun",
booktitle = "Thirtieth AAAI Conference on Artificial Intelligence",
year = 2016
}
@inproceedings{xiao-2016-transg,
title = "TransG: A generative model for knowledge graph embedding",
author = "Xiao, Han and Huang, Minlie and Zhu, Xiaoyan",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "2316-2325",
year = 2016
}
@inproceedings{he-2015-kg2e,
title = "Learning to represent knowledge graphs with gaussian
embedding",
author = "He, Shizhu and Liu, Kang and Ji, Guoliang and Zhao, Jun",
booktitle = "Proceedings of the 24th ACM International on Conference on
Information and Knowledge Management",
pages = "623-632",
year = 2015,
organization = "ACM"
}
@inproceedings{jia-2016-transa+,
title = "Locally adaptive translation for knowledge graph embedding",
author = "Jia, Yantao and Wang, Yuanzhuo and Lin, Hailun and Jin,
Xiaolong and Cheng, Xueqi",
booktitle = "Thirtieth AAAI conference on artificial intelligence",
year = 2016
}
@inproceedings{shi-2017-proje,
title = "ProjE: Embedding projection for knowledge graph completion",
author = "Shi, Baoxu and Weninger, Tim",
booktitle = "Thirty-First AAAI Conference on Artificial Intelligence",
year = 2017
}
@inproceedings{krompass-2015-type,
title = "Type-constrained representation learning in knowledge graphs",
author = "Krompa{\ss}, Denis and Baier, Stephan and Tresp, Volker",
booktitle = "International semantic web conference",
pages = "640-655",
year = 2015,
organization = "Springer"
}
@inproceedings{niu-2011-zhishi,
title = "Zhishi.me-weaving chinese linking open data",
author = "Niu, Xing and Sun, Xinruo and Wang, Haofen and Rong, Shu and
Qi, Guilin and Yu, Yong",
booktitle = "International Semantic Web Conference",
pages = "205-220",
year = 2011,
organization = "Springer"
}
@incollection{bizer-2011-linked,
title = "Linked data: The story so far",
author = "Bizer, Christian and Heath, Tom and Berners-Lee, Tim",
booktitle = "Semantic services, interoperability and web applications:
emerging concepts",
pages = "205-227",
year = 2011,
publisher = "IGI Global"
}
@inproceedings{liu-2017-unsup,
title = "Unsupervised image-to-image translation networks",
author = "Liu, Ming-Yu and Breuel, Thomas and Kautz, Jan",
booktitle = "Advances in neural information processing systems",
pages = "700-708",
year = 2017
}
@inproceedings{cao-2018-cw2vec,
title = "cw2vec: Learning chinese word embeddings with stroke n-gram
information",
author = "Cao, Shaosheng and Lu, Wei and Zhou, Jun and Li, Xiaolong",
booktitle = "Thirty-Second AAAI Conference on Artificial Intelligence",
year = 2018
}
@ARTICLE{yu-2015-multi-scale,
author = "{Yu}, Fisher and {Koltun}, Vladlen",
title = "{Multi-Scale Context Aggregation by Dilated Convolutions}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
year = 2015,
month = "Nov",
eid = "arXiv:1511.07122",
pages = "arXiv:1511.07122",
archivePrefix= "arXiv",
eprint = "1511.07122",
primaryClass = "cs.CV",
adsurl = "https://ui.adsabs.harvard.edu/abs/2015arXiv151107122Y",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{chiu-2016-lstm-cnn,
title = "Named Entity Recognition with Bidirectional {LSTM}-{CNN}s",
author = "Chiu, Jason P.C. and Nichols, Eric",
journal = "Transactions of the Association for Computational
Linguistics",
volume = 4,
year = 2016,
url = "https://www.aclweb.org/anthology/Q16-1026",
doi = "10.1162/tacl_a_00104",
pages = "357-370",
abstract = "Named entity recognition is a challenging task that has
traditionally required large amounts of knowledge in the form
of feature engineering and lexicons to achieve high
performance. In this paper, we present a novel neural network
architecture that automatically detects word- and
character-level features using a hybrid bidirectional LSTM
and CNN architecture, eliminating the need for most feature
engineering. We also propose a novel method of encoding
partial lexicon matches in neural networks and compare it to
existing approaches. Extensive evaluation shows that, given
only tokenized text and publicly available word embeddings,
our system is competitive on the CoNLL-2003 dataset and
surpasses the previously reported state of the art
performance on the OntoNotes 5.0 dataset by 2.13 F1
points. By using two lexicons constructed from
publicly-available sources, we establish new state of the art
performance with an F1 score of 91.62 on CoNLL-2003 and 86.28
on OntoNotes, surpassing systems that employ heavy feature
engineering, proprietary lexicons, and rich entity linking
information."
}
@inproceedings{zhang-2018-lattice-lstm,
title = "{C}hinese {NER} Using Lattice {LSTM}",
author = "Zhang, Yue and Yang, Jie",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1144",
doi = "10.18653/v1/P18-1144",
pages = "1554-1564",
abstract = "We investigate a lattice-structured LSTM model for Chinese
NER, which encodes a sequence of input characters as well as
all potential words that match a lexicon. Compared with
character-based methods, our model explicitly leverages word
and word sequence information. Compared with word-based
methods, lattice LSTM does not suffer from segmentation
errors. Gated recurrent cells allow our model to choose the
most relevant characters and words from a sentence for better
NER results. Experiments on various datasets show that
lattice LSTM outperforms both word-based and character-based
LSTM baselines, achieving the best results."
}
@article{shang-2018-autoner,
author = "Jingbo Shang and Liyuan Liu and Xiang Ren and Xiaotao Gu and
Teng Ren and Jiawei Han",
title = "Learning Named Entity Tagger using Domain-Specific
Dictionary",
journal = "CoRR",
volume = "abs/1809.03599",
year = 2018,
url = "http://arxiv.org/abs/1809.03599",
archivePrefix= "arXiv",
eprint = "1809.03599",
timestamp = "Fri, 05 Oct 2018 11:34:52 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1809-03599",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{yadav-2018-survey-ner,
title = "A Survey on Recent Advances in Named Entity Recognition from
Deep Learning models",
author = "Yadav, Vikas and Bethard, Steven",
booktitle = "Proceedings of the 27th International Conference on
Computational Linguistics",
month = aug,
year = 2018,
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1182",
pages = "2145-2158",
abstract = "Named Entity Recognition (NER) is a key component in NLP
systems for question answering, information retrieval,
relation extraction, etc. NER systems have been studied and
developed widely for decades, but accurate systems using deep
neural networks (NN) have only been introduced in the last
few years. We present a comprehensive survey of deep neural
network architectures for NER, and contrast them with
previous approaches to NER based on feature engineering and
other supervised or semi-supervised learning algorithms. Our
results highlight the improvements achieved by neural
networks, and show how incorporating some of the lessons
learned from past work on feature-based NER systems can yield
further improvements."
}
@ARTICLE{li-2016-webqa,
author = "{Li}, Peng and {Li}, Wei and {He}, Zhengyan and {Wang},
Xuguang and {Cao}, Ying and {Zhou}, Jie and {Xu}, Wei",
title = "{Dataset and Neural Recurrent Sequence Labeling Model for
Open-Domain Factoid Question Answering}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Neural and Evolutionary
Computing",
year = 2016,
month = "Jul",
eid = "arXiv:1607.06275",
pages = "arXiv:1607.06275",
archivePrefix= "arXiv",
eprint = "1607.06275",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160706275L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{wang-2003-risk-score,
author = "Wang, Thomas J. and Massaro, Joseph M. and Levy, Daniel and
Vasan, Ramachandran S. and Wolf, Philip A. and D'Agostino,
Ralph B. and Larson, Martin G. and Kannel, William B. and
Benjamin, Emelia J.",
title = "{A Risk Score for Predicting Stroke or Death in Individuals
With New-Onset Atrial Fibrillation in the CommunityThe
Framingham Heart Study}",
journal = "JAMA",
volume = 290,
number = 8,
pages = "1049-1056",
year = 2003,
month = 08,
abstract = "{ContextPrior risk stratification schemes for atrial
fibrillation (AF) have been based on randomized trial cohorts
or Medicare administrative databases, have included patients
with established AF, and have focused on stroke as the
principal outcome.ObjectiveTo derive risk scores for stroke
alone and stroke or death in community-based individuals with
new-onset AF.Design, Setting, and ParticipantsProspective,
community-based, observational cohort in Framingham, Mass.
We identified 868 participants with new-onset AF, 705 of whom
were not treated with warfarin at baseline. Risk scores for
stroke (ischemic or hemorrhagic) and stroke or death were
developed with censoring when warfarin initiation occurred
during follow-up. Event rates were examined in low-risk
individuals, as defined by the risk score and 4 previously
published risk schemes.Main Outcome MeasuresStroke and the
combination of stroke or death.ResultsDuring a mean follow-up
of 4.0 years free of warfarin use, stroke alone occurred in
83 participants and stroke or death occurred in 382
participants. A risk score for stroke was derived that
included the following risk predictors: advancing age, female
sex, increasing systolic blood pressure, prior stroke or
transient ischemic attack, and diabetes. With the risk score,
14.3\\% of the cohort had a predicted 5-year stroke rate
≤7.5\\% (average annual rate ≤1.5\\%), and 30.6\\% of the
cohort had a predicted 5-year stroke rate ≤10\\% (average
annual rate ≤2\\%). Actual stroke rates in these low-risk
groups were 1.1 and 1.5 per 100 person-years,
respectively. Previous risk schemes classified 6.4\\% to
17.3\\% of subjects as low risk, with actual stroke rates of
0.9 to 2.3 per 100 person-years. A risk score for stroke or
death is also presented.ConclusionThese risk scores can be
used to estimate the absolute risk of an adverse event in
individuals with AF, which may be helpful in counseling
patients and making treatment decisions.}",
issn = "0098-7484",
doi = "10.1001/jama.290.8.1049",
url = "https://doi.org/10.1001/jama.290.8.1049",
eprint =
"https://jamanetwork.com/journals/jama/articlepdf/197176/joc30626.pdf"
}
@inproceedings{khosla-2010-integrated,
title = "An integrated machine learning approach to stroke prediction",
author = "Khosla, Aditya and Cao, Yu and Lin, Cliff Chiung-Yu and Chiu,
Hsu-Kuang and Hu, Junling and Lee, Honglak",
booktitle = "Proceedings of the 16th ACM SIGKDD international conference
on Knowledge discovery and data mining",
pages = "183-192",
year = 2010,
organization = "ACM"
}
@inproceedings{cheng-2016-risk,
title = "Risk prediction with electronic health records: A deep
learning approach",
author = "Cheng, Yu and Wang, Fei and Zhang, Ping and Hu, Jianying",
booktitle = "Proceedings of the 2016 SIAM International Conference on Data
Mining",
pages = "432-440",
year = 2016,
organization = "SIAM"
}
@article{choi-2016-using,
title = "Using recurrent neural network models for early detection of
heart failure onset",
author = "Choi, Edward and Schuetz, Andy and Stewart, Walter F and Sun,
Jimeng",
journal = "Journal of the American Medical Informatics Association",
volume = 24,
number = 2,
pages = "361-370",
year = 2016,
publisher = "Oxford University Press"
}
@article{rajkomar-2018-scalable,
title = "Scalable and accurate deep learning with electronic health
records",
author = "Rajkomar, Alvin and Oren, Eyal and Chen, Kai and Dai, Andrew
M and Hajaj, Nissan and Hardt, Michaela and Liu, Peter J and
Liu, Xiaobing and Marcus, Jake and Sun, Mimi and others",
journal = "NPJ Digital Medicine",
volume = 1,
number = 1,
pages = 18,
year = 2018,
publisher = "Nature Publishing Group"
}
@article{shickel-2017-deep-ehr,
author = "Benjamin Shickel and Patrick Tighe and Azra Bihorac and
Parisa Rashidi",
title = "Deep {EHR:} {A} Survey of Recent Advances on Deep Learning
Techniques for Electronic Health Record {(EHR)} Analysis",
journal = "CoRR",
volume = "abs/1706.03446",
year = 2017,
url = "http://arxiv.org/abs/1706.03446",
archivePrefix= "arXiv",
eprint = "1706.03446",
timestamp = "Mon, 13 Aug 2018 16:46:19 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ShickelTBR17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{chen-2016-enhan-combin,
author = "Qian Chen and Xiaodan Zhu and Zhen{-}Hua Ling and Si Wei and
Hui Jiang",
title = "Enhancing and Combining Sequential and Tree {LSTM} for
Natural Language Inference",
journal = "CoRR",
volume = "abs/1609.06038",
year = 2016,
url = "http://arxiv.org/abs/1609.06038",
archivePrefix= "arXiv",
eprint = "1609.06038",
timestamp = "Mon, 13 Aug 2018 16:48:17 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ChenZLWJ16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{neculoiu-2016-learn-text,
title = "Learning Text Similarity with {S}iamese Recurrent Networks",
author = "Neculoiu, Paul and Versteegh, Maarten and Rotaru, Mihai",
booktitle = "Proceedings of the 1st Workshop on Representation Learning
for {NLP}",
month = aug,
year = 2016,
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W16-1617",
doi = "10.18653/v1/W16-1617",
pages = "148-157"
}
@inproceedings{wang-2018-learn-ask,
title = "Learning to Ask Questions in Open-domain Conversational
Systems with Typed Decoders",
author = "Wang, Yansen and Liu, Chenyi and Huang, Minlie and Nie,
Liqiang",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1204",
doi = "10.18653/v1/P18-1204",
pages = "2193-2203",
abstract = "Asking good questions in open-domain conversational systems
is quite significant but rather untouched. This task,
substantially different from traditional question generation,
requires to question not only with various patterns but also
on diverse and relevant topics. We observe that a good
question is a natural composition of interrogatives, topic
words, and ordinary words. Interrogatives lexicalize the
pattern of questioning, topic words address the key
information for topic transition in dialogue, and ordinary
words play syntactical and grammatical roles in making a
natural sentence. We devise two typed decoders (soft typed
decoder and hard typed decoder) in which a type distribution
over the three types is estimated and the type distribution
is used to modulate the final generation
distribution. Extensive experiments show that the typed
decoders outperform state-of-the-art baselines and can
generate more meaningful questions."
}
@article{seo-2016-bidaf,
author = "Min Joon Seo and Aniruddha Kembhavi and Ali Farhadi and
Hannaneh Hajishirzi",
title = "Bidirectional Attention Flow for Machine Comprehension",
journal = "CoRR",
volume = "abs/1611.01603",
year = 2016,
url = "http://arxiv.org/abs/1611.01603",
archivePrefix= "arXiv",
eprint = "1611.01603",
timestamp = "Mon, 13 Aug 2018 16:46:34 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/SeoKFH16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{yu-2018-qanet,
author = "Adams Wei Yu and David Dohan and Minh{-}Thang Luong and Rui
Zhao and Kai Chen and Mohammad Norouzi and Quoc V. Le",
title = "QANet: Combining Local Convolution with Global Self-Attention
for Reading Comprehension",
journal = "CoRR",
volume = "abs/1804.09541",
year = 2018,
url = "http://arxiv.org/abs/1804.09541",
archivePrefix= "arXiv",
eprint = "1804.09541",
timestamp = "Mon, 13 Aug 2018 16:48:18 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-09541",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{ture-2017-no-need-pay-atten,
title = "No Need to Pay Attention: Simple Recurrent Neural Networks
Work!",
author = "Ture, Ferhan and Jojic, Oliver",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1307",
doi = "10.18653/v1/D17-1307",
pages = "2866-2872",
abstract = "First-order factoid question answering assumes that the
question can be answered by a single fact in a knowledge base
(KB). While this does not seem like a challenging task, many
recent attempts that apply either complex linguistic
reasoning or deep neural networks achieve 65{\%}{--}76{\%}
accuracy on benchmark sets. Our approach formulates the task
as two machine learning problems: detecting the entities in
the question, and classifying the question as one of the
relation types in the KB. We train a recurrent neural network
to solve each problem. On the SimpleQuestions dataset, our
approach yields substantial improvements over previously
published results {---} even neural networks based on much
more complex architectures. The simplicity of our approach
also has practical advantages, such as efficiency and
modularity, that are valuable especially in an industry
setting. In fact, we present a preliminary analysis of the
performance of our model on real queries from Comcast{'}s X1
entertainment platform with millions of users every day."
}
@inproceedings{yu-2017-improv-neural,
title = "Improved Neural Relation Detection for Knowledge Base
Question Answering",
author = "Yu, Mo and Yin, Wenpeng and Hasan, Kazi Saidul and dos
Santos, Cicero and Xiang, Bing and Zhou, Bowen",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2017,
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1053",
doi = "10.18653/v1/P17-1053",
pages = "571-581",
abstract = "Relation detection is a core component of many NLP
applications including Knowledge Base Question Answering
(KBQA). In this paper, we propose a hierarchical recurrent
neural network enhanced by residual learning which detects KB
relations given an input question. Our method uses deep
residual bidirectional LSTMs to compare questions and
relation names via different levels of
abstraction. Additionally, we propose a simple KBQA system
that integrates entity linking and our proposed relation
detector to make the two components enhance each other. Our
experimental results show that our approach not only achieves
outstanding relation detection performance, but more
importantly, it helps our KBQA system achieve
state-of-the-art accuracy for both single-relation
(SimpleQuestions) and multi-relation (WebQSP) QA benchmarks."
}
@inproceedings{he-2017-gener-natur,
title = "Generating Natural Answers by Incorporating Copying and
Retrieving Mechanisms in Sequence-to-Sequence Learning",
author = "He, Shizhu and Liu, Cao and Liu, Kang and Zhao, Jun",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2017,
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1019",
doi = "10.18653/v1/P17-1019",
pages = "199-208",
abstract = "Generating answer with natural language sentence is very
important in real-world question answering systems, which
needs to obtain a right answer as well as a coherent natural
response. In this paper, we propose an end-to-end question
answering system called COREQA in sequence-to-sequence
learning, which incorporates copying and retrieving
mechanisms to generate natural answers within an
encoder-decoder framework. Specifically, in COREQA, the
semantic units (words, phrases and entities) in a natural
answer are dynamically predicted from the vocabulary, copied
from the given question and/or retrieved from the
corresponding knowledge base jointly. Our empirical study on
both synthetic and real-world datasets demonstrates the
efficiency of COREQA, which is able to generate correct,
coherent and natural answers for knowledge inquired
questions."
}
@inproceedings{madotto-2018-mem2seq,
title = "{M}em2{S}eq: Effectively Incorporating Knowledge Bases into
End-to-End Task-Oriented Dialog Systems",
author = "Madotto, Andrea and Wu, Chien-Sheng and Fung, Pascale",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1136",
doi = "10.18653/v1/P18-1136",
pages = "1468-1478",
abstract = "End-to-end task-oriented dialog systems usually suffer from
the challenge of incorporating knowledge bases. In this
paper, we propose a novel yet simple end-to-end
differentiable model called memory-to-sequence (Mem2Seq) to
address this issue. Mem2Seq is the first neural generative
model that combines the multi-hop attention over memories
with the idea of pointer network. We empirically show how
Mem2Seq controls each generation step, and how its multi-hop
attention mechanism helps in learning correlations between
memories. In addition, our model is quite general without
complicated task-specific designs. As a result, we show that
Mem2Seq can be trained faster and attain the state-of-the-art
performance on three different task-oriented dialog
datasets."
}
@article{cheng-2016-wide-deep,
author = "Heng{-}Tze Cheng and Levent Koc and Jeremiah Harmsen and Tal
Shaked and Tushar Chandra and Hrishi Aradhye and Glen
Anderson and Greg Corrado and Wei Chai and Mustafa Ispir and
Rohan Anil and Zakaria Haque and Lichan Hong and Vihan Jain
and Xiaobing Liu and Hemal Shah",
title = "Wide {\&} Deep Learning for Recommender Systems",
journal = "CoRR",
volume = "abs/1606.07792",
year = 2016,
url = "http://arxiv.org/abs/1606.07792",
archivePrefix= "arXiv",
eprint = "1606.07792",
timestamp = "Mon, 13 Aug 2018 16:47:53 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ChengKHSCAACCIA16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{wang-2019-multi-passage-bert,
author = "{Wang}, Zhiguo and {Ng}, Patrick and {Ma}, Xiaofei and
{Nallapati}, Ramesh and {Xiang}, Bing",
title = "{Multi-passage BERT: A Globally Normalized BERT Model for
Open-domain Question Answering}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence",
year = 2019,
month = "Aug",
eid = "arXiv:1908.08167",
pages = "arXiv:1908.08167",
archivePrefix= "arXiv",
eprint = "1908.08167",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190808167W",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{sun-2019-how-fine,
author = "Chi Sun and Xipeng Qiu and Yige Xu and Xuanjing Huang",
title = "How to Fine-Tune {BERT} for Text Classification?",
journal = "CoRR",
volume = "abs/1905.05583",
year = 2019,
url = "http://arxiv.org/abs/1905.05583",
archivePrefix= "arXiv",
eprint = "1905.05583",
timestamp = "Tue, 28 May 2019 12:48:08 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1905-05583",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{diefenbach-2018-core-techn,
author = "Diefenbach, Dennis and Lopez, Vanessa and Singh, Kamal and
Maret, Pierre",
title = "Core Techniques of Question Answering Systems over Knowledge
Bases: A Survey",
journal = "Knowl. Inf. Syst.",
issue_date = "June 2018",
volume = 55,
number = 3,
month = jun,
year = 2018,
issn = "0219-1377",
pages = "529-569",
numpages = 41,
url = "https://doi.org/10.1007/s10115-017-1100-y",
doi = "10.1007/s10115-017-1100-y",
acmid = 3210959,
publisher = "Springer-Verlag",
address = "Berlin, Heidelberg",
keywords = "Knowledge base, QALD, Question answering, Semantic Web,
SimpleQuestions, Survey, WebQuestions"
}
@inproceedings{rakthanmanon-2012-search,
title = "Searching and mining trillions of time series subsequences
under dynamic time warping",
author = "Rakthanmanon, Thanawin and Campana, Bilson and Mueen,
Abdullah and Batista, Gustavo and Westover, Brandon and Zhu,
Qiang and Zakaria, Jesin and Keogh, Eamonn",
booktitle = "Proceedings of the 18th ACM SIGKDD international conference
on Knowledge discovery and data mining",
pages = "262-270",
year = 2012,
organization = "ACM"
}
@inproceedings{palatucci-2009-zero-shot,
title = "Zero-shot learning with semantic output codes",
author = "Palatucci, Mark and Pomerleau, Dean and Hinton, Geoffrey E
and Mitchell, Tom M",
booktitle = "Advances in neural information processing systems",
pages = "1410-1418",
year = 2009
}
@article{fei-fei-2006-one-shot,
title = "One-shot learning of object categories",
author = "Fei-Fei, Li and Fergus, Rob and Perona, Pietro",
journal = "IEEE transactions on pattern analysis and machine
intelligence",
volume = 28,
number = 4,
pages = "594-611",
year = 2006,
publisher = "IEEE"
}
@inproceedings{ganin-2015-unsup-domain-adapt-backp,
author = "Ganin, Yaroslav and Lempitsky, Victor",
title = "Unsupervised Domain Adaptation by Backpropagation",
booktitle = "Proceedings of the 32Nd International Conference on
International Conference on Machine Learning - Volume 37",
series = "ICML'15",
year = 2015,
location = "Lille, France",
pages = "1180-1189",
numpages = 10,
url = "http://dl.acm.org/citation.cfm?id=3045118.3045244",
acmid = 3045244,
publisher = "JMLR.org"
}
@Article{liu-2018-multi-relations,
author = "Liu, Jin and Ren, Haoliang and Wu, Menglong and Wang, Jin and
Kim, Hye-jin",
title = "Multiple relations extraction among multiple entities in
unstructured text",
journal = "Soft Computing",
year = 2018,
month = "Jul",
day = 01,
volume = 22,
number = 13,
pages = "4295-4305",
abstract = "Relations extraction is a widely researched topic in nature
language processing. However, most of the work in the
literature concentrate on the methods that are dealing with
single relation between two named entities. In the task of
multiple relations extraction, traditional statistic-based
methods have difficulties in selecting features and improving
the performance of extraction model. In this paper, we
presented formal definitions of multiple entities and
multiple relations and put forward three labeling methods
which were used to label entity categories, relation
categories and relation conditions. We also proposed a novel
relation extraction model which is based on dynamic long
short-term memory network. To train our model, entity
feature, entity position feature and part of speech feature
are used together. These features are used to describe
complex relations and improve the performance of relation
extraction model. In the experiments, we classified the
corpus into three sets composed of sentences of 0--20 words,
20--35 words, and 35+ words. On conll04.corp, the
final precision, recall rate and F-measure reached 72.9, 70.8
and 67.9{\%} respectively.",
issn = "1433-7479",
doi = "10.1007/s00500-017-2852-8",
url = "https://doi.org/10.1007/s00500-017-2852-8"
}
@article{bekoulis-2018-joint-entity,
author = "Giannis Bekoulis and Johannes Deleu and Thomas Demeester and
Chris Develder",
title = "Joint entity recognition and relation extraction as a
multi-head selection problem",
journal = "CoRR",
volume = "abs/1804.07847",
year = 2018,
url = "http://arxiv.org/abs/1804.07847",
archivePrefix= "arXiv",
eprint = "1804.07847",
timestamp = "Mon, 13 Aug 2018 16:49:03 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-07847",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{wang-2019-one-pass,
author = "Haoyu Wang and Ming Tan and Mo Yu and Shiyu Chang and Dakuo
Wang and Kun Xu and Xiaoxiao Guo and Saloni Potdar",
title = "Extracting Multiple-Relations in One-Pass with Pre-Trained
Transformers",
journal = "CoRR",
volume = "abs/1902.01030",
year = 2019,
url = "http://arxiv.org/abs/1902.01030",
archivePrefix= "arXiv",
eprint = "1902.01030",
timestamp = "Tue, 21 May 2019 18:03:37 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1902-01030",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{soares-2019-match-blank,
author = "Livio Baldini Soares and Nicholas FitzGerald and Jeffrey Ling
and Tom Kwiatkowski",
title = "Matching the Blanks: Distributional Similarity for Relation
Learning",
journal = "CoRR",
volume = "abs/1906.03158",
year = 2019,
url = "http://arxiv.org/abs/1906.03158",
archivePrefix= "arXiv",
eprint = "1906.03158",
timestamp = "Fri, 14 Jun 2019 09:38:24 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1906-03158",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{alt-2019-improv-relat,
author = "Christoph Alt and Marc H{\"{u}}bner and Leonhard Hennig",
title = "Improving Relation Extraction by Pre-trained Language
Representations",
journal = "CoRR",
volume = "abs/1906.03088",
year = 2019,
url = "http://arxiv.org/abs/1906.03088",
archivePrefix= "arXiv",
eprint = "1906.03088",
timestamp = "Fri, 14 Jun 2019 09:38:24 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1906-03088",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{shi-2019-simple-bert,
author = "Peng Shi and Jimmy Lin",
title = "Simple {BERT} Models for Relation Extraction and Semantic
Role Labeling",
journal = "CoRR",
volume = "abs/1904.05255",
year = 2019,
url = "http://arxiv.org/abs/1904.05255",
archivePrefix= "arXiv",
eprint = "1904.05255",
timestamp = "Thu, 25 Apr 2019 13:55:01 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1904-05255",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{yao-2019-docred,
author = "Yuan Yao and Deming Ye and Peng Li and Xu Han and Yankai Lin
and Zhenghao Liu and Zhiyuan Liu and Lixin Huang and Jie Zhou
and Maosong Sun",
title = "DocRED: {A} Large-Scale Document-Level Relation Extraction
Dataset",
journal = "CoRR",
volume = "abs/1906.06127",
year = 2019,
url = "http://arxiv.org/abs/1906.06127",
archivePrefix= "arXiv",
eprint = "1906.06127",
timestamp = "Tue, 23 Jul 2019 15:49:40 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1906-06127",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{fu-2019-graphrel,
title = "{G}raph{R}el: Modeling Text as Relational Graphs for Joint
Entity and Relation Extraction",
author = "Fu, Tsu-Jui and Li, Peng-Hsuan and Ma, Wei-Yun",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1136",
doi = "10.18653/v1/P19-1136",
pages = "1409-1418",
abstract = "In this paper, we present GraphRel, an end-to-end relation
extraction model which uses graph convolutional networks
(GCNs) to jointly learn named entities and relations. In
contrast to previous baselines, we consider the interaction
between named entities and relations via a 2nd-phase
relation-weighted GCN to better extract relations. Linear and
dependency structures are both used to extract both
sequential and regional features of the text, and a complete
word graph is further utilized to extract implicit features
among all word pairs of the text. With the graph-based
approach, the prediction for overlapping relations is
substantially improved over previous sequential
approaches. We evaluate GraphRel on two public datasets: NYT
and WebNLG. Results show that GraphRel maintains high
precision while increasing recall substantially. Also,
GraphRel outperforms previous work by 3.2{\%} and 5.8{\%} (F1
score), achieving a new state-of-the-art for relation
extraction."
}
@article{quirk-2016-distan-super,
author = "Chris Quirk and Hoifung Poon",
title = "Distant Supervision for Relation Extraction beyond the
Sentence Boundary",
journal = "CoRR",
volume = "abs/1609.04873",
year = 2016,
url = "http://arxiv.org/abs/1609.04873",
archivePrefix= "arXiv",
eprint = "1609.04873",
timestamp = "Mon, 13 Aug 2018 16:49:11 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/QuirkP16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{peng-2017-cross-sentence,
author = "Nanyun Peng and Hoifung Poon and Chris Quirk and Kristina
Toutanova and Wen{-}tau Yih",
title = "Cross-Sentence N-ary Relation Extraction with Graph LSTMs",
journal = "CoRR",
volume = "abs/1708.03743",
year = 2017,
url = "http://arxiv.org/abs/1708.03743",
archivePrefix= "arXiv",
eprint = "1708.03743",
timestamp = "Mon, 13 Aug 2018 16:48:58 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1708-03743",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{song-2018-n-ary,
author = "Linfeng Song and Yue Zhang and Zhiguo Wang and Daniel Gildea",
title = "N-ary Relation Extraction using Graph State {LSTM}",
journal = "CoRR",
volume = "abs/1808.09101",
year = 2018,
url = "http://arxiv.org/abs/1808.09101",
archivePrefix= "arXiv",
eprint = "1808.09101",
timestamp = "Mon, 03 Sep 2018 13:36:40 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1808-09101",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{zhang-2019-drug-drug,
author = "Zhang, Tianlin and Leng, Jiaxu and Liu, Ying",
title = "{Deep learning for drug–drug interaction extraction from the
literature: a review}",
journal = "Briefings in Bioinformatics",
year = 2019,
month = 11,
abstract = "{Drug–drug interactions (DDIs) are crucial for drug research
and pharmacovigilance. These interactions may cause adverse
drug effects that threaten public health and patient
safety. Therefore, the DDIs extraction from biomedical
literature has been widely studied and emphasized in modern
biomedical research. The previous rules-based and machine
learning approaches rely on tedious feature engineering,
which is laborious, time-consuming and unsatisfactory. With
the development of deep learning technologies, this problem
is alleviated by learning feature representations
automatically. Here, we review the recent deep learning
methods that have been applied to the extraction of DDIs from
biomedical literature. We describe each method briefly and
compare its performance in the DDI corpus
systematically. Next, we summarize the advantages and
disadvantages of these deep learning models for this
task. Furthermore, we discuss some challenges and future
perspectives of DDI extraction via deep learning
methods. This review aims to serve as a useful guide for
interested researchers to further advance bioinformatics
algorithms for DDIs extraction from the literature.}",
issn = "1477-4054",
doi = "10.1093/bib/bbz087",
url = "https://doi.org/10.1093/bib/bbz087",
note = "bbz087",
eprint =
"http://oup.prod.sis.lan/bib/advance-article-pdf/doi/10.1093/bib/bbz087/30342664/bbz087.pdf"
}
@article{zheng-2017-joint-entity,
title = "Joint entity and relation extraction based on a hybrid neural
network",
author = "Zheng, Suncong and Hao, Yuexing and Lu, Dongyuan and Bao,
Hongyun and Xu, Jiaming and Hao, Hongwei and Xu, Bo",
journal = "Neurocomputing",
volume = 257,
pages = "59-66",
year = 2017,
publisher = "Elsevier"
}
@article{li-2017-neural-joint,
title = "A neural joint model for entity and relation extraction from
biomedical text",
author = "Li, Fei and Zhang, Meishan and Fu, Guohong and Ji, Donghong",
journal = "BMC bioinformatics",
volume = 18,
number = 1,
pages = 198,
year = 2017,
publisher = "BioMed Central"
}
@inproceedings{bekoulis-2018-adver,
title = "Adversarial training for multi-context joint entity and
relation extraction",
author = "Bekoulis, Giannis and Deleu, Johannes and Demeester, Thomas
and Develder, Chris",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1307",
doi = "10.18653/v1/D18-1307",
pages = "2830-2836",
abstract = "Adversarial training (AT) is a regularization method that can
be used to improve the robustness of neural network methods
by adding small perturbations in the training data. We show
how to use AT for the tasks of entity recognition and
relation extraction. In particular, we demonstrate that
applying AT to a general purpose baseline model for jointly
extracting entities and relations, allows improving the
state-of-the-art effectiveness on several datasets in
different contexts (i.e., news, biomedical, and real estate
data) and for different languages (English and Dutch)."
}
@inproceedings{verga-2018-simul-self,
title = "Simultaneously Self-Attending to All Mentions for
Full-Abstract Biological Relation Extraction",
author = "Verga, Patrick and Strubell, Emma and McCallum, Andrew",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long Papers)",
month = jun,
year = 2018,
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N18-1080",
doi = "10.18653/v1/N18-1080",
pages = "872-884",
abstract = "Most work in relation extraction forms a prediction by
looking at a short span of text within a single sentence
containing a single entity pair mention. This approach often
does not consider interactions across mentions, requires
redundant computation for each mention pair, and ignores
relationships expressed across sentence boundaries. These
problems are exacerbated by the document- (rather than
sentence-) level annotation common in biological text. In
response, we propose a model which simultaneously predicts
relationships between all mention pairs in a document. We
form pairwise predictions over entire paper abstracts using
an efficient self-attention encoder. All-pairs mention scores
allow us to perform multi-instance learning by aggregating
over mentions to form entity pair representations. We further
adapt to settings without mention-level annotation by jointly
training to predict named entities and adding a corpus of
weakly labeled data. In experiments on two Biocreative
benchmark datasets, we achieve state of the art performance
on the Biocreative V Chemical Disease Relation dataset for
models without external KB resources. We also introduce a new
dataset an order of magnitude larger than existing
human-annotated biological information extraction datasets
and more accurate than distantly supervised alternatives."
}
@article{nguyen-2018-end-to-end,
author = "Dat Quoc Nguyen and Karin Verspoor",
title = "End-to-end neural relation extraction using deep biaffine
attention",
journal = "CoRR",
volume = "abs/1812.11275",
year = 2018,
url = "http://arxiv.org/abs/1812.11275",
archivePrefix= "arXiv",
eprint = "1812.11275",
timestamp = "Wed, 02 Jan 2019 14:40:18 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1812-11275",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{yan-2019-unified-model,
author = "Hang Yan and Xipeng Qiu and Xuanjing Huang",
title = "A Unified Model for Joint Chinese Word Segmentation and
Dependency Parsing",
journal = "CoRR",
volume = "abs/1904.04697",
year = 2019,
url = "http://arxiv.org/abs/1904.04697",
archivePrefix= "arXiv",
eprint = "1904.04697",
timestamp = "Thu, 25 Apr 2019 13:55:01 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1904-04697",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{zhang-2017-slot-filling,
title = "Position-aware Attention and Supervised Data Improve Slot
Filling",
author = "Zhang, Yuhao and Zhong, Victor and Chen, Danqi and Angeli,
Gabor and Manning, Christopher D.",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1004",
doi = "10.18653/v1/D17-1004",
pages = "35-45",
abstract = "Organized relational knowledge in the form of {``}knowledge
graphs{''} is important for many applications. However, the
ability to populate knowledge bases with facts automatically
extracted from documents has improved frustratingly
slowly. This paper simultaneously addresses two issues that
have held back prior work. We first propose an effective new
model, which combines an LSTM sequence model with a form of
entity position-aware attention that is better suited to
relation extraction. Then we build TACRED, a large (119,474
examples) supervised relation extraction dataset obtained via
crowdsourcing and targeted towards TAC KBP relations. The
combination of better supervised data and a more appropriate
high-capacity model enables much better relation extraction
performance. When the model trained on this new dataset
replaces the previous relation extraction component of the
best TAC KBP 2015 slot filling system, its F1 score increases
markedly from 22.2{\%} to 26.7{\%}."
}
@inproceedings{han-2018-fewrel,
title = "{F}ew{R}el: A Large-Scale Supervised Few-Shot Relation
Classification Dataset with State-of-the-Art Evaluation",
author = "Han, Xu and Zhu, Hao and Yu, Pengfei and Wang, Ziyun and Yao,
Yuan and Liu, Zhiyuan and Sun, Maosong",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1514",
doi = "10.18653/v1/D18-1514",
pages = "4803-4809",
abstract = "We present a Few-Shot Relation Classification Dataset
(FewRel), consisting of 70,000 sentences on 100 relations
derived from Wikipedia and annotated by crowdworkers. The
relation of each sentence is first recognized by distant
supervision methods, and then filtered by crowdworkers. We
adapt the most recent state-of-the-art few-shot learning
methods for relation classification and conduct thorough
evaluation of these methods. Empirical results show that even
the most competitive few-shot learning models struggle on
this task, especially as compared with humans. We also show
that a range of different reasoning skills are needed to
solve our task. These results indicate that few-shot relation
classification remains an open problem and still requires
further research. Our detailed analysis points out multiple
directions for future research."
}
@article{levy-2017-zero-shot,
author = "Omer Levy and Minjoon Seo and Eunsol Choi and Luke
Zettlemoyer",
title = "Zero-Shot Relation Extraction via Reading Comprehension",
journal = "CoRR",
volume = "abs/1706.04115",
year = 2017,
url = "http://arxiv.org/abs/1706.04115",
archivePrefix= "arXiv",
eprint = "1706.04115",
timestamp = "Mon, 13 Aug 2018 16:46:48 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/LevySCZ17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{gao-2019-fewrel-2,
title = "{F}ew{R}el 2.0: Towards More Challenging Few-Shot Relation
Classification",
author = "Gao, Tianyu and Han, Xu and Zhu, Hao and Liu, Zhiyuan and Li,
Peng and Sun, Maosong and Zhou, Jie",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1649",
doi = "10.18653/v1/D19-1649",
pages = "6251-6256",
abstract = "We present FewRel 2.0, a more challenging task to investigate
two aspects of few-shot relation classification models: (1)
Can they adapt to a new domain with only a handful of
instances? (2) Can they detect none-of-the-above (NOTA)
relations? To construct FewRel 2.0, we build upon the FewRel
dataset by adding a new test set in a quite different domain,
and a NOTA relation choice. With the new dataset and
extensive experimental analysis, we found (1) that the
state-of-the-art few-shot relation classification models
struggle on these two aspects, and (2) that the commonly-used
techniques for domain adaptation and NOTA detection still
cannot handle the two challenges well. Our research calls for
more attention and further efforts to these two real-world
issues. All details and resources about the dataset and
baselines are released at https://github.com/thunlp/fewrel."
}
@article{snell-2017-prototypical-networks,
author = "Jake Snell and Kevin Swersky and Richard S. Zemel",
title = "Prototypical Networks for Few-shot Learning",
journal = "CoRR",
volume = "abs/1703.05175",
year = 2017,
url = "http://arxiv.org/abs/1703.05175",
archivePrefix= "arXiv",
eprint = "1703.05175",
timestamp = "Mon, 13 Aug 2018 16:46:05 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/SnellSZ17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{cui-2019-bilstm-lan,
author = "{Cui}, Leyang and {Zhang}, Yue",
title = "{Hierarchically-Refined Label Attention Network for Sequence
Labeling}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2019,
month = "Aug",
eid = "arXiv:1908.08676",
pages = "arXiv:1908.08676",
archivePrefix= "arXiv",
eprint = "1908.08676",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190808676C",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{xu-2019-scalin-open,
title = "Scaling up Open Tagging from Tens to Thousands: Comprehension
Empowered Attribute Value Extraction from Product Title",
author = "Xu, Huimin and Wang, Wenting and Mao, Xin and Jiang, Xinyu
and Lan, Man",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1514",
doi = "10.18653/v1/P19-1514",
pages = "5214-5223",
abstract = "Supplementing product information by extracting attribute
values from title is a crucial task in e-Commerce
domain. Previous studies treat each attribute only as an
entity type and build one set of NER tags (e.g., BIO) for
each of them, leading to a scalability issue that makes them
unfit for the large-sized attribute systems of real-world e-Commerce. In
this work, we propose a novel approach to support value
extraction scaling up to thousands of attributes without
losing performance: (1) We propose to regard attribute as a
query and adopt only one global set of BIO tags for any
attributes to reduce the burden of attribute tag or model
explosion; (2) We explicitly model the semantic
representations for attribute and title, and develop an
attention mechanism to capture the interactive semantic
relations in-between to enforce our framework to be attribute
comprehensive. We conduct extensive experiments in real-life
datasets. The results show that our model not only
outperforms existing state-of-the-art NER tagging models, but
also is robust and generates promising results for up to
8,906 attributes."
}
@article{zheng-2018-opentag,
author = "Guineng Zheng and Subhabrata Mukherjee and Xin Luna Dong and
Feifei Li",
title = "OpenTag: Open Attribute Value Extraction from Product
Profiles",
journal = "CoRR",
volume = "abs/1806.01264",
year = 2018,
url = "http://arxiv.org/abs/1806.01264",
archivePrefix= "arXiv",
eprint = "1806.01264",
timestamp = "Mon, 13 Aug 2018 16:46:56 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1806-01264",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{yahya-2014-renoun,
title = "{R}e{N}oun: Fact Extraction for Nominal Attributes",
author = "Yahya, Mohamed and Whang, Steven and Gupta, Rahul and Halevy,
Alon",
booktitle = "Proceedings of the 2014 Conference on Empirical Methods in
Natural Language Processing ({EMNLP})",
month = oct,
year = 2014,
address = "Doha, Qatar",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D14-1038",
doi = "10.3115/v1/D14-1038",
pages = "325-335"
}
@article{jiang-2017-metapad,
author = "Meng Jiang and Jingbo Shang and Taylor Cassidy and Xiang Ren
and Lance M. Kaplan and Timothy P. Hanratty and Jiawei Han",
title = "MetaPAD: Meta Pattern Discovery from Massive Text Corpora",
journal = "CoRR",
volume = "abs/1703.04213",
year = 2017,
url = "http://arxiv.org/abs/1703.04213",
archivePrefix= "arXiv",
eprint = "1703.04213",
timestamp = "Mon, 13 Aug 2018 16:48:27 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/JiangSCRKHH17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{zhang-2019-unsup-annot,
author = "{Zhang}, Jingqing and {Zhang}, Xiaoyu and {Sun}, Kai and
{Yang}, Xian and {Dai}, Chengliang and {Guo}, Yike",
title = "{Unsupervised Annotation of Phenotypic Abnormalities via
Semantic Latent Representations on Electronic Health
Records}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2019,
month = "Nov",
eid = "arXiv:1911.03862",
pages = "arXiv:1911.03862",
archivePrefix= "arXiv",
eprint = "1911.03862",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv191103862Z",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{liu-2015-segphrase,
title = "Mining quality phrases from massive text corpora",
author = "Liu, Jialu and Shang, Jingbo and Wang, Chi and Ren, Xiang and
Han, Jiawei",
booktitle = "Proceedings of the 2015 ACM SIGMOD International Conference
on Management of Data",
pages = "1729-1744",
year = 2015,
organization = "ACM"
}
@article{shang-2017-autophrase,
author = "Jingbo Shang and Jialu Liu and Meng Jiang and Xiang Ren and
Clare R. Voss and Jiawei Han",
title = "Automated Phrase Mining from Massive Text Corpora",
journal = "CoRR",
volume = "abs/1702.04457",
year = 2017,
url = "http://arxiv.org/abs/1702.04457",
archivePrefix= "arXiv",
eprint = "1702.04457",
timestamp = "Mon, 13 Aug 2018 16:46:43 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ShangLJRVH17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{el-kishky-2014-topmining,
title = "Scalable topical phrase mining from text corpora",
author = "El-Kishky, Ahmed and Song, Yanglei and Wang, Chi and Voss,
Clare R and Han, Jiawei",
journal = "Proceedings of the VLDB Endowment",
volume = 8,
number = 3,
pages = "305-316",
year = 2014,
publisher = "VLDB Endowment"
}
@inproceedings{wang-2019-autobioner,
title = "Distantly Supervised Biomedical Named Entity Recognition with
Dictionary Expansion",
author = "Wang, Xuan and Zhang, Yu and Li, Qi and Ren, Xiang and Shang,
Jingbo and Han, Jiawei",
booktitle = "Proc. 2019 IEEE Int. Conf. on Bioinformatics and Biomedicine
(IEEE-BIBM’19), San Diego, CA",
year = 2019
}
@inproceedings{shen-2017-setexpan,
title        = "{S}et{E}xpan: Corpus-based set expansion via context feature
selection and rank ensemble",
author = "Shen, Jiaming and Wu, Zeqiu and Lei, Dongming and Shang,
Jingbo and Ren, Xiang and Han, Jiawei",
booktitle = "Joint European Conference on Machine Learning and Knowledge
Discovery in Databases",
pages = "288-304",
year = 2017,
organization = "Springer"
}
@inproceedings{rong-2016-egoset,
title        = "{E}go{S}et: Exploiting word ego-networks and user-generated
ontology for multifaceted set expansion",
author = "Rong, Xin and Chen, Zhe and Mei, Qiaozhu and Adar, Eytan",
booktitle = "Proceedings of the Ninth ACM international conference on Web
search and data mining",
pages = "645-654",
year = 2016,
organization = "ACM"
}
@inproceedings{lin-2008-textcube,
author = "Lin, Cindy Xide and Ding, Bolin and Han, Jiawei and Zhu,
Feida and Zhao, Bo",
title        = "Text Cube: Computing {IR} measures for multidimensional text
database analysis",
booktitle = "2008 Eighth IEEE International Conference on Data Mining",
year = 2008,
pages = "905-910",
organization = "IEEE"
}
@article{sun-2011-pathsim,
author = "Sun, Yizhou and Han, Jiawei and Yan, Xifeng and Yu, Philip S
and Wu, Tianyi",
title        = "{P}ath{S}im: Meta Path-Based Top-K Similarity Search in
Heterogeneous Information Networks",
journal = "Proceedings of the VLDB Endowment",
volume = 4,
number = 11,
pages = "992-1003",
year = 2011,
publisher = "Citeseer"
}
@inproceedings{ren-2015-clustype,
author = "Ren, Xiang and El-Kishky, Ahmed and Wang, Chi and Tao, Fangbo
and Voss, Clare R and Han, Jiawei",
title        = "{C}lus{T}ype: Effective entity recognition and typing by relation
phrase-based clustering",
booktitle = "Proceedings of the 21th ACM SIGKDD International Conference
on Knowledge Discovery and Data Mining",
year = 2015,
pages = "995-1004",
organization = "ACM"
}
@article{ren-2016-cotype,
author = "Xiang Ren and Zeqiu Wu and Wenqi He and Meng Qu and Clare
R. Voss and Heng Ji and Tarek F. Abdelzaher and Jiawei Han",
title = "CoType: Joint Extraction of Typed Entities and Relations with
Knowledge Bases",
journal = "CoRR",
volume = "abs/1610.08763",
year = 2016,
url = "http://arxiv.org/abs/1610.08763",
archivePrefix= "arXiv",
eprint = "1610.08763",
timestamp = "Mon, 13 Aug 2018 16:46:29 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/RenWHQVJAH16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{ren-2016-afet,
title = "{AFET}: Automatic Fine-Grained Entity Typing by Hierarchical
Partial-Label Embedding",
author = "Ren, Xiang and He, Wenqi and Qu, Meng and Huang, Lifu and Ji,
Heng and Han, Jiawei",
booktitle = "Proceedings of the 2016 Conference on Empirical Methods in
Natural Language Processing",
month = nov,
year = 2016,
address = "Austin, Texas",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D16-1144",
doi = "10.18653/v1/D16-1144",
pages = "1369-1378"
}
@article{liu-2017-rehession,
author = "Liu, Liyuan and Ren, Xiang and Zhu, Qi and Zhi, Shi and Gui,
Huan and Ji, Heng and Han, Jiawei",
title = "Heterogeneous Supervision for Relation Extraction: a
Representation Learning Approach",
journal = "arXiv preprint arXiv:1707.00166",
year = 2017
}
@inproceedings{ren-2016-ple,
author = "Ren, Xiang and He, Wenqi and Qu, Meng and Voss, Clare R and
Ji, Heng and Han, Jiawei",
title = "Label noise reduction in entity typing by heterogeneous
partial-label embedding",
booktitle = "Proceedings of the 22nd ACM SIGKDD international conference
on Knowledge discovery and data mining",
year = 2016,
pages = "1825-1834",
organization = "ACM"
}
@inproceedings{qu-2017-auto-synonym,
title = "Automatic synonym discovery with knowledge bases",
author = "Qu, Meng and Ren, Xiang and Han, Jiawei",
booktitle = "Proceedings of the 23rd ACM SIGKDD International Conference
on Knowledge Discovery and Data Mining",
pages = "997-1005",
year = 2017,
organization = "ACM"
}
@article{tao-2016-textcube-summarization,
title = "Multi-Dimensional, Phrase-Based Summarization in Text Cubes",
author = "Fangbo Tao and Honglei Zhuang and Chi Wang Yu and Qi Wang and
Taylor Cassidy and Lance M. Kaplan and Clare R. Voss and
Jiawei Han",
journal = "IEEE Data Eng. Bull.",
year = 2016,
volume = 39,
pages = "74-84"
}
@inproceedings{liu-2016-laki,
title = "Representing documents via latent keyphrase inference",
author = "Liu, Jialu and Ren, Xiang and Shang, Jingbo and Cassidy,
Taylor and Voss, Clare R and Han, Jiawei",
booktitle = "Proceedings of the 25th international conference on World
wide web",
pages = "1057-1067",
year = 2016,
organization = "International World Wide Web Conferences Steering Committee"
}
@article{hosseini-2018-heteromed,
author = "Anahita Hosseini and Ting Chen and Wenjun Wu and Yizhou Sun
and Majid Sarrafzadeh",
title = "HeteroMed: Heterogeneous Information Network for Medical
Diagnosis",
journal = "CoRR",
volume = "abs/1804.08052",
year = 2018,
url = "http://arxiv.org/abs/1804.08052",
archivePrefix= "arXiv",
eprint = "1804.08052",
timestamp = "Wed, 17 Apr 2019 16:16:59 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-08052",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{han-2017-mining-structs,
author = "Han, Jiawei",
title        = "Mining Structures from Massive Text Data: A Data-Driven
                Approach",
booktitle = "SIMBig",
year = 2017,
pages = "16-19"
}
@article{gui-2018-exper-findin,
author = "Huan Gui and Qi Zhu and Liyuan Liu and Aston Zhang and Jiawei
Han",
title = "Expert Finding in Heterogeneous Bibliographic Networks with
Locally-trained Embeddings",
journal = "CoRR",
volume = "abs/1803.03370",
year = 2018,
url = "http://arxiv.org/abs/1803.03370",
archivePrefix= "arXiv",
eprint = "1803.03370",
timestamp = "Mon, 13 Aug 2018 16:48:03 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1803-03370",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{sun-2013-minin-heter-infor-networ,
author = "Sun, Yizhou and Han, Jiawei",
title = "Mining Heterogeneous Information Networks: a Structural
Analysis Approach",
journal      = "ACM SIGKDD Explorations Newsletter",
volume = 14,
number = 2,
pages = "20-28",
year = 2013,
publisher = "ACM"
}
@inproceedings{sui-2019-cgn,
title = "Leverage Lexical Knowledge for {C}hinese Named Entity
Recognition via Collaborative Graph Network",
author = "Sui, Dianbo and Chen, Yubo and Liu, Kang and Zhao, Jun and
Liu, Shengping",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1396",
doi = "10.18653/v1/D19-1396",
pages = "3828-3838",
abstract     = "The lack of word boundary information has been seen as one
                of the main obstacles to developing a high-performance Chinese
                named entity recognition (NER) system. Fortunately, the
automatically constructed lexicon contains rich word
boundaries information and word semantic
information. However, integrating lexical knowledge in
Chinese NER tasks still faces challenges when it comes to
self-matched lexical words as well as the nearest contextual
lexical words. We present a Collaborative Graph Network to
solve these challenges. Experiments on various datasets show
that our model not only outperforms the state-of-the-art
(SOTA) results, but also achieves a speed that is six to
fifteen times faster than that of the SOTA model."
}
@article{kipf-2016-gcn,
author = "{Kipf}, Thomas N. and {Welling}, Max",
title = "{Semi-Supervised Classification with Graph Convolutional
Networks}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Statistics - Machine
Learning",
year = 2016,
month = "Sep",
eid = "arXiv:1609.02907",
pages = "arXiv:1609.02907",
archivePrefix= "arXiv",
eprint = "1609.02907",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160902907K",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{niepert-2016-gcn,
title = "Learning convolutional neural networks for graphs",
author = "Niepert, Mathias and Ahmed, Mohamed and Kutzkov, Konstantin",
booktitle = "International conference on machine learning",
pages = "2014-2023",
year = 2016
}
@article{velickovic-2017-gat,
author = "{Veli{\v{c}}kovi{\'c}}, Petar and {Cucurull}, Guillem and
{Casanova}, Arantxa and {Romero}, Adriana and {Li{\`o}},
Pietro and {Bengio}, Yoshua",
title = "{Graph Attention Networks}",
journal = "arXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Machine Learning, Computer
Science - Social and Information Networks",
year = 2017,
month = "Oct",
eid = "arXiv:1710.10903",
pages = "arXiv:1710.10903",
archivePrefix= "arXiv",
eprint = "1710.10903",
primaryClass = "stat.ML",
adsurl = "https://ui.adsabs.harvard.edu/abs/2017arXiv171010903V",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{xue-2019-bert-joint,
author = "{Xue}, Kui and {Zhou}, Yangming and {Ma}, Zhiyuan and {Ruan},
Tong and {Zhang}, Huanhuan and {He}, Ping",
title = "{Fine-tuning BERT for Joint Entity and Relation Extraction in
Chinese Medical Text}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2019,
month = "Aug",
eid = "arXiv:1908.07721",
pages = "arXiv:1908.07721",
archivePrefix= "arXiv",
eprint = "1908.07721",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190807721X",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{jia-2019-doc-level,
author = "Robin Jia and Cliff Wong and Hoifung Poon",
title = "Document-Level N-ary Relation Extraction with Multiscale
Representation Learning",
journal = "CoRR",
volume = "abs/1904.02347",
year = 2019,
url = "http://arxiv.org/abs/1904.02347",
archivePrefix= "arXiv",
eprint = "1904.02347",
timestamp = "Wed, 24 Apr 2019 12:21:25 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1904-02347",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{gupta-2019-nested-ner,
title = "Linguistically Informed Relation Extraction and Neural
Architectures for Nested Named Entity Recognition in
{B}io{NLP}-{OST} 2019",
author = "Gupta, Pankaj and Yaseen, Usama and Sch{\"u}tze, Hinrich",
booktitle = "Proceedings of The 5th Workshop on BioNLP Open Shared Tasks",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-5720",
doi = "10.18653/v1/D19-5720",
pages = "132-142",
abstract = "Named Entity Recognition (NER) and Relation Extraction (RE)
are essential tools in distilling knowledge from biomedical
literature. This paper presents our findings from
participating in BioNLP Shared Tasks 2019. We addressed Named
Entity Recognition including nested entity extraction,
Entity Normalization and Relation Extraction. Our proposed
approach to Named Entity extraction can be generalized to different
languages, and we have shown its effectiveness for English
and Spanish text. We investigated linguistic features, hybrid
loss including ranking and Conditional Random Fields (CRF),
multi-task objective and token level ensembling strategy to
improve NER. We employed dictionary based fuzzy and semantic
search to perform Entity Normalization. Finally, our RE
system employed Support Vector Machine (SVM) with linguistic
features. Our NER submission (team:MIC-CIS) ranked first in
BB-2019 norm+NER task with standard error rate (SER) of
0.7159 and showed competitive performance on PharmaCo NER
task with F1-score of 0.8662. Our RE system ranked first in
the SeeDev-binary Relation Extraction Task with F1-score of
0.3738."
}
@inproceedings{guo-2019-aggcn,
title = "Attention Guided Graph Convolutional Networks for Relation
Extraction",
author = "Guo, Zhijiang and Zhang, Yan and Lu, Wei",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1024",
doi = "10.18653/v1/P19-1024",
pages = "241-251",
abstract = "Dependency trees convey rich structural information that is
proven useful for extracting relations among entities in
text. However, how to effectively make use of relevant
information while ignoring irrelevant information from the
dependency trees remains a challenging research
question. Existing approaches employing rule based
hard-pruning strategies for selecting relevant partial
dependency structures may not always yield optimal
results. In this work, we propose Attention Guided Graph
Convolutional Networks (AGGCNs), a novel model which directly
takes full dependency trees as inputs. Our model can be
understood as a soft-pruning approach that automatically
learns how to selectively attend to the relevant
sub-structures useful for the relation extraction
task. Extensive results on various tasks including
cross-sentence n-ary relation extraction and large-scale
sentence-level relation extraction show that our model is
able to better leverage the structural information of the
full dependency trees, giving significantly better results
than previous approaches."
}
@article{he-2019-nre-pul,
author = "{He}, Zhengqiu and {Chen}, Wenliang and {Wang}, Yuyi and
{zhang}, Wei and {Wang}, Guanchun and {Zhang}, Min",
title = "{Improving Neural Relation Extraction with Positive and
Unlabeled Learning}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2019,
month = "Nov",
eid = "arXiv:1911.12556",
pages = "arXiv:1911.12556",
archivePrefix= "arXiv",
eprint = "1911.12556",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv191112556H",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{nayak-2019-nre-am,
title = "Effective Attention Modeling for Neural Relation Extraction",
author = "Nayak, Tapas and Ng, Hwee Tou",
booktitle = "Proceedings of the 23rd Conference on Computational Natural
Language Learning (CoNLL)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/K19-1056",
doi = "10.18653/v1/K19-1056",
pages = "603-612",
abstract = "Relation extraction is the task of determining the relation
between two entities in a sentence. Distantly-supervised
models are popular for this task. However, sentences can be
long and two entities can be located far from each other in a
sentence. The pieces of evidence supporting the presence of a
relation between two entities may not be very direct, since
the entities may be connected via some indirect links such as
a third entity or via co-reference. Relation extraction in
such scenarios becomes more challenging as we need to capture
the long-distance interactions among the entities and other
words in the sentence. Also, the words in a sentence do not
contribute equally in identifying the relation between the
two entities. To address this issue, we propose a novel and
effective attention model which incorporates syntactic
information of the sentence and a multi-factor attention
mechanism. Experiments on the New York Times corpus show that
our proposed model outperforms prior state-of-the-art
models."
}
@article{shang-2019-noisy-dsre,
author = "{Shang}, Yuming",
title = "{Are Noisy Sentences Useless for Distant Supervised Relation
Extraction?}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Machine Learning",
year = 2019,
month = "Nov",
eid = "arXiv:1911.09788",
pages = "arXiv:1911.09788",
archivePrefix= "arXiv",
eprint = "1911.09788",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv191109788S",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{tran-2019-nml,
author = "Tung Tran and Ramakanth Kavuluru",
title = "Neural Metric Learning for Fast End-to-End Relation
Extraction",
journal = "CoRR",
volume = "abs/1905.07458",
year = 2019,
url = "http://arxiv.org/abs/1905.07458",
archivePrefix= "arXiv",
eprint = "1905.07458",
timestamp = "Wed, 28 Aug 2019 07:29:35 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1905-07458",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{amos-2017-optnet,
author = "Brandon Amos and J. Zico Kolter",
title = "OptNet: Differentiable Optimization as a Layer in Neural
Networks",
journal = "CoRR",
volume = "abs/1703.00443",
year = 2017,
url = "http://arxiv.org/abs/1703.00443",
archivePrefix= "arXiv",
eprint = "1703.00443",
timestamp = "Mon, 13 Aug 2018 16:48:26 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/AmosK17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{yang-2016-chinese-emr-corpus,
  title        = "Corpus Construction for Named Entities and Entity Relations
                  on {C}hinese Electronic Medical Records",
  author       = "Yang, Jinfeng and Guan, Yi and He, Bin and Qu, Chunyan and
                  Yu, Qiubin and Liu, Yaxin and Zhao, Yongjie",
  journal      = "Journal of Software (软件学报)",
  number       = 11,
  pages        = "2725-2746",
  year         = 2016,
  note         = "In Chinese"
}
@inproceedings{kuru-2016-charner,
title = "{C}har{NER}: Character-Level Named Entity Recognition",
author = "Kuru, Onur and Can, Ozan Arkan and Yuret, Deniz",
booktitle = "Proceedings of {COLING} 2016, the 26th International
Conference on Computational Linguistics: Technical Papers",
month = dec,
year = 2016,
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://www.aclweb.org/anthology/C16-1087",
pages = "911-921",
abstract = "We describe and evaluate a character-level tagger for
language-independent Named Entity Recognition (NER). Instead
of words, a sentence is represented as a sequence of
characters. The model consists of stacked bidirectional LSTMs
which take characters as input and output tag probabilities for
each character. These probabilities are then converted to
consistent word level named entity tags using a Viterbi
decoder. We are able to achieve close to state-of-the-art NER
performance in seven languages with the same basic model
using only labeled NER data and no hand-engineered features
or other external resources like syntactic taggers or
Gazetteers."
}
@article{ma-2016-lstm-cnn-crf,
author = "{Ma}, Xuezhe and {Hovy}, Eduard",
title = "{End-to-end Sequence Labeling via Bi-directional
LSTM-CNNs-CRF}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Computation and Language, Statistics - Machine Learning",
year = 2016,
month = "Mar",
eid = "arXiv:1603.01354",
pages = "arXiv:1603.01354",
archivePrefix= "arXiv",
eprint = "1603.01354",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160301354M",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{yang-2017-neural-reranking,
author = "Jie Yang and Yue Zhang and Fei Dong",
title = "Neural Reranking for Named Entity Recognition",
journal = "CoRR",
volume = "abs/1707.05127",
year = 2017,
url = "http://arxiv.org/abs/1707.05127",
archivePrefix= "arXiv",
eprint = "1707.05127",
timestamp = "Wed, 20 Nov 2019 08:54:08 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/YangZD17aa",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{li-2017-ner-recursive-nn,
title = "Leveraging Linguistic Structures for Named Entity Recognition
with Bidirectional Recursive Neural Networks",
author = "Li, Peng-Hsuan and Dong, Ruo-Ping and Wang, Yu-Siang and
Chou, Ju-Chieh and Ma, Wei-Yun",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1282",
doi = "10.18653/v1/D17-1282",
pages = "2664-2669",
abstract = "In this paper, we utilize the linguistic structures of texts
to improve named entity recognition by BRNN-CNN, a special
bidirectional recursive network attached with a convolutional
network. Motivated by the observation that named entities are
highly related to linguistic constituents, we propose a
constituent-based BRNN-CNN for named entity recognition. In
contrast to classical sequential labeling methods, the system
first identifies which text chunks are possible named
entities by whether they are linguistic constituents. Then it
classifies these chunks with a constituency tree structure by
recursively propagating syntactic and semantic information to
each constituent node. This method surpasses current
state-of-the-art on OntoNotes 5.0 with automatically
generated parses."
}
@inproceedings{tran-2017-stack-residual-lstm,
title = "Named Entity Recognition with Stack Residual {LSTM} and
Trainable Bias Decoding",
author = "Tran, Quan and MacKinlay, Andrew and Jimeno Yepes, Antonio",
booktitle = "Proceedings of the Eighth International Joint Conference on
Natural Language Processing (Volume 1: Long Papers)",
month = nov,
year = 2017,
address = "Taipei, Taiwan",
publisher = "Asian Federation of Natural Language Processing",
url = "https://www.aclweb.org/anthology/I17-1057",
pages = "566-575",
abstract = "Recurrent Neural Network models are the state-of-the-art for
Named Entity Recognition (NER). We present two innovations to
improve the performance of these models. The first innovation
is the introduction of residual connections between the
Stacked Recurrent Neural Network model to address the
degradation problem of deep neural networks. The second
innovation is a bias decoding mechanism that allows the
trained system to adapt to non-differentiable and externally
computed objectives, such as the entity-based F-measure. Our
work improves the state-of-the-art results for both Spanish
and English languages on the standard train/development/test
split of the CoNLL 2003 Shared Task NER dataset."
}
@article{wei-2016-disease-ner,
title = "Disease named entity recognition by combining conditional
random fields and bidirectional recurrent neural networks",
author = "Wei, Qikang and Chen, Tao and Xu, Ruifeng and He, Yulan and
Gui, Lin",
journal = "Database",
volume = 2016,
year = 2016,
publisher = "Oxford University Press"
}
@inproceedings{strubell-2017-id-cnn,
title = "Fast and Accurate Entity Recognition with Iterated Dilated
Convolutions",
author = "Strubell, Emma and Verga, Patrick and Belanger, David and
McCallum, Andrew",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1283",
doi = "10.18653/v1/D17-1283",
pages = "2670-2680",
abstract = "Today when many practitioners run basic NLP on the entire web
and large-volume traffic, faster methods are paramount to
saving time and energy costs. Recent advances in GPU hardware
have led to the emergence of bi-directional LSTMs as a
standard method for obtaining per-token vector
representations serving as input to labeling tasks such as
NER (often followed by prediction in a linear-chain
CRF). Though expressive and accurate, these models fail to
fully exploit GPU parallelism, limiting their computational
efficiency. This paper proposes a faster alternative to
Bi-LSTMs for NER: Iterated Dilated Convolutional Neural
Networks (ID-CNNs), which have better capacity than
traditional CNNs for large context and structured
prediction. Unlike LSTMs whose sequential processing on
sentences of length N requires O(N) time even in the face of
parallelism, ID-CNNs permit fixed-depth convolutions to run
in parallel across entire documents. We describe a distinct
combination of network structure, parameter sharing and
training procedures that enable dramatic 14-20x test-time
speedups while retaining accuracy comparable to the
Bi-LSTM-CRF. Moreover, ID-CNNs trained to aggregate context
from the entire document are more accurate than Bi-LSTM-CRFs
while attaining 8x faster test time speeds."
}
@inproceedings{lin-2017-multi-channel-bi-lstm-crf,
title = "Multi-channel {B}i{LSTM}-{CRF} Model for Emerging Named
Entity Recognition in Social Media",
author = "Lin, Bill Y. and Xu, Frank and Luo, Zhiyi and Zhu, Kenny",
booktitle = "Proceedings of the 3rd Workshop on Noisy User-generated Text",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-4421",
doi = "10.18653/v1/W17-4421",
pages = "160-165",
abstract = "In this paper, we present our multi-channel neural
architecture for recognizing emerging named entities in social
media messages, which we applied in the Novel and Emerging
Named Entity Recognition shared task at the EMNLP 2017
Workshop on Noisy User-generated Text (W-NUT). We propose a
novel approach, which incorporates comprehensive word
representations with multi-channel information and
Conditional Random Fields (CRF) into a traditional
Bidirectional Long Short-Term Memory (BiLSTM) neural network
without using any additional hand-crafted features such as
gazetteers. In comparison with other systems participating in
the shared task, our system won the 2nd place."
}
@inproceedings{ghaddar-2018-robust-lexical-features,
title = "Robust Lexical Features for Improved Neural Network
Named-Entity Recognition",
author = "Ghaddar, Abbas and Langlais, Phillippe",
booktitle = "Proceedings of the 27th International Conference on
Computational Linguistics",
month = aug,
year = 2018,
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1161",
pages = "1896-1907",
abstract = "Neural network approaches to Named-Entity Recognition reduce
the need for carefully hand-crafted features. While some
features do remain in state-of-the-art systems, lexical
features have been mostly discarded, with the exception of
gazetteers. In this work, we show that this is unfair:
lexical features are actually quite useful. We propose to
embed words and entity types into a low-dimensional vector
space we train from annotated data produced by distant
supervision thanks to Wikipedia. From this, we compute {---}
offline {---} a feature vector representing each word. When
used with a vanilla recurrent neural network model, this
representation yields substantial improvements. We establish
a new state-of-the-art F1 score of 87.95 on ONTONOTES 5.0,
while matching state-of-the-art performance with a F1 score
of 91.73 on the over-studied CONLL-2003 dataset."
}
@article{wu-2015-clinical-text-ner,
title = "Named entity recognition in Chinese clinical text using deep
neural network",
author = "Wu, Yonghui and Jiang, Min and Lei, Jianbo and Xu, Hua",
journal      = "Studies in Health Technology and Informatics",
volume = 216,
pages = 624,
year = 2015,
publisher = "NIH Public Access"
}
@incollection{zhou-2017-joint-extraction,
title = "Joint extraction of multiple relations and entities by using
a hybrid neural network",
author = "Zhou, Peng and Zheng, Suncong and Xu, Jiaming and Qi, Zhenyu
and Bao, Hongyun and Xu, Bo",
booktitle = "Chinese Computational Linguistics and Natural Language
Processing Based on Naturally Annotated Big Data",
pages = "135-146",
year = 2017,
publisher = "Springer"
}
@article{nguyen-2016-mention-detection-rnn,
author = "Thien Huu Nguyen and Avirup Sil and Georgiana Dinu and Radu
Florian",
title = "Toward Mention Detection Robustness with Recurrent Neural
Networks",
journal = "CoRR",
volume = "abs/1602.07749",
year = 2016,
url = "http://arxiv.org/abs/1602.07749",
archivePrefix= "arXiv",
eprint = "1602.07749",
timestamp = "Mon, 13 Aug 2018 16:48:51 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/NguyenSDF16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{zhai-2017-sequence-chunking,
author = "Feifei Zhai and Saloni Potdar and Bing Xiang and Bowen Zhou",
title = "Neural Models for Sequence Chunking",
journal = "CoRR",
volume = "abs/1701.04027",
year = 2017,
url = "http://arxiv.org/abs/1701.04027",
archivePrefix= "arXiv",
eprint = "1701.04027",
timestamp = "Mon, 13 Aug 2018 16:48:01 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ZhaiPXZ17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{zukov-gregoric-2018-ner-parallel-rnn,
title = "Named Entity Recognition With Parallel Recurrent Neural
Networks",
author = "{\v{Z}}ukov-Gregori{\v{c}}, Andrej and Bachrach, Yoram and
Coope, Sam",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-2012",
doi = "10.18653/v1/P18-2012",
pages = "69-74",
abstract = "We present a new architecture for named entity
recognition. Our model employs multiple independent
bidirectional LSTM units across the same input and promotes
diversity among them by employing an inter-model
regularization term. By distributing computation across
multiple smaller LSTMs we find a significant reduction in the
total number of parameters. We find our architecture achieves
state-of-the-art performance on the CoNLL 2003 NER dataset."
}
@inproceedings{rei-2017-semi-supervised-multitask,
title = "Semi-supervised Multitask Learning for Sequence Labeling",
author = "Rei, Marek",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2017,
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1194",
doi = "10.18653/v1/P17-1194",
pages = "2121-2130",
abstract = "We propose a sequence labeling framework with a secondary
training objective, learning to predict surrounding words for
every word in the dataset. This language modeling objective
incentivises the system to learn general-purpose patterns of
semantic and syntactic composition, which are also useful for
improving accuracy on different sequence labeling tasks. The
architecture was evaluated on a range of datasets, covering
the tasks of error detection in learner texts, named entity
recognition, chunking and POS-tagging. The novel language
modeling objective provided consistent performance
improvements on every benchmark, without requiring any
additional annotated or unannotated data."
}
@inproceedings{zhuo-2016-gated-recursive-semi-markov-crf,
title = "Segment-Level Sequence Modeling using Gated Recursive
Semi-{M}arkov Conditional Random Fields",
author = "Zhuo, Jingwei and Cao, Yong and Zhu, Jun and Zhang, Bo and
Nie, Zaiqing",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = 2016,
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P16-1134",
doi = "10.18653/v1/P16-1134",
pages = "1413-1423"
}
@inproceedings{ye-2018-hybrid-markov-crf,
title = "Hybrid semi-{M}arkov {CRF} for Neural Sequence Labeling",
author = "Ye, Zhixiu and Ling, Zhen-Hua",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-2038",
doi = "10.18653/v1/P18-2038",
pages = "235-240",
abstract = "This paper proposes hybrid semi-Markov conditional random
fields (SCRFs) for neural sequence labeling in natural
language processing. Based on conventional conditional random
fields (CRFs), SCRFs have been designed for the tasks of
assigning labels to segments by extracting features from and
describing transitions between segments instead of words. In
this paper, we improve the existing SCRF methods by employing
word-level and segment-level information
simultaneously. First, word-level labels are utilized to
derive the segment scores in SCRFs. Second, a CRF output
layer and an SCRF output layer are integrated into a unified
neural network and trained jointly. Experimental results on
CoNLL 2003 named entity recognition (NER) shared task show
that our model achieves state-of-the-art performance when no
external knowledge is used."
}
@inproceedings{aguilar-2017-multi-task-ner,
title = "A Multi-task Approach for Named Entity Recognition in Social
Media Data",
author = "Aguilar, Gustavo and Maharjan, Suraj and L{\'o}pez-Monroy,
Adrian Pastor and Solorio, Thamar",
booktitle = "Proceedings of the 3rd Workshop on Noisy User-generated Text",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-4419",
doi = "10.18653/v1/W17-4419",
pages = "148-153",
abstract = "Named Entity Recognition for social media data is challenging
because of its inherent noisiness. In addition to improper
grammatical structures, it contains spelling inconsistencies
and numerous informal abbreviations. We propose a novel
multi-task approach by employing a more general secondary
task of Named Entity (NE) segmentation together with the
primary task of fine-grained NE categorization. The
multi-task neural network architecture learns higher order
feature representations from word and character sequences
along with basic Part-of-Speech tags and gazetteer
information. This neural network acts as a feature extractor
to feed a Conditional Random Fields classifier. We were able
to obtain the first position in the 3rd Workshop on Noisy
User-generated Text (WNUT-2017) with a 41.86{\%} entity
F1-score and a 40.24{\%} surface F1-score."
}
@inproceedings{peng-2017-multi-task-sequence-tagging,
title = "Multi-task Domain Adaptation for Sequence Tagging",
author = "Peng, Nanyun and Dredze, Mark",
booktitle = "Proceedings of the 2nd Workshop on Representation Learning
for {NLP}",
month = aug,
year = 2017,
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-2612",
doi = "10.18653/v1/W17-2612",
pages = "91-100",
abstract = "Many domain adaptation approaches rely on learning cross
domain shared representations to transfer the knowledge
learned in one domain to other domains. Traditional domain
adaptation only considers adapting for one task. In this
paper, we explore multi-task representation learning under
the domain adaptation scenario. We propose a neural network
framework that supports domain adaptation for multiple tasks
simultaneously, and learns shared representations that better
generalize for domain adaptation. We apply the proposed
framework to domain adaptation for sequence tagging problems
considering two tasks: Chinese word segmentation and named
entity recognition. Experiments show that multi-task domain
adaptation works better than disjoint domain adaptation for
each task, and achieves the state-of-the-art results for both
tasks in the social media domain."
}
@article{pan-2013-transfer-joint-embedding,
title = "Transfer joint embedding for cross-domain named entity
recognition",
author = "Pan, Sinno Jialin and Toh, Zhiqiang and Su, Jian",
journal = "ACM Transactions on Information Systems (TOIS)",
volume = 31,
number = 2,
pages = 7,
year = 2013,
publisher = "ACM"
}
@inproceedings{qu-2016-ner-transfer-learning,
title = "Named Entity Recognition for Novel Types by Transfer
Learning",
author = "Qu, Lizhen and Ferraro, Gabriela and Zhou, Liyuan and Hou,
Weiwei and Baldwin, Timothy",
booktitle = "Proceedings of the 2016 Conference on Empirical Methods in
Natural Language Processing",
month = nov,
year = 2016,
address = "Austin, Texas",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D16-1087",
doi = "10.18653/v1/D16-1087",
pages = "899-905"
}
@article{yang-2017-transfer-learning-hierachical-rnn,
author = "Zhilin Yang and Ruslan Salakhutdinov and William W. Cohen",
title = "Transfer Learning for Sequence Tagging with Hierarchical
Recurrent Networks",
journal = "CoRR",
volume = "abs/1703.06345",
year = 2017,
url = "http://arxiv.org/abs/1703.06345",
archivePrefix= "arXiv",
eprint = "1703.06345",
timestamp = "Mon, 13 Aug 2018 16:48:14 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/YangSC17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{daeniken-2017-transfer-learning-ner,
title = "Transfer Learning and Sentence Level Features for Named
Entity Recognition on Tweets",
author = "von D{\"a}niken, Pius and Cieliebak, Mark",
booktitle = "Proceedings of the 3rd Workshop on Noisy User-generated Text",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-4422",
doi = "10.18653/v1/W17-4422",
pages = "166-171",
abstract = "We present our system for the WNUT 2017 Named Entity
Recognition challenge on Twitter data. We describe two
modifications of a basic neural network architecture for
sequence tagging. First, we show how we exploit additional
labeled data, where the Named Entity tags differ from the
target task. Then, we propose a way to incorporate sentence
level features. Our system uses both methods and ranked
second for entity level annotations, achieving an F1-score of
40.78, and second for surface form annotations, achieving an
F1-score of 39.33."
}
@inproceedings{zhao-2018-multi-task-data-selection,
title = "Improve Neural Entity Recognition via Multi-Task Data
Selection and Constrained Decoding",
author = "Zhao, Huasha and Yang, Yi and Zhang, Qiong and Si, Luo",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 2 (Short Papers)",
month = jun,
year = 2018,
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N18-2056",
doi = "10.18653/v1/N18-2056",
pages = "346-351",
abstract = "Entity recognition is a widely benchmarked task in natural
language processing due to its massive applications. The
state-of-the-art solution applies a neural architecture named
BiLSTM-CRF to model the language sequences. In this paper, we
propose an entity recognition system that improves this
neural architecture with two novel techniques. The first
technique is Multi-Task Data Selection, which ensures the
consistency of data distribution and labeling guidelines
between source and target datasets. The other one is
constrained decoding using knowledge base. The decoder of the
model operates at the document level, and leverages global
and external information sources to further improve
performance. Extensive experiments have been conducted to
show the advantages of each technique. Our system achieves
state-of-the-art results on the English entity recognition
task in KBP 2017 official evaluation, and it also yields very
strong results in other languages."
}
@inproceedings{lin-2018-neural-adaptation-layers,
title = "Neural Adaptation Layers for Cross-domain Named Entity
Recognition",
author = "Lin, Bill Yuchen and Lu, Wei",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1226",
doi = "10.18653/v1/D18-1226",
pages = "2012-2022",
abstract = "Recent research efforts have shown that neural architectures
can be effective in conventional information extraction tasks
such as named entity recognition, yielding state-of-the-art
results on standard newswire datasets. However, despite
significant resources required for training such models, the
performance of a model trained on one domain typically
degrades dramatically when applied to a different domain, yet
extracting entities from new emerging domains such as social
media can be of significant interest. In this paper, we
empirically investigate effective methods for conveniently
adapting an existing, well-trained neural NER model for a new
domain. Unlike existing approaches, we propose lightweight
yet effective methods for performing domain adaptation for
neural models. Specifically, we introduce adaptation layers
on top of existing neural architectures, where no re-training
using the source domain data is required. We conduct
extensive empirical studies and show that our approach
significantly outperforms state-of-the-art methods."
}
@article{shen-2017-deep-active-learning,
author = "Yanyao Shen and Hyokun Yun and Zachary C. Lipton and Yakov
Kronrod and Animashree Anandkumar",
title = "Deep Active Learning for Named Entity Recognition",
journal = "CoRR",
volume = "abs/1707.05928",
year = 2017,
url = "http://arxiv.org/abs/1707.05928",
archivePrefix= "arXiv",
eprint = "1707.05928",
timestamp = "Mon, 13 Aug 2018 16:47:29 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ShenYLKA17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{narasimhan-2016-ie-reinforcement-learning,
author = "Karthik Narasimhan and Adam Yala and Regina Barzilay",
title = "Improving Information Extraction by Acquiring External
Evidence with Reinforcement Learning",
journal = "CoRR",
volume = "abs/1603.07954",
year = 2016,
url = "http://arxiv.org/abs/1603.07954",
archivePrefix= "arXiv",
eprint = "1603.07954",
timestamp = "Mon, 13 Aug 2018 16:48:30 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/NarasimhanYB16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{zhou-2019-datnet,
title = "Dual Adversarial Neural Transfer for Low-Resource Named
Entity Recognition",
author = "Zhou, Joey Tianyi and Zhang, Hao and Jin, Di and Zhu,
Hongyuan and Fang, Meng and Goh, Rick Siow Mong and Kwok,
Kenneth",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1336",
doi = "10.18653/v1/P19-1336",
pages = "3461-3471",
abstract = "We propose a new neural transfer method termed Dual
Adversarial Transfer Network (DATNet) for addressing
low-resource Named Entity Recognition (NER). Specifically,
two variants of DATNet, i.e., DATNet-F and DATNet-P, are
investigated to explore effective feature fusion between high
and low resource. To address the noisy and imbalanced
training data, we propose a novel Generalized
Resource-Adversarial Discriminator (GRAD). Additionally,
adversarial training is adopted to boost model
generalization. In experiments, we examine the effects of
different components in DATNet across domains and languages
and show that significant improvement can be obtained
especially for low-resource data, without augmenting any
additional hand-crafted features and pre-trained language
model."
}
@inproceedings{zukov-gregoric-2017-ner-self-attention,
title = "Neural named entity recognition using a self-attention
mechanism",
author = "Zukov-Gregoric, Andrej and Bachrach, Yoram and Minkovsky,
Pasha and Coope, Sam and Maksak, Bogdan",
booktitle = "2017 IEEE 29th International Conference on Tools with
Artificial Intelligence (ICTAI)",
pages = "652-656",
year = 2017,
organization = "IEEE"
}
@inproceedings{xu-2018-ner-global-attention,
title = "Improving clinical named entity recognition with global
neural attention",
author = "Xu, Guohai and Wang, Chengyu and He, Xiaofeng",
booktitle = "Asia-Pacific Web (APWeb) and Web-Age Information Management
(WAIM) Joint International Conference on Web and Big Data",
pages = "264-279",
year = 2018,
organization = "Springer"
}
@article{li-2018-survey-nlp,
author = "Jing Li and Aixin Sun and Jianglei Han and Chenliang Li",
title = "A Survey on Deep Learning for Named Entity Recognition",
journal = "CoRR",
volume = "abs/1812.09449",
year = 2018,
url = "http://arxiv.org/abs/1812.09449",
archivePrefix= "arXiv",
eprint = "1812.09449",
timestamp = "Mon, 28 Jan 2019 16:41:27 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1812-09449",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{zhu-2019-can-ner,
title = "{CAN}-{NER}: {C}onvolutional {A}ttention {N}etwork for
{C}hinese {N}amed {E}ntity {R}ecognition",
author = "Zhu, Yuying and Wang, Guoxin",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long and Short
Papers)",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N19-1342",
doi = "10.18653/v1/N19-1342",
pages = "3384-3393",
abstract = "Named entity recognition (NER) in Chinese is essential but
difficult because of the lack of natural
delimiters. Therefore, Chinese Word Segmentation (CWS) is
usually considered as the first step for Chinese
NER. However, models based on word-level embeddings and
lexicon features often suffer from segmentation errors and
out-of-vocabulary (OOV) words. In this paper, we investigate
a Convolutional Attention Network called CAN for Chinese NER,
which consists of a character-based convolutional neural
network (CNN) with local-attention layer and a gated
recurrent unit (GRU) with global self-attention layer to
capture the information from adjacent characters and sentence
contexts. Also, compared to other models, not depending on
any external resources like lexicons and employing small size
of char embeddings make our model more practical. Extensive
experimental results show that our approach outperforms
state-of-the-art methods without word embedding and external
lexicon resources on different domain datasets including
Weibo, MSRA and Chinese Resume NER dataset."
}
@inproceedings{guan-2019-bert-lstm-crf,
title = "New Research on Transfer Learning Model of Named Entity
Recognition",
author = "Guan, Guoliang and Zhu, Min",
booktitle = "Journal of Physics: Conference Series",
volume = 1267,
number = 1,
pages = "012017",
year = 2019,
organization = "IOP Publishing"
}
@inproceedings{arkhipov-2019-multilingual-transforms,
title = "Tuning multilingual transformers for language-specific named
entity recognition",
author = "Arkhipov, Mikhail and Trofimova, Maria and Kuratov, Yurii and
Sorokin, Alexey",
booktitle = "Proceedings of the 7th Workshop on Balto-Slavic Natural
Language Processing",
pages = "89-93",
year = 2019
}
@ARTICLE{zadeh-2019-fmt,
author = "{Zadeh}, Amir and {Mao}, Chengfeng and {Shi}, Kelly and
{Zhang}, Yiwei and {Liang}, Paul Pu and {Poria}, Soujanya and
{Morency}, Louis-Philippe",
title = "{Factorized Multimodal Transformer for Multimodal Sequential
Learning}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Computation and Language, Statistics - Machine Learning",
year = 2019,
month = "Nov",
eid = "arXiv:1911.09826",
pages = "arXiv:1911.09826",
archivePrefix= "arXiv",
eprint = "1911.09826",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv191109826Z",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{liu-2017-lm-lstm-crf,
author = "Liyuan Liu and Jingbo Shang and Frank F. Xu and Xiang Ren and
Huan Gui and Jian Peng and Jiawei Han",
title = "Empower Sequence Labeling with Task-Aware Neural Language
Model",
journal = "CoRR",
volume = "abs/1709.04109",
year = 2017,
url = "http://arxiv.org/abs/1709.04109",
archivePrefix= "arXiv",
eprint = "1709.04109",
timestamp = "Mon, 13 Aug 2018 16:47:53 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1709-04109",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{liu-2018-lm-pruning,
title = "Efficient Contextualized Representation: Language Model
Pruning for Sequence Labeling",
author = "Liu, Liyuan and Ren, Xiang and Shang, Jingbo and Gu, Xiaotao
and Peng, Jian and Han, Jiawei",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1153",
doi = "10.18653/v1/D18-1153",
pages = "1215-1225",
abstract = "Many efforts have been made to facilitate natural language
processing tasks with pre-trained language models (LMs), and
brought significant improvements to various applications. To
fully leverage the nearly unlimited corpora and capture
linguistic information of multifarious levels, large-size LMs
are required; but for a specific task, only parts of these
information are useful. Such large-sized LMs, even in the
inference stage, may cause heavy computation workloads,
making them too time-consuming for large-scale
applications. Here we propose to compress bulky LMs while
preserving useful information with regard to a specific
task. As different layers of the model keep different
information, we develop a layer selection method for model
pruning using sparsity-inducing regularization. By
introducing the dense connectivity, we can detach any layer
without affecting others, and stretch shallow and wide LMs to
be deep and narrow. In model training, LMs are learned with
layer-wise dropouts for better robustness. Experiments on two
benchmark datasets demonstrate the effectiveness of our
method."
}
@article{liu-2018-non-local-nn,
author = "Pengfei Liu and Shuaichen Chang and Xuanjing Huang and Jian
Tang and Jackie Chi Kit Cheung",
title = "Contextualized Non-local Neural Networks for Sequence
Learning",
journal = "CoRR",
volume = "abs/1811.08600",
year = 2018,
url = "http://arxiv.org/abs/1811.08600",
archivePrefix= "arXiv",
eprint = "1811.08600",
timestamp = "Mon, 26 Nov 2018 12:52:45 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1811-08600",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{chen-2019-grn,
author = "Hui Chen and Zijia Lin and Guiguang Ding and Jianguang Lou
and Yusen Zhang and B{\"{o}}rje Karlsson",
title = "{GRN:} Gated Relation Network to Enhance Convolutional Neural
Network for Named Entity Recognition",
journal = "CoRR",
volume = "abs/1907.05611",
year = 2019,
url = "http://arxiv.org/abs/1907.05611",
archivePrefix= "arXiv",
eprint = "1907.05611",
timestamp = "Thu, 10 Oct 2019 11:51:45 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1907-05611",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{guo-2019-star-transformer,
author = "Qipeng Guo and Xipeng Qiu and Pengfei Liu and Yunfan Shao and
Xiangyang Xue and Zheng Zhang",
title = "Star-Transformer",
journal = "CoRR",
volume = "abs/1902.09113",
year = 2019,
url = "http://arxiv.org/abs/1902.09113",
archivePrefix= "arXiv",
eprint = "1902.09113",
timestamp = "Tue, 21 May 2019 18:03:39 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1902-09113",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{yan-2019-tener,
author = "{Yan}, Hang and {Deng}, Bocao and {Li}, Xiaonan and {Qiu},
Xipeng",
title = "{TENER: Adapting Transformer Encoder for Named Entity
Recognition}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Machine Learning",
year = 2019,
month = "Nov",
eid = "arXiv:1911.04474",
pages = "arXiv:1911.04474",
archivePrefix= "arXiv",
eprint = "1911.04474",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv191104474Y",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{xu-2020-cluener,
author = "{Xu}, Liang and {tong}, Yu and {Dong}, Qianqian and {Liao},
Yixuan and {Yu}, Cong and {Tian}, Yin and {Liu}, Weitang and
{Li}, Lu and {Liu}, Caiquan and {Zhang}, Xuanwei",
title = "{CLUENER2020: Fine-grained Named Entity Recognition Dataset
and Benchmark for Chinese}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Information Retrieval, Computer Science - Machine Learning",
year = 2020,
month = "Jan",
eid = "arXiv:2001.04351",
pages = "arXiv:2001.04351",
archivePrefix= "arXiv",
eprint = "2001.04351",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2020arXiv200104351X",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{crichton-2017-multi-task-bio-ner,
title = "A neural network multi-task learning approach to biomedical
named entity recognition",
author = "Crichton, Gamal and Pyysalo, Sampo and Chiu, Billy and
Korhonen, Anna",
journal = "BMC bioinformatics",
volume = 18,
number = 1,
pages = 368,
year = 2017,
publisher = "BioMed Central"
}
@ARTICLE{li-2015-ggs-nn,
author = "{Li}, Yujia and {Tarlow}, Daniel and {Brockschmidt}, Marc and
{Zemel}, Richard",
title = "{Gated Graph Sequence Neural Networks}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Artificial Intelligence, Computer Science - Neural and
Evolutionary Computing, Statistics - Machine Learning",
year = 2015,
month = "Nov",
eid = "arXiv:1511.05493",
pages = "arXiv:1511.05493",
archivePrefix= "arXiv",
eprint = "1511.05493",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2015arXiv151105493L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{battaglia-2018-gcn,
author = "{Battaglia}, Peter W. and {Hamrick}, Jessica B. and {Bapst},
Victor and {Sanchez-Gonzalez}, Alvaro and {Zambaldi},
Vinicius and {Malinowski}, Mateusz and {Tacchetti}, Andrea
and {Raposo}, David and {Santoro}, Adam and {Faulkner}, Ryan
and {Gulcehre}, Caglar and {Song}, Francis and {Ballard},
Andrew and {Gilmer}, Justin and {Dahl}, George and {Vaswani},
Ashish and {Allen}, Kelsey and {Nash}, Charles and
{Langston}, Victoria and {Dyer}, Chris and {Heess}, Nicolas
and {Wierstra}, Daan and {Kohli}, Pushmeet and {Botvinick},
Matt and {Vinyals}, Oriol and {Li}, Yujia and {Pascanu},
Razvan",
title = "{Relational inductive biases, deep learning, and graph
networks}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Artificial Intelligence, Statistics - Machine Learning",
year = 2018,
month = "Jun",
eid = "arXiv:1806.01261",
pages = "arXiv:1806.01261",
archivePrefix= "arXiv",
eprint = "1806.01261",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2018arXiv180601261B",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{limsopatham-2016-bi-lstm-twitter,
title = "Bidirectional {LSTM} for Named Entity Recognition in Twitter
Messages",
author = "Limsopatham, Nut and Collier, Nigel",
booktitle = "Proceedings of the 2nd Workshop on Noisy User-generated Text
({WNUT})",
month = dec,
year = 2016,
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://www.aclweb.org/anthology/W16-3920",
pages = "145-152",
abstract = "In this paper, we present our approach for named entity
recognition in Twitter messages that we used in our
participation in the Named Entity Recognition in Twitter
shared task at the COLING 2016 Workshop on Noisy
User-generated text (WNUT). The main challenge that we aim to
tackle in our participation is the short, noisy and
colloquial nature of tweets, which makes named entity
recognition in Twitter message a challenging task. In
particular, we investigate an approach for dealing with this
problem by enabling bidirectional long short-term memory
(LSTM) to automatically learn orthographic features without
requiring feature engineering. In comparison with other
systems participating in the shared task, our system achieved
the most effective performance on both the {`}segmentation
and categorisation{'} and the {`}segmentation only{'}
sub-tasks."
}
@incollection{sarawagi-2005-scrf,
title = "Semi-Markov Conditional Random Fields for Information
Extraction",
author = "Sunita Sarawagi and Cohen, William W",
booktitle = "Advances in Neural Information Processing Systems 17",
editor = "L. K. Saul and Y. Weiss and L. Bottou",
pages = "1185-1192",
year = 2005,
publisher = "MIT Press",
url =
"http://papers.nips.cc/paper/2648-semi-markov-conditional-random-fields-for-information-extraction.pdf"
}
@article{nadeau-2007-survey-ner,
title = "A survey of named entity recognition and classification",
author = "Nadeau, David and Sekine, Satoshi",
journal = "Lingvisticae Investigationes",
volume = 30,
number = 1,
pages = "3-26",
year = 2007,
publisher = "John Benjamins"
}
@article{夏光辉-2015-基于实体词典与机器学习的基因命名实体识别,
title = "基于实体词典与机器学习的基因命名实体识别",
author = "夏光辉 and 李军莲 and 阮学平",
journal = "医学信息学杂志",
number = 12,
pages = "54-60",
year = 2015
}
@inproceedings{wu-2018-eval-sl-features,
title = "Evaluating the Utility of Hand-crafted Features in Sequence
Labelling",
author = "Wu, Minghao and Liu, Fei and Cohn, Trevor",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1310",
doi = "10.18653/v1/D18-1310",
pages = "2850-2856",
abstract = "Conventional wisdom is that hand-crafted features are
redundant for deep learning models, as they already learn
adequate representations of text automatically from
corpora. In this work, we test this claim by proposing a new
method for exploiting handcrafted features as part of a novel
hybrid learning approach, incorporating a feature
auto-encoder loss component. We evaluate on the task of named
entity recognition (NER), where we show that including manual
features for part-of-speech, word shapes and gazetteers can
improve the performance of a neural CRF model. We obtain an
F1 of 91.89 for the CoNLL-2003 English shared task, which
significantly outperforms a collection of highly competitive
baseline models. We also present an ablation study showing
the importance of auto-encoding, over using features as
either inputs or outputs alone, and moreover, show including
the autoencoder components reduces training requirements to
60{\%}, while retaining the same predictive accuracy."
}
@inproceedings{zhang-2018-adapt-co-attention-ner,
title = "Adaptive co-attention network for named entity recognition in
tweets",
author = "Zhang, Qi and Fu, Jinlan and Liu, Xiaoyu and Huang, Xuanjing",
booktitle = "Thirty-Second AAAI Conference on Artificial Intelligence",
year = 2018
}
@inproceedings{greenberg-2018-disjoint-label-sets-ner,
title = "Marginal Likelihood Training of {B}i{LSTM}-{CRF} for
Biomedical Named Entity Recognition from Disjoint Label Sets",
author = "Greenberg, Nathan and Bansal, Trapit and Verga, Patrick and
McCallum, Andrew",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1306",
doi = "10.18653/v1/D18-1306",
pages = "2824-2829",
abstract = "Extracting typed entity mentions from text is a fundamental
component to language understanding and reasoning. While
there exist substantial labeled text datasets for multiple
subsets of biomedical entity types{---}such as genes and
proteins, or chemicals and diseases{---}it is rare to find
large labeled datasets containing labels for all desired
entity types together. This paper presents a method for
training a single CRF extractor from multiple datasets with
disjoint or partially overlapping sets of entity types. Our
approach employs marginal likelihood training to insist on
labels that are present in the data, while filling in
{``}missing labels{''}. This allows us to leverage all the
available data within a single model. In experimental results
on the Biocreative V CDR (chemicals/diseases), Biocreative VI
ChemProt (chemicals/proteins) and MedMentions (19 entity
types) datasets, we show that joint training on multiple
datasets improves NER F1 over training in isolation, and our
methods achieve state-of-the-art results."
}
@inproceedings{cao-2018-adv-ner,
title = "Adversarial Transfer Learning for {C}hinese Named Entity
Recognition with Self-Attention Mechanism",
author = "Cao, Pengfei and Chen, Yubo and Liu, Kang and Zhao, Jun and
Liu, Shengping",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1017",
doi = "10.18653/v1/D18-1017",
pages = "182-192",
abstract = "Named entity recognition (NER) is an important task in
natural language processing area, which needs to determine
entities boundaries and classify them into pre-defined
categories. For Chinese NER task, there is only a very small
amount of annotated data available. Chinese NER task and
Chinese word segmentation (CWS) task have many similar word
boundaries. There are also specificities in each
task. However, existing methods for Chinese NER either do not
exploit word boundary information from CWS or cannot filter
the specific information of CWS. In this paper, we propose a
novel adversarial transfer learning framework to make full
use of task-shared boundaries information and prevent the
task-specific features of CWS. Besides, since arbitrary
character can provide important cues when predicting entity
type, we exploit self-attention to explicitly capture long
range dependencies between two tokens. Experimental results
on two different widely used datasets show that our proposed
model significantly and consistently outperforms other
state-of-the-art methods."
}
@inproceedings{yu-2018-char-lm-ner,
title = "On the Strength of Character Language Models for Multilingual
Named Entity Recognition",
author = "Yu, Xiaodong and Mayhew, Stephen and Sammons, Mark and Roth,
Dan",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1345",
doi = "10.18653/v1/D18-1345",
pages = "3073-3077",
abstract = "Character-level patterns have been widely used as features in
English Named Entity Recognition (NER) systems. However, to
date there has been no direct investigation of the inherent
differences between name and non-name tokens in text, nor
whether this property holds across multiple languages. This
paper analyzes the capabilities of corpus-agnostic
Character-level Language Models (CLMs) in the binary task of
distinguishing name tokens from non-name tokens. We
demonstrate that CLMs provide a simple and powerful model for
capturing these differences, identifying named entity tokens
in a diverse set of languages at close to the performance of
full NER systems. Moreover, by adding very simple CLM-based
features we can significantly improve the performance of an
off-the-shelf NER system for multiple languages."
}
@article{savarese-2016-residual-gates,
author = "Pedro H. P. Savarese",
title = "Learning Identity Mappings with Residual Gates",
journal = "CoRR",
volume = "abs/1611.01260",
year = 2016,
url = "http://arxiv.org/abs/1611.01260",
archivePrefix= "arXiv",
eprint = "1611.01260",
timestamp = "Mon, 13 Aug 2018 16:48:22 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/Savarese16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{weiss-2016-survey-transfer-learning,
title = "A survey of transfer learning",
author = "Weiss, Karl and Khoshgoftaar, Taghi M and Wang, DingDing",
journal = "Journal of Big data",
volume = 3,
number = 1,
pages = 9,
year = 2016,
publisher = "SpringerOpen"
}
@ARTICLE{thulasidasan-2019-dac-loss,
author = "{Thulasidasan}, Sunil and {Bhattacharya}, Tanmoy and
{Bilmes}, Jeff and {Chennupati}, Gopinath and {Mohd-Yusof},
Jamal",
title = "{Combating Label Noise in Deep Learning Using Abstention}",
journal = "arXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Machine
Learning",
year = 2019,
month = may,
eid = "arXiv:1905.10964",
pages = "arXiv:1905.10964",
archivePrefix= "arXiv",
eprint = "1905.10964",
primaryClass = "stat.ML",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190510964T",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{lin-2017-focal-loss,
author = "Tsung{-}Yi Lin and Priya Goyal and Ross B. Girshick and
Kaiming He and Piotr Doll{\'{a}}r",
title = "Focal Loss for Dense Object Detection",
journal = "CoRR",
volume = "abs/1708.02002",
year = 2017,
url = "http://arxiv.org/abs/1708.02002",
archivePrefix= "arXiv",
eprint = "1708.02002",
timestamp = "Mon, 13 Aug 2018 16:46:12 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1708-02002.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{shrivastava-2016-ohem,
author = "{Shrivastava}, Abhinav and {Gupta}, Abhinav and {Girshick},
Ross",
title = "{Training Region-based Object Detectors with Online Hard
Example Mining}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Machine Learning",
year = 2016,
month = apr,
eid = "arXiv:1604.03540",
pages = "arXiv:1604.03540",
archivePrefix= "arXiv",
eprint = "1604.03540",
primaryClass = "cs.CV",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160403540S",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{zhou-2017-east,
author = "Xinyu Zhou and Cong Yao and He Wen and Yuzhi Wang and
Shuchang Zhou and Weiran He and Jiajun Liang",
title = "{EAST:} An Efficient and Accurate Scene Text Detector",
journal = "CoRR",
volume = "abs/1704.03155",
year = 2017,
url = "http://arxiv.org/abs/1704.03155",
archivePrefix= "arXiv",
eprint = "1704.03155",
timestamp = "Mon, 13 Aug 2018 16:48:38 +0200",
biburl = "https://dblp.org/rec/journals/corr/ZhouYWWZHL17.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{xie-2015-bce-loss,
author = "Saining Xie and Zhuowen Tu",
title = "Holistically-Nested Edge Detection",
journal = "CoRR",
volume = "abs/1504.06375",
year = 2015,
url = "http://arxiv.org/abs/1504.06375",
archivePrefix= "arXiv",
eprint = "1504.06375",
timestamp = "Mon, 13 Aug 2018 16:46:00 +0200",
biburl = "https://dblp.org/rec/journals/corr/XieT15.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{sokolova-2009-measure-analysis,
title = "A systematic analysis of performance measures for
classification tasks",
journal = "Information Processing \& Management",
volume = 45,
number = 4,
pages = "427-437",
year = 2009,
issn = "0306-4573",
doi = "https://doi.org/10.1016/j.ipm.2009.03.002",
url =
"http://www.sciencedirect.com/science/article/pii/S0306457309000259",
author = "Marina Sokolova and Guy Lapalme",
keywords = "Performance evaluation, Machine Learning, Text
classification",
abstract = "This paper presents a systematic analysis of twenty four
performance measures used in the complete spectrum of Machine
Learning classification tasks, i.e., binary, multi-class,
multi-labelled, and hierarchical. For each classification
task, the study relates a set of changes in a confusion
matrix to specific characteristics of data. Then the analysis
concentrates on the type of changes to a confusion matrix
that do not change a measure, therefore, preserve a
classifier’s evaluation (measure invariance). The result is
the measure invariance taxonomy with respect to all relevant
label distribution changes in a classification problem. This
formal analysis is supported by examples of applications
where invariance properties of measures lead to a more
reliable evaluation of classifiers. Text classification
supplements the discussion with several case studies."
}
@inproceedings{yang-2018-sgm,
title = "{SGM}: Sequence Generation Model for Multi-label
Classification",
author = "Yang, Pengcheng and Sun, Xu and Li, Wei and Ma, Shuming and
Wu, Wei and Wang, Houfeng",
booktitle = "Proceedings of the 27th International Conference on
Computational Linguistics",
month = aug,
year = 2018,
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1330",
pages = "3915-3926",
abstract = "Multi-label classification is an important yet challenging
task in natural language processing. It is more complex than
single-label classification in that the labels tend to be
correlated. Existing methods tend to ignore the correlations
between labels. Besides, different parts of the text can
contribute differently for predicting different labels, which
is not considered by existing models. In this paper, we
propose to view the multi-label classification task as a
sequence generation problem, and apply a sequence generation
model with a novel decoder structure to solve it. Extensive
experimental results show that our proposed methods
outperform previous work by a substantial margin. Further
analysis of experimental results demonstrates that the
proposed methods not only capture the correlations between
labels, but also select the most informative words
automatically when predicting different labels."
}
@article{tay-2018-cafe,
author = "Yi Tay and Luu Anh Tuan and Siu Cheung Hui",
title = "A Compare-Propagate Architecture with Alignment Factorization
for Natural Language Inference",
journal = "CoRR",
volume = "abs/1801.00102",
year = 2018,
url = "http://arxiv.org/abs/1801.00102",
archivePrefix= "arXiv",
eprint = "1801.00102",
timestamp = "Mon, 13 Aug 2018 16:47:31 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1801-00102.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{lan-2019-albert,
author = "{Lan}, Zhenzhong and {Chen}, Mingda and {Goodman}, Sebastian
and {Gimpel}, Kevin and {Sharma}, Piyush and {Soricut}, Radu",
title = "{ALBERT: A Lite BERT for Self-supervised Learning of Language
Representations}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence",
year = 2019,
month = sep,
eid = "arXiv:1909.11942",
pages = "arXiv:1909.11942",
archivePrefix= "arXiv",
eprint = "1909.11942",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190911942L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{jiao-2019-tinybert,
author = "{Jiao}, Xiaoqi and {Yin}, Yichun and {Shang}, Lifeng and
{Jiang}, Xin and {Chen}, Xiao and {Li}, Linlin and {Wang},
Fang and {Liu}, Qun",
title = "{TinyBERT: Distilling BERT for Natural Language
Understanding}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Machine
Learning",
year = 2019,
month = sep,
eid = "arXiv:1909.10351",
pages = "arXiv:1909.10351",
archivePrefix= "arXiv",
eprint = "1909.10351",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190910351J",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{joshi-2019-spanbert,
author = "Mandar Joshi and Danqi Chen and Yinhan Liu and Daniel S. Weld
and Luke Zettlemoyer and Omer Levy",
title = "SpanBERT: Improving Pre-training by Representing and
Predicting Spans",
journal = "CoRR",
volume = "abs/1907.10529",
year = 2019,
url = "http://arxiv.org/abs/1907.10529",
archivePrefix= "arXiv",
eprint = "1907.10529",
timestamp = "Thu, 01 Aug 2019 08:59:33 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1907-10529.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@misc{radford-2019-gpt2,
title = {Language Models are Unsupervised Multitask Learners},
author = {Alec Radford and Jeffrey Wu and Rewon Child and David Luan
and Dario Amodei and Ilya Sutskever},
note = {OpenAI Technical Report},
year = 2019
}
@article{boutell-2004-binary-relevance,
title = "Learning multi-label scene classification",
journal = "Pattern Recognition",
volume = 37,
number = 9,
pages = "1757-1771",
year = 2004,
issn = "0031-3203",
doi = "https://doi.org/10.1016/j.patcog.2004.03.009",
url =
"http://www.sciencedirect.com/science/article/pii/S0031320304001074",
author = "Matthew R. Boutell and Jiebo Luo and Xipeng Shen and
Christopher M. Brown",
keywords = "Image understanding, Semantic scene classification,
Multi-label classification, Multi-label training, Multi-label
evaluation, Image organization, Cross-training, Jaccard
similarity",
abstract = "In classic pattern recognition problems, classes are mutually
exclusive by definition. Classification errors occur when the
classes overlap in the feature space. We examine a different
situation, occurring when the classes are, by definition, not
mutually exclusive. Such problems arise in semantic scene and
document classification and in medical diagnosis. We present
a framework to handle such problems and apply it to the
problem of semantic scene classification, where a natural
scene may contain multiple objects such that the scene can be
described by multiple class labels (e.g., a field scene with
a mountain in the background). Such a problem poses
challenges to the classic pattern recognition paradigm and
demands a different treatment. We discuss approaches for
training and testing in this scenario and introduce new
metrics for evaluating individual examples, class recall and
precision, and overall accuracy. Experiments show that our
methods are suitable for scene classification; furthermore,
our work appears to generalize to other classification
problems of the same nature."
}
@inproceedings{read-2009-classifier-chains,
title = "Classifier chains for multi-label classification",
author = "Read, Jesse and Pfahringer, Bernhard and Holmes, Geoff and
Frank, Eibe",
booktitle = "Joint European Conference on Machine Learning and Knowledge
Discovery in Databases",
pages = "254-269",
year = 2009,
organization = "Springer"
}
@inproceedings{zhang-2019-ernie,
title = "{ERNIE}: Enhanced Language Representation with Informative
Entities",
author = "Zhang, Zhengyan and Han, Xu and Liu, Zhiyuan and Jiang, Xin
and Sun, Maosong and Liu, Qun",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1139",
doi = "10.18653/v1/P19-1139",
pages = "1441-1451",
abstract = "Neural language representation models such as BERT
pre-trained on large-scale corpora can well capture rich
semantic patterns from plain text, and be fine-tuned to
consistently improve the performance of various NLP
tasks. However, the existing pre-trained language models
rarely consider incorporating knowledge graphs (KGs), which
can provide rich structured knowledge facts for better
language understanding. We argue that informative entities in
KGs can enhance language representation with external
knowledge. In this paper, we utilize both large-scale textual
corpora and KGs to train an enhanced language representation
model (ERNIE), which can take full advantage of lexical,
syntactic, and knowledge information simultaneously. The
experimental results have demonstrated that ERNIE achieves
significant improvements on various knowledge-driven tasks,
and meanwhile is comparable with the state-of-the-art model
BERT on other common NLP tasks. The code and datasets will be
available in the future."
}
@article{sun-2019-ernie2,
author = "Yu Sun and Shuohuan Wang and Yu{-}Kun Li and Shikun Feng and
Hao Tian and Hua Wu and Haifeng Wang",
title = "{ERNIE} 2.0: {A} Continual Pre-training Framework for
Language Understanding",
journal = "CoRR",
volume = "abs/1907.12412",
year = 2019,
url = "http://arxiv.org/abs/1907.12412",
archivePrefix= "arXiv",
eprint = "1907.12412",
timestamp = "Tue, 21 Jan 2020 07:56:31 +0100",
biburl = "https://dblp.org/rec/journals/corr/abs-1907-12412.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{dong-2019-unilm,
author = "Li Dong and Nan Yang and Wenhui Wang and Furu Wei and
Xiaodong Liu and Yu Wang and Jianfeng Gao and Ming Zhou and
Hsiao{-}Wuen Hon",
title = "Unified Language Model Pre-training for Natural Language
Understanding and Generation",
journal = "CoRR",
volume = "abs/1905.03197",
year = 2019,
url = "http://arxiv.org/abs/1905.03197",
archivePrefix= "arXiv",
eprint = "1905.03197",
timestamp = "Wed, 19 Feb 2020 17:11:34 +0100",
biburl = "https://dblp.org/rec/journals/corr/abs-1905-03197.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{clark-2019-electra,
title = "ELECTRA: Pre-training Text Encoders as Discriminators Rather
Than Generators",
author = "Clark, Kevin and Luong, Minh-Thang and Le, Quoc V and
Manning, Christopher D",
booktitle = "International Conference on Learning Representations",
year = 2019
}
@article{liu-2019-mt-dnn,
author = "Xiaodong Liu and Pengcheng He and Weizhu Chen and Jianfeng
Gao",
title = "Multi-Task Deep Neural Networks for Natural Language
Understanding",
journal = "CoRR",
volume = "abs/1901.11504",
year = 2019,
url = "http://arxiv.org/abs/1901.11504",
archivePrefix= "arXiv",
eprint = "1901.11504",
timestamp = "Mon, 04 Feb 2019 08:11:03 +0100",
biburl = "https://dblp.org/rec/journals/corr/abs-1901-11504.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{yang-2019-xlnet,
author = "Zhilin Yang and Zihang Dai and Yiming Yang and Jaime
G. Carbonell and Ruslan Salakhutdinov and Quoc V. Le",
title = "XLNet: Generalized Autoregressive Pretraining for Language
Understanding",
journal = "CoRR",
volume = "abs/1906.08237",
year = 2019,
url = "http://arxiv.org/abs/1906.08237",
archivePrefix= "arXiv",
eprint = "1906.08237",
timestamp = "Mon, 24 Jun 2019 17:28:45 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1906-08237.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{cui-2019-bert-wwm,
author = "{Cui}, Yiming and {Che}, Wanxiang and {Liu}, Ting and {Qin},
Bing and {Yang}, Ziqing and {Wang}, Shijin and {Hu}, Guoping",
title = "{Pre-Training with Whole Word Masking for Chinese BERT}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Machine Learning",
year = 2019,
month = jun,
eid = "arXiv:1906.08101",
pages = "arXiv:1906.08101",
archivePrefix= "arXiv",
eprint = "1906.08101",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190608101C",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{socher-2011-rnn,
title = "Parsing natural scenes and natural language with recursive
neural networks",
author = "Socher, Richard and Lin, Cliff C and Manning, Chris and Ng,
Andrew Y",
booktitle = "Proceedings of the 28th international conference on machine
learning (ICML-11)",
pages = "129-136",
year = 2011
}
@inproceedings{socher-2013-sentiment-treebank,
title = "Recursive Deep Models for Semantic Compositionality Over a
Sentiment Treebank",
author = "Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang,
Jason and Manning, Christopher D. and Ng, Andrew and Potts,
Christopher",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in
Natural Language Processing",
month = oct,
year = 2013,
address = "Seattle, Washington, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D13-1170",
pages = "1631-1642"
}
@article{pollack-1990-raam,
title = "Recursive distributed representations",
journal = "Artificial Intelligence",
volume = 46,
number = 1,
pages = "77-105",
year = 1990,
issn = "0004-3702",
doi = "https://doi.org/10.1016/0004-3702(90)90005-K",
url =
"http://www.sciencedirect.com/science/article/pii/000437029090005K",
author = "Jordan B. Pollack",
abstract = "A longstanding difficulty for connectionist modeling has been
how to represent variable-sized recursive data structures,
such as trees and lists, in fixed-width patterns. This paper
presents a connectionist architecture which automatically
develops compact distributed representations for such
compositional structures, as well as efficient accessing
mechanisms for them. Patterns which stand for the internal
nodes of fixed-valence trees are devised through the
recursive use of backpropagation on three-layer
auto-associative encoder networks. The resulting
representations are novel, in that they combine apparently
immiscible aspects of features, pointers, and symbol
structures. They form a bridge between the data structures
necessary for high-level cognitive tasks and the associative,
pattern recognition machinery provided by neural networks."
}
@inproceedings{shen-2018-straight-tree,
title = "Straight to the Tree: Constituency Parsing with Neural
Syntactic Distance",
author = "Shen, Yikang and Lin, Zhouhan and Jacob, Athul Paul and
Sordoni, Alessandro and Courville, Aaron and Bengio, Yoshua",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1108",
doi = "10.18653/v1/P18-1108",
pages = "1171-1180",
abstract = "In this work, we propose a novel constituency parsing
scheme. The model first predicts a real-valued scalar, named
syntactic distance, for each split position in the
sentence. The topology of grammar tree is then determined by
the values of syntactic distances. Compared to traditional
shift-reduce parsing schemes, our approach is free from the
potentially disastrous compounding error. It is also easier
to parallelize and much faster. Our model achieves the
state-of-the-art single model F1 score of 92.1 on PTB and
86.4 on CTB dataset, which surpasses the previous single
model results by a large margin."
}
@inproceedings{socher-2012-mv-rnn,
title = "Semantic Compositionality through Recursive Matrix-Vector
Spaces",
author = "Socher, Richard and Huval, Brody and Manning, Christopher D.
and Ng, Andrew Y.",
booktitle = "Proceedings of the 2012 Joint Conference on Empirical Methods
in Natural Language Processing and Computational Natural
Language Learning",
month = jul,
year = 2012,
address = "Jeju Island, Korea",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D12-1110",
pages = "1201-1211"
}
@article{tai-2015-tree-lstm,
author = "Kai Sheng Tai and Richard Socher and Christopher D. Manning",
title = "Improved Semantic Representations From Tree-Structured Long
Short-Term Memory Networks",
journal = "CoRR",
volume = "abs/1503.00075",
year = 2015,
url = "http://arxiv.org/abs/1503.00075",
archivePrefix= "arXiv",
eprint = "1503.00075",
timestamp = "Mon, 13 Aug 2018 16:48:20 +0200",
biburl = "https://dblp.org/rec/journals/corr/TaiSM15.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{zhu-2020-crosswoz,
author = "{Zhu}, Qi and {Huang}, Kaili and {Zhang}, Zheng and {Zhu},
Xiaoyan and {Huang}, Minlie",
title = "{CrossWOZ: A Large-Scale Chinese Cross-Domain Task-Oriented
Dialogue Dataset}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2020,
month = feb,
eid = "arXiv:2002.11893",
pages = "arXiv:2002.11893",
archivePrefix= "arXiv",
eprint = "2002.11893",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2020arXiv200211893Z",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{nie-2019-k-multiple-means,
author = "Nie, Feiping and Wang, Cheng-Long and Li, Xuelong",
title = "K-Multiple-Means: A Multiple-Means Clustering Method with
Specified K Clusters",
year = 2019,
isbn = 9781450362016,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/3292500.3330846",
doi = "10.1145/3292500.3330846",
booktitle = "Proceedings of the 25th ACM SIGKDD International Conference
on Knowledge Discovery \& Data Mining",
pages = "959–967",
numpages = 9,
keywords = "graph laplacian, clustering, K-means, multiple means",
location = "Anchorage, AK, USA",
series = "KDD ’19"
}
@article{lee-2019-biobert,
author = "Jinhyuk Lee and Wonjin Yoon and Sungdong Kim and Donghyeon
Kim and Sunkyu Kim and Chan Ho So and Jaewoo Kang",
title = "BioBERT: a pre-trained biomedical language representation
model for biomedical text mining",
journal = "CoRR",
volume = "abs/1901.08746",
year = 2019,
url = "http://arxiv.org/abs/1901.08746",
archivePrefix= "arXiv",
eprint = "1901.08746",
timestamp = "Sat, 02 Feb 2019 16:56:00 +0100",
biburl = "https://dblp.org/rec/journals/corr/abs-1901-08746.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{alsentzer-2019-clinical-bert,
title = "Publicly Available Clinical {BERT} Embeddings",
author = "Alsentzer, Emily and Murphy, John and Boag, William and Weng,
Wei-Hung and Jindi, Di and Naumann, Tristan and McDermott,
Matthew",
booktitle = "Proceedings of the 2nd Clinical Natural Language Processing
Workshop",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W19-1909",
doi = "10.18653/v1/W19-1909",
pages = "72-78",
abstract = "Contextual word embedding models such as ELMo and BERT have
dramatically improved performance for many natural language
processing (NLP) tasks in recent months. However, these
models have been minimally explored on specialty corpora,
such as clinical text; moreover, in the clinical domain, no
publicly-available pre-trained BERT models yet exist. In this
work, we address this need by exploring and releasing BERT
models for clinical text: one for generic clinical text and
another for discharge summaries specifically. We demonstrate
that using a domain-specific model yields performance
improvements on 3/5 clinical NLP tasks, establishing a new
state-of-the-art on the MedNLI dataset. We find that these
domain-specific models are not as performant on 2 clinical
de-identification tasks, and argue that this is a natural
consequence of the differences between de-identified source
text and synthetically non de-identified task text."
}
@ARTICLE{shang-2019-g-bert,
author = "{Shang}, Junyuan and {Ma}, Tengfei and {Xiao}, Cao and {Sun},
Jimeng",
title = "{Pre-training of Graph Augmented Transformers for Medication
Recommendation}",
journal = "arXiv e-prints",
keywords = "Computer Science - Artificial Intelligence, Computer Science
- Computation and Language, Computer Science - Machine
Learning",
year = 2019,
month = jun,
eid = "arXiv:1906.00346",
pages = "arXiv:1906.00346",
archivePrefix= "arXiv",
eprint = "1906.00346",
primaryClass = "cs.AI",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190600346S",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{chevalier-boisvert-2019-babyai,
title = "Baby{AI}: First Steps Towards Grounded Language Learning With
a Human In the Loop",
author = "Maxime Chevalier-Boisvert and Dzmitry Bahdanau and Salem
Lahlou and Lucas Willems and Chitwan Saharia and Thien Huu
Nguyen and Yoshua Bengio",
booktitle = "International Conference on Learning Representations",
year = 2019,
url = "https://openreview.net/forum?id=rJeXCo0cYX"
}
@article{beltagy-2019-scibert,
author = "Iz Beltagy and Arman Cohan and Kyle Lo",
title = "SciBERT: Pretrained Contextualized Embeddings for Scientific
Text",
journal = "CoRR",
volume = "abs/1903.10676",
year = 2019,
url = "http://arxiv.org/abs/1903.10676",
archivePrefix= "arXiv",
eprint = "1903.10676",
timestamp = "Mon, 01 Apr 2019 14:07:37 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1903-10676.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{pires-2019-m-bert,
author = "{Pires}, Telmo and {Schlinger}, Eva and {Garrette}, Dan",
title = "{How multilingual is Multilingual BERT?}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Machine
Learning",
year = 2019,
month = jun,
eid = "arXiv:1906.01502",
pages = "arXiv:1906.01502",
archivePrefix= "arXiv",
eprint = "1906.01502",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190601502P",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{lee-2019-patent-bert,
author = "{Lee}, Jieh-Sheng and {Hsiang}, Jieh",
title = "{PatentBERT: Patent Classification with Fine-Tuning a
pre-trained BERT Model}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Machine Learning, Statistics - Machine Learning",
year = 2019,
month = may,
eid = "arXiv:1906.02124",
pages = "arXiv:1906.02124",
archivePrefix= "arXiv",
eprint = "1906.02124",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190602124L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{adhikari-2019-docbert,
author = "Ashutosh Adhikari and Achyudh Ram and Raphael Tang and Jimmy
Lin",
title = "DocBERT: {BERT} for Document Classification",
journal = "CoRR",
volume = "abs/1904.08398",
year = 2019,
url = "http://arxiv.org/abs/1904.08398",
archivePrefix= "arXiv",
eprint = "1904.08398",
timestamp = "Fri, 26 Apr 2019 13:18:53 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1904-08398.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{coenen-2019-bert-geometry,
author = "Andy Coenen and Emily Reif and Ann Yuan and Been Kim and Adam
Pearce and Fernanda B. Vi{\'{e}}gas and Martin Wattenberg",
title = "Visualizing and Measuring the Geometry of {BERT}",
journal = "CoRR",
volume = "abs/1906.02715",
year = 2019,
url = "http://arxiv.org/abs/1906.02715",
archivePrefix= "arXiv",
eprint = "1906.02715",
timestamp = "Thu, 13 Jun 2019 13:36:00 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1906-02715.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{michel-2019-sixteen-heads,
author = "Paul Michel and Omer Levy and Graham Neubig",
title = "Are Sixteen Heads Really Better than One?",
journal = "CoRR",
volume = "abs/1905.10650",
year = 2019,
url = "http://arxiv.org/abs/1905.10650",
archivePrefix= "arXiv",
eprint = "1905.10650",
timestamp = "Mon, 03 Jun 2019 13:42:33 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1905-10650.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{zellers-2019-hellaswag,
author = "Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali
Farhadi and Yejin Choi",
title = "HellaSwag: Can a Machine Really Finish Your Sentence?",
journal = "CoRR",
volume = "abs/1905.07830",
year = 2019,
url = "http://arxiv.org/abs/1905.07830",
archivePrefix= "arXiv",
eprint = "1905.07830",
timestamp = "Tue, 28 May 2019 12:48:08 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1905-07830.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{tenney-2019-bert-rediscover,
author = "Ian Tenney and Dipanjan Das and Ellie Pavlick",
title = "{BERT} Rediscovers the Classical {NLP} Pipeline",
journal = "CoRR",
volume = "abs/1905.05950",
year = 2019,
url = "http://arxiv.org/abs/1905.05950",
archivePrefix= "arXiv",
eprint = "1905.05950",
timestamp = "Tue, 28 May 2019 12:48:08 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1905-05950.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{clark-2019-bert-attention,
author = "{Clark}, Kevin and {Khandelwal}, Urvashi and {Levy}, Omer and
{Manning}, Christopher D.",
title = "{What Does BERT Look At? An Analysis of BERT's Attention}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2019,
month = jun,
eid = "arXiv:1906.04341",
pages = "arXiv:1906.04341",
archivePrefix= "arXiv",
eprint = "1906.04341",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190604341C",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{loshchilov-2016-sgdr,
author = "{Loshchilov}, Ilya and {Hutter}, Frank",
title = "{SGDR: Stochastic Gradient Descent with Warm Restarts}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Neural and Evolutionary Computing, Mathematics - Optimization
and Control",
year = 2016,
month = aug,
eid = "arXiv:1608.03983",
pages = "arXiv:1608.03983",
archivePrefix= "arXiv",
eprint = "1608.03983",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160803983L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{yin-2014-gsdmm,
author = "Yin, Jianhua and Wang, Jianyong",
title = "A Dirichlet Multinomial Mixture Model-Based Approach for
Short Text Clustering",
year = 2014,
isbn = 9781450329569,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2623330.2623715",
doi = "10.1145/2623330.2623715",
booktitle = "Proceedings of the 20th ACM SIGKDD International Conference
on Knowledge Discovery and Data Mining",
pages = "233-242",
numpages = 10,
keywords = "gibbs sampling, short text clustering, dirichlet multinomial
mixture",
location = "New York, New York, USA",
series = "KDD ’14"
}
@inproceedings{yin-2016-fgsdmm-plus,
author = "Yin, Jianhua and Wang, Jianyong",
title = "A Text Clustering Algorithm Using an Online Clustering Scheme
for Initialization",
year = 2016,
isbn = 9781450342322,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2939672.2939841",
doi = "10.1145/2939672.2939841",
booktitle = "Proceedings of the 22nd ACM SIGKDD International Conference
on Knowledge Discovery and Data Mining",
pages = "1995-2004",
numpages = 10,
keywords = "dirichlet multinomial mixture, gibbs sampling, text
clustering",
location = "San Francisco, California, USA",
series = "KDD ’16"
}
@article{nigam-2000-text-em,
title = "Text classification from labeled and unlabeled documents
using EM",
author = "Nigam, Kamal and McCallum, Andrew Kachites and Thrun,
Sebastian and Mitchell, Tom",
journal = "Machine learning",
volume = 39,
number = "2-3",
pages = "103-134",
year = 2000,
publisher = "Springer"
}
@article{holmes-2012-dmm,
title = "Dirichlet multinomial mixtures: generative models for
microbial metagenomics",
author = "Holmes, Ian and Harris, Keith and Quince, Christopher",
journal = "PloS one",
volume = 7,
number = 2,
year = 2012,
publisher = "Public Library of Science"
}
@inproceedings{li-2016-gpu-dmm,
title = "Topic modeling for short texts with auxiliary word
embeddings",
author = "Li, Chenliang and Wang, Haoran and Zhang, Zhiqian and Sun,
Aixin and Ma, Zongyang",
booktitle = "Proceedings of the 39th International ACM SIGIR conference on
Research and Development in Information Retrieval",
pages = "165-174",
year = 2016
}
@inproceedings{rangrej-2011-short-text-clustering-comparison,
title = "Comparative study of clustering techniques for short text
documents",
author = "Rangrej, Aniket and Kulkarni, Sayali and Tendulkar, Ashish V",
booktitle = "Proceedings of the 20th international conference companion on
World wide web",
pages = "111-112",
year = 2011
}
@article{pan-2009-transfer-survey,
title = "A survey on transfer learning",
author = "Pan, Sinno Jialin and Yang, Qiang",
journal = "IEEE Transactions on knowledge and data engineering",
volume = 22,
number = 10,
pages = "1345-1359",
year = 2009,
publisher = "IEEE"
}
@article{li-2012-tl-nlp-survey,
title = "Literature survey: domain adaptation algorithms for natural
language processing",
author = "Li, Qi",
journal = "Department of Computer Science The Graduate Center, The City
University of New York",
pages = "8-10",
year = 2012
}
@article{mao-2019-medgcn,
author = "Chengsheng Mao and Liang Yao and Yuan Luo",
title = "MedGCN: Graph Convolutional Networks for Multiple Medical
Tasks",
journal = "CoRR",
volume = "abs/1904.00326",
year = 2019,
url = "http://arxiv.org/abs/1904.00326",
archivePrefix= "arXiv",
eprint = "1904.00326",
timestamp = "Fri, 28 Jun 2019 09:35:46 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1904-00326.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{edwards-2016-neural-statistician,
author = "{Edwards}, Harrison and {Storkey}, Amos",
title = "{Towards a Neural Statistician}",
journal = "arXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Machine
Learning",
year = 2016,
month = jun,
eid = "arXiv:1606.02185",
pages = "arXiv:1606.02185",
archivePrefix= "arXiv",
eprint = "1606.02185",
primaryClass = "stat.ML",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160602185E",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{finn-2017-maml,
author = "{Finn}, Chelsea and {Abbeel}, Pieter and {Levine}, Sergey",
title = "{Model-Agnostic Meta-Learning for Fast Adaptation of Deep
Networks}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Artificial Intelligence, Computer Science - Computer Vision
and Pattern Recognition, Computer Science - Neural and
Evolutionary Computing",
year = 2017,
month = mar,
eid = "arXiv:1703.03400",
pages = "arXiv:1703.03400",
archivePrefix= "arXiv",
eprint = "1703.03400",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2017arXiv170303400F",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{vinyals-2016-matching-networks,
author = "Oriol Vinyals and Charles Blundell and Timothy P. Lillicrap
and Koray Kavukcuoglu and Daan Wierstra",
title = "Matching Networks for One Shot Learning",
journal = "CoRR",
volume = "abs/1606.04080",
year = 2016,
url = "http://arxiv.org/abs/1606.04080",
archivePrefix= "arXiv",
eprint = "1606.04080",
timestamp = "Mon, 13 Aug 2018 16:46:48 +0200",
biburl = "https://dblp.org/rec/journals/corr/VinyalsBLKW16.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{bhojanapalli-2020-low-rank-bottleneck,
author = "{Bhojanapalli}, Srinadh and {Yun}, Chulhee and {Singh Rawat},
Ankit and {Reddi}, Sashank J. and {Kumar}, Sanjiv",
title = "{Low-Rank Bottleneck in Multi-head Attention Models}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Statistics - Machine
Learning",
year = 2020,
month = feb,
eid = "arXiv:2002.07028",
pages = "arXiv:2002.07028",
archivePrefix= "arXiv",
eprint = "2002.07028",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2020arXiv200207028B",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{shazeer-2020-talking-head,
author = "{Shazeer}, Noam and {Lan}, Zhenzhong and {Cheng}, Youlong and
{Ding}, Nan and {Hou}, Le",
title = "{Talking-Heads Attention}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Neural and Evolutionary Computing, Computer Science - Sound,
Electrical Engineering and Systems Science - Audio and Speech
Processing, Statistics - Machine Learning",
year = 2020,
month = mar,
eid = "arXiv:2003.02436",
pages = "arXiv:2003.02436",
archivePrefix= "arXiv",
eprint = "2003.02436",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2020arXiv200302436S",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{banerjee-2005-cluster-bregman,
title = "Clustering with Bregman divergences",
author = "Banerjee, Arindam and Merugu, Srujana and Dhillon, Inderjit S
and Ghosh, Joydeep",
journal = "Journal of machine learning research",
volume = 6,
number = "Oct",
pages = "1705-1749",
year = 2005
}
@article{he-2017-dureader,
author = "Wei He and Kai Liu and Yajuan Lyu and Shiqi Zhao and Xinyan
Xiao and Yuan Liu and Yizhong Wang and Hua Wu and Qiaoqiao
She and Xuan Liu and Tian Wu and Haifeng Wang",
title = "DuReader: a Chinese Machine Reading Comprehension Dataset
from Real-world Applications",
journal = "CoRR",
volume = "abs/1711.05073",
year = 2017,
url = "http://arxiv.org/abs/1711.05073",
archivePrefix= "arXiv",
eprint = "1711.05073",
timestamp = "Thu, 17 Oct 2019 16:06:13 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1711-05073.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{rajpurkar-2018-squad-2,
author = "{Rajpurkar}, Pranav and {Jia}, Robin and {Liang}, Percy",
title = "{Know What You Don't Know: Unanswerable Questions for SQuAD}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2018,
month = jun,
eid = "arXiv:1806.03822",
pages = "arXiv:1806.03822",
archivePrefix= "arXiv",
eprint = "1806.03822",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2018arXiv180603822R",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{wang-2017-gated-self,
title = "Gated Self-Matching Networks for Reading Comprehension and
Question Answering",
author = "Wang, Wenhui and Yang, Nan and Wei, Furu and Chang, Baobao
and Zhou, Ming",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2017,
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1018",
doi = "10.18653/v1/P17-1018",
pages = "189-198",
abstract = "In this paper, we present the gated self-matching networks
for reading comprehension style question answering, which
aims to answer questions from a given passage. We first match
the question and passage with gated attention-based recurrent
networks to obtain the question-aware passage
representation. Then we propose a self-matching attention
mechanism to refine the representation by matching the
passage against itself, which effectively encodes information
from the whole passage. We finally employ the pointer
networks to locate the positions of answers from the
passages. We conduct extensive experiments on the SQuAD
dataset. The single model achieves 71.3{\%} on the evaluation
metrics of exact match on the hidden test set, while the
ensemble model further boosts the results to 75.9{\%}. At the
time of submission of the paper, our model holds the first
place on the SQuAD leaderboard for both single and ensemble
model."
}
@article{santos-2016-attentive-pooling,
author = "Santos, Cicero dos and Tan, Ming and Xiang, Bing and Zhou,
Bowen",
title = "Attentive Pooling Networks",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1602.03609v1",
abstract = "In this work, we propose Attentive Pooling (AP), a two-way
attention mechanism for discriminative model training. In the
context of pair-wise ranking or classification with neural
networks, AP enables the pooling layer to be aware of the
current input pair, in a way that information from the two
input items can directly influence the computation of each
other's representations. Along with such representations of
the paired inputs, AP jointly learns a similarity measure
over projected segments (e.g. trigrams) of the pair, and
subsequently, derives the corresponding attention vector for
each input to guide the pooling. Our two-way attention
mechanism is a general framework independent of the
underlying representation learning, and it has been applied
to both convolutional neural networks (CNNs) and recurrent
neural networks (RNNs) in our studies. The empirical results,
from three very different benchmark tasks of question
answering/answer selection, demonstrate that our proposed
models outperform a variety of strong baselines and achieve
state-of-the-art performance in all the benchmarks.",
archivePrefix= "arXiv",
eprint = "1602.03609",
primaryClass = "cs.CL"
}
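Note on santos-2016-attentive-pooling: the two-way attention described in the abstract computes a soft alignment between projected segments of the two inputs and max-pools it along each axis to get each input's attention weights. A minimal NumPy sketch of that idea; the shapes, names, and bilinear form here are illustrative assumptions, not the authors' code.

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def attentive_pooling(Q, A, U):
    # Q: (m, d) projected segments of input 1; A: (n, d) of input 2;
    # U: (d, d) learned bilinear parameter.
    G = np.tanh(Q.dot(U).dot(A.T))    # (m, n) soft alignment scores
    att_q = softmax(G.max(axis=1))    # attention over input 1's segments
    att_a = softmax(G.max(axis=0))    # attention over input 2's segments
    return Q.T.dot(att_q), A.T.dot(att_a)   # pooled (d,) representations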
@inproceedings{girdhar-2017-attentional-pooling,
title = "Attentional pooling for action recognition",
author = "Girdhar, Rohit and Ramanan, Deva",
booktitle = "Advances in Neural Information Processing Systems",
pages = "34-45",
year = 2017
}
@inproceedings{iyyer-2015-word-dropout,
title = "Deep Unordered Composition Rivals Syntactic Methods for Text
Classification",
author = "Iyyer, Mohit and Manjunatha, Varun and Boyd-Graber, Jordan
and Daum{\'e} III, Hal",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics and the 7th International Joint
Conference on Natural Language Processing (Volume 1: Long
Papers)",
month = jul,
year = 2015,
address = "Beijing, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P15-1162",
doi = "10.3115/v1/P15-1162",
pages = "1681-1691"
}
@article{gal-2015-rnn-dropout,
author = "Gal, Yarin and Ghahramani, Zoubin",
title = "A Theoretically Grounded Application of Dropout in Recurrent
Neural Networks",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1512.05287v5",
abstract = "Recurrent neural networks (RNNs) stand at the forefront of
many recent developments in deep learning. Yet a major
difficulty with these models is their tendency to overfit,
with dropout shown to fail when applied to recurrent
layers. Recent results at the intersection of Bayesian
modelling and deep learning offer a Bayesian interpretation
of common deep learning techniques such as dropout. This
grounding of dropout in approximate Bayesian inference
suggests an extension of the theoretical results, offering
insights into the use of dropout with RNN models. We apply
this new variational inference based dropout technique in
LSTM and GRU models, assessing it on language modelling and
sentiment analysis tasks. The new approach outperforms
existing techniques, and to the best of our knowledge
improves on the single model state-of-the-art in language
modelling with the Penn Treebank (73.4 test perplexity). This
extends our arsenal of variational tools in deep learning.",
archivePrefix= "arXiv",
eprint = "1512.05287",
primaryClass = "stat.ML"
}
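Note on gal-2015-rnn-dropout: in practice the variational view amounts to sampling one dropout mask per sequence and reusing it at every timestep, instead of resampling per step. A minimal sketch under that reading; the function name and shapes are illustrative, not from the paper's code.

import numpy as np

def locked_dropout(x, p, rng):
    # x: (seq_len, batch, features). One Bernoulli mask per
    # (batch, feature), broadcast over the time axis, so every
    # timestep of a sequence sees the same dropped units.
    mask = rng.binomial(1, 1.0 - p, size=(1,) + x.shape[1:])
    return x * mask / (1.0 - p)

rng = np.random.default_rng(0)
h = rng.standard_normal((10, 4, 8))        # (time, batch, hidden)
h_dropped = locked_dropout(h, 0.3, rng)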
@article{krueger-2016-zoneout,
author = "Krueger, David and Maharaj, Tegan and Kram{\'a}r, J{\'a}nos
and Pezeshki, Mohammad and Ballas, Nicolas and Ke, Nan
Rosemary and Goyal, Anirudh and Bengio, Yoshua and Courville,
Aaron and Pal, Chris",
title = "Zoneout: Regularizing Rnns By Randomly Preserving Hidden
Activations",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1606.01305v4",
abstract = "We propose zoneout, a novel method for regularizing RNNs. At
each timestep, zoneout stochastically forces some hidden
units to maintain their previous values. Like dropout,
zoneout uses random noise to train a pseudo-ensemble,
improving generalization. But by preserving instead of
dropping hidden units, gradient information and state
information are more readily propagated through time, as in
feedforward stochastic depth networks. We perform an
empirical investigation of various RNN regularizers, and find
that zoneout gives significant performance improvements
across tasks. We achieve competitive results with relatively
simple models in character- and word-level language modelling
on the Penn Treebank and Text8 datasets, and combining with
recurrent batch normalization yields state-of-the-art results
on permuted sequential MNIST.",
archivePrefix= "arXiv",
eprint = "1606.01305",
primaryClass = "cs.NE"
}
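Note on krueger-2016-zoneout: the per-timestep update described above is easy to state directly. An illustrative sketch, not the authors' code; at test time the expected update is used instead of a sampled mask.

import numpy as np

def zoneout_step(h_prev, h_new, z, rng, training=True):
    # With probability z, a unit keeps its previous value instead of
    # taking the freshly computed one.
    if training:
        keep_prev = rng.binomial(1, z, size=h_new.shape)
        return keep_prev * h_prev + (1 - keep_prev) * h_new
    # Test time: expected update rather than sampling.
    return z * h_prev + (1 - z) * h_new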
@article{merity-2017-drop-connect,
author = "Merity, Stephen and Keskar, Nitish Shirish and Socher,
Richard",
title = "Regularizing and Optimizing Lstm Language Models",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1708.02182v1",
abstract = "Recurrent neural networks (RNNs), such as long short-term
memory networks (LSTMs), serve as a fundamental building
block for many sequence learning tasks, including machine
translation, language modeling, and question answering. In
this paper, we consider the specific problem of word-level
language modeling and investigate strategies for regularizing
and optimizing LSTM-based models. We propose the
weight-dropped LSTM which uses DropConnect on
hidden-to-hidden weights as a form of recurrent
regularization. Further, we introduce NT-ASGD, a variant of
the averaged stochastic gradient method, wherein the
averaging trigger is determined using a non-monotonic
condition as opposed to being tuned by the user. Using these
and other regularization strategies, we achieve
state-of-the-art word level perplexities on two data sets:
57.3 on Penn Treebank and 65.8 on WikiText-2. In exploring
the effectiveness of a neural cache in conjunction with our
proposed model, we achieve an even lower state-of-the-art
perplexity of 52.8 on Penn Treebank and 52.0 on WikiText-2.",
archivePrefix= "arXiv",
eprint = "1708.02182",
primaryClass = "cs.CL"
}
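Note on merity-2017-drop-connect: the weight-dropped LSTM applies DropConnect to the hidden-to-hidden matrices rather than dropout to activations. A hedged sketch of that single idea (the full AWD-LSTM also includes NT-ASGD and several other regularizers):

import numpy as np

def weight_drop(w_hh, p, rng, training=True):
    # DropConnect on the recurrent (hidden-to-hidden) matrix: individual
    # weights are zeroed once per forward pass, so the same masked matrix
    # is reused across all timesteps and fused LSTM kernels still apply.
    if not training:
        return w_hh
    mask = rng.binomial(1, 1.0 - p, size=w_hh.shape)
    return w_hh * mask / (1.0 - p)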
@article{melis-2017-state-art,
author = "Melis, G{\'a}bor and Dyer, Chris and Blunsom, Phil",
title = "On the State of the Art of Evaluation in Neural Language
Models",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1707.05589v2",
abstract = "Ongoing innovations in recurrent neural network architectures
have provided a steady influx of apparently state-of-the-art
results on language modelling benchmarks. However, these have
been evaluated using differing code bases and limited
computational resources, which represent uncontrolled sources
of experimental variation. We reevaluate several popular
architectures and regularisation methods with large-scale
automatic black-box hyperparameter tuning and arrive at the
somewhat surprising conclusion that standard LSTM
architectures, when properly regularised, outperform more
recent models. We establish a new state of the art on the
Penn Treebank and Wikitext-2 corpora, as well as strong
baselines on the Hutter Prize dataset.",
archivePrefix= "arXiv",
eprint = "1707.05589",
primaryClass = "cs.CL"
}
@article{merity-2017-activation-regularization,
author = "Merity, Stephen and McCann, Bryan and Socher, Richard",
title = "Revisiting Activation Regularization for Language Rnns",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1708.01009v1",
abstract = "Recurrent neural networks (RNNs) serve as a fundamental
building block for many sequence tasks across natural
language processing. Recent research has focused on recurrent
dropout techniques or custom RNN cells in order to improve
performance. Both of these can require substantial
modifications to the machine learning model or to the
underlying RNN configurations. We revisit traditional
regularization techniques, specifically L2 regularization on
RNN activations and slowness regularization over successive
hidden states, to improve the performance of RNNs on the task
of language modeling. Both of these techniques require
minimal modification to existing RNN architectures and result
in performance improvements comparable or superior to more
complicated regularization techniques or custom cell
architectures. These regularization techniques can be used
without any modification on optimized LSTM implementations
such as the NVIDIA cuDNN LSTM.",
archivePrefix= "arXiv",
eprint = "1708.01009",
primaryClass = "cs.CL"
}
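Note on merity-2017-activation-regularization: the two penalties named in the abstract are one-liners added to the language-modeling loss. The coefficients below are illustrative, not the paper's tuned values.

import numpy as np

def ar_tar_penalty(h, alpha=2.0, beta=1.0):
    # h: (seq_len, batch, hidden) RNN outputs.
    ar = alpha * np.mean(h ** 2)                   # L2 on activations (AR)
    tar = beta * np.mean((h[1:] - h[:-1]) ** 2)    # slowness across steps (TAR)
    return ar + tar   # added to the usual language-modeling loss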
@article{ma-2016-expectation-linear-dropout,
author = "Ma, Xuezhe and Gao, Yingkai and Hu, Zhiting and Yu, Yaoliang
and Deng, Yuntian and Hovy, Eduard",
title = "Dropout With Expectation-Linear Regularization",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1609.08017v3",
abstract = "Dropout, a simple and effective way to train deep neural
networks, has led to a number of impressive empirical
successes and spawned many recent theoretical
investigations. However, the gap between dropout's training
and inference phases, introduced due to tractability
considerations, has largely remained under-appreciated. In
this work, we first formulate dropout as a tractable
approximation of some latent variable model, leading to a
clean view of parameter sharing and enabling further
theoretical analysis. Then, we introduce (approximate)
expectation-linear dropout neural networks, whose inference
gap we are able to formally characterize. Algorithmically, we
show that our proposed measure of the inference gap can be
used to regularize the standard dropout training objective,
resulting in an \emph{explicit} control of the gap. Our
method is as simple and efficient as standard dropout. We
further prove the upper bounds on the loss in accuracy due to
expectation-linearization, describe classes of input
distributions that expectation-linearize easily. Experiments
on three image classification benchmark datasets demonstrate
that reducing the inference gap can indeed improve the
performance consistently.",
archivePrefix= "arXiv",
eprint = "1609.08017",
primaryClass = "cs.LG"
}
@inproceedings{clare-2001-ml-dt,
author = "Clare, Amanda and King, Ross D.",
title = "Knowledge Discovery in Multi-Label Phenotype Data",
year = 2001,
isbn = 3540425349,
publisher = "Springer-Verlag",
address = "Berlin, Heidelberg",
booktitle = "Proceedings of the 5th European Conference on Principles of
Data Mining and Knowledge Discovery",
pages = "42-53",
numpages = 12,
series = "PKDD ’01"
}
@inproceedings{elisseeff-2001-rank-svm,
author = "Elisseeff, Andr\'{e} and Weston, Jason",
title = "A Kernel Method for Multi-Labelled Classification",
year = 2001,
publisher = "MIT Press",
address = "Cambridge, MA, USA",
booktitle = "Proceedings of the 14th International Conference on Neural
Information Processing Systems: Natural and Synthetic",
pages = "681-687",
numpages = 7,
location = "Vancouver, British Columbia, Canada",
series = "NIPS’01"
}
@article{zhang-2007-ml-knn,
author = "Zhang, Min-Ling and Zhou, Zhi-Hua",
title = "ML-KNN: A Lazy Learning Approach to Multi-Label Learning",
year = 2007,
issue_date = "July 2007",
publisher = "Elsevier Science Inc.",
address = "USA",
volume = 40,
number = 7,
issn = "0031-3203",
url = "https://doi.org/10.1016/j.patcog.2006.12.019",
doi = "10.1016/j.patcog.2006.12.019",
journal = "Pattern Recogn.",
month = jul,
pages = "2038-2048",
numpages = 11,
keywords = "Lazy learning, maximum a posteriori, Text categorization,
KNN, PMM, K-nearest neighbor, Multi-label learning, Natural
scene classification, Machine learning, Functional genomics,
ML-KNN, parametric mixture model, MAP, multi-label K-nearest
neighbor"
}
@inproceedings{papineni-2002-bleu,
author = "Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu,
Wei-Jing",
title = "{B}leu: a Method for Automatic Evaluation of Machine
Translation",
booktitle = "Proceedings of the 40th Annual Meeting of the Association for
Computational Linguistics",
year = 2002,
pages = "311-318",
doi = "10.3115/1073083.1073135",
url = "https://doi.org/10.3115/1073083.1073135",
address = "Philadelphia, Pennsylvania, USA",
month = jul,
publisher = "Association for Computational Linguistics"
}
@article{vijayakumar-2016-diverse-beam-search,
author = "Vijayakumar, Ashwin K and Cogswell, Michael and Selvaraju,
Ramprasath R. and Sun, Qing and Lee, Stefan and Crandall,
David and Batra, Dhruv",
title = "Diverse Beam Search: Decoding Diverse Solutions From Neural
Sequence Models",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1610.02424v2",
abstract = "Neural sequence models are widely used to model time-series
data. Equally ubiquitous is the usage of beam search (BS) as
an approximate inference algorithm to decode output sequences
from these models. BS explores the search space in a greedy
left-right fashion retaining only the top-B candidates -
resulting in sequences that differ only slightly from each
other. Producing lists of nearly identical sequences is not
only computationally wasteful but also typically fails to
capture the inherent ambiguity of complex AI tasks. To
overcome this problem, we propose Diverse Beam Search (DBS),
an alternative to BS that decodes a list of diverse outputs
by optimizing for a diversity-augmented objective. We observe
that our method finds better top-1 solutions by controlling
for the exploration and exploitation of the search space -
implying that DBS is a better search algorithm. Moreover,
these gains are achieved with minimal computational or memory
overhead as compared to beam search. To demonstrate the
broad applicability of our method, we present results on
image captioning, machine translation and visual question
generation using both standard quantitative metrics and
qualitative human studies. Further, we study the role of
diversity for image-grounded language generation tasks as the
complexity of the image changes. We observe that our method
consistently outperforms BS and previously proposed
techniques for diverse decoding from neural sequence models.",
archivePrefix= "arXiv",
eprint = "1610.02424",
primaryClass = "cs.AI"
}
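Note on vijayakumar-2016-diverse-beam-search: the diversity-augmented objective can be illustrated with the Hamming-diversity variant, which penalizes tokens that earlier beam groups already chose at the current step. A sketch of just that scoring adjustment; the paper evaluates several diversity functions, and the names here are assumptions.

import numpy as np

def diversity_adjusted_scores(logprobs, tokens_used_by_prior_groups, lam=0.5):
    # Hamming diversity: subtract lam for each time a token was already
    # emitted at this decoding step by an earlier beam group.
    penalty = np.zeros_like(logprobs)
    for tok in tokens_used_by_prior_groups:
        penalty[tok] += 1.0
    return logprobs - lam * penalty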
@article{huszar-2015-schedule-sampling-problem,
journal = "CoRR",
title = "How (not) to Train your Generative Model: Scheduled Sampling,
Likelihood, Adversary?",
author = "Husz{\'a}r, Ferenc",
archivePrefix= "arXiv",
year = 2015,
eprint = "1511.05101",
primaryClass = "stat.ML",
abstract = "Modern applications and progress in deep learning research
have created renewed interest for generative models of text
and of images. However, even today it is unclear what
objective functions one should use to train and evaluate
these models. In this paper we present two contributions.
Firstly, we present a critique of scheduled sampling, a
state-of-the-art training method that contributed to the
winning entry to the MSCOCO image captioning benchmark in
2015. Here we show that despite this impressive empirical
performance, the objective function underlying scheduled
sampling is improper and leads to an inconsistent learning
algorithm. Secondly, we revisit the problems that scheduled
sampling was meant to address, and present an alternative
interpretation. We argue that maximum likelihood is an
inappropriate training objective when the end-goal is to
generate natural-looking samples. We go on to derive an ideal
objective function to use in this situation instead. We
introduce a generalisation of adversarial training, and show
how such method can interpolate between maximum likelihood
training and our ideal training objective. To our knowledge
this is the first theoretical analysis that explains why
adversarial training tends to produce samples with higher
perceived quality.",
url = "http://arxiv.org/abs/1511.05101v1"
}
@article{lamb-2016-professor-forcing,
author = "Lamb, Alex and Goyal, Anirudh and Zhang, Ying and Zhang,
Saizheng and Courville, Aaron and Bengio, Yoshua",
title = "Professor Forcing: a New Algorithm for Training Recurrent
Networks",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1610.09038v1",
abstract = "The Teacher Forcing algorithm trains recurrent networks by
supplying observed sequence values as inputs during training
and using the network's own one-step-ahead predictions to do
multi-step sampling. We introduce the Professor Forcing
algorithm, which uses adversarial domain adaptation to
encourage the dynamics of the recurrent network to be the
same when training the network and when sampling from the
network over multiple time steps. We apply Professor Forcing
to language modeling, vocal synthesis on raw waveforms,
handwriting generation, and image generation. Empirically we
find that Professor Forcing acts as a regularizer, improving
test likelihood on character level Penn Treebank and
sequential MNIST. We also find that the model qualitatively
improves samples, especially when sampling for a large number
of time steps. This is supported by human evaluation of
sample quality. Trade-offs between Professor Forcing and
Scheduled Sampling are discussed. We produce T-SNEs showing
that Professor Forcing successfully makes the dynamics of the
network during training and sampling more similar.",
archivePrefix= "arXiv",
eprint = "1610.09038",
primaryClass = "stat.ML"
}
@inproceedings{zhang-2019-train-infer-gap,
title = "Bridging the Gap between Training and Inference for Neural
Machine Translation",
author = "Zhang, Wen and Feng, Yang and Meng, Fandong and You, Di and
Liu, Qun",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1426",
doi = "10.18653/v1/P19-1426",
pages = "4334-4343",
abstract = "Neural Machine Translation (NMT) generates target words
sequentially in the way of predicting the next word
conditioned on the context words. At training time, it
predicts with the ground truth words as context while at
inference it has to generate the entire sequence from
scratch. This discrepancy of the fed context leads to error
accumulation among the way. Furthermore, word-level training
requires strict matching between the generated sequence and
the ground truth sequence which leads to overcorrection over
different but reasonable translations. In this paper, we
address these issues by sampling context words not only from
the ground truth sequence but also from the predicted
sequence by the model during training, where the predicted
sequence is selected with a sentence-level
optimum. Experiment results on Chinese-{\textgreater}English
and WMT{'}14 English-{\textgreater}German translation tasks
demonstrate that our approach can achieve significant
improvements on multiple datasets."
}
@article{hinton-2015-soft-target,
author = "Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff",
title = "Distilling the Knowledge in a Neural Network",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1503.02531v1",
abstract = "A very simple way to improve the performance of almost any
machine learning algorithm is to train many different models
on the same data and then to average their
predictions. Unfortunately, making predictions using a whole
ensemble of models is cumbersome and may be too
computationally expensive to allow deployment to a large
number of users, especially if the individual models are
large neural nets. Caruana and his collaborators have shown
that it is possible to compress the knowledge in an ensemble
into a single model which is much easier to deploy and we
develop this approach further using a different compression
technique. We achieve some surprising results on MNIST and we
show that we can significantly improve the acoustic model of
a heavily used commercial system by distilling the knowledge
in an ensemble of models into a single model. We also
introduce a new type of ensemble composed of one or more full
models and many specialist models which learn to distinguish
fine-grained classes that the full models confuse. Unlike a
mixture of experts, these specialist models can be trained
rapidly and in parallel.",
archivePrefix= "arXiv",
eprint = "1503.02531",
primaryClass = "stat.ML"
}
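Note on hinton-2015-soft-target: the "dark knowledge" is carried by a temperature-softened softmax of the teacher's logits, which the student is trained to match. A minimal sketch; T=3.0 is an illustrative temperature, and in practice this term is mixed with the ordinary hard-label loss.

import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def distillation_loss(student_logits, teacher_logits, T=3.0):
    # Cross-entropy against the teacher's temperature-softened
    # distribution; the T**2 factor keeps gradient magnitudes comparable
    # across temperatures.
    p_teacher = softmax(teacher_logits / T)
    log_p_student = np.log(softmax(student_logits / T))
    return -(T ** 2) * np.mean(np.sum(p_teacher * log_p_student, axis=-1))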
@article{tang-2015-soft-target,
author = "Tang, Zhiyuan and Wang, Dong and Zhang, Zhiyong",
title = "Recurrent Neural Network Training With Dark Knowledge
Transfer",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1505.04630v5",
abstract = "Recurrent neural networks (RNNs), particularly long
short-term memory (LSTM), have gained much attention in
automatic speech recognition (ASR). Although some successful
stories have been reported, training RNNs remains highly
challenging, especially with limited training data. Recent
research found that a well-trained model can be used as a
teacher to train other child models, by using the predictions
generated by the teacher model as supervision. This knowledge
transfer learning has been employed to train simple neural
nets with a complex one, so that the final performance can
reach a level that is infeasible to obtain by regular
training. In this paper, we employ the knowledge transfer
learning approach to train RNNs (precisely LSTM) using a deep
neural network (DNN) model as the teacher. This is different
from most of the existing research on knowledge transfer
learning, since the teacher (DNN) is assumed to be weaker
than the child (RNN); however, our experiments on an ASR task
showed that it works fairly well: without applying any tricks
on the learning scheme, this approach can train RNNs
successfully even with limited training data.",
archivePrefix= "arXiv",
eprint = "1505.04630",
primaryClass = "stat.ML"
}
@article{you-2017-lars,
author = "You, Yang and Gitman, Igor and Ginsburg, Boris",
title = "Large Batch Training of Convolutional Networks",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1708.03888v3",
abstract = "A common way to speed up training of large convolutional
networks is to add computational units. Training is then
performed using data-parallel synchronous Stochastic Gradient
Descent (SGD) with mini-batch divided between computational
units. With an increase in the number of nodes, the batch
size grows. But training with large batch size often results
in the lower model accuracy. We argue that the current recipe
for large batch training (linear learning rate scaling with
warm-up) is not general enough and training may diverge. To
overcome this optimization difficulties we propose a new
training algorithm based on Layer-wise Adaptive Rate Scaling
(LARS). Using LARS, we scaled Alexnet up to a batch size of
8K, and Resnet-50 to a batch size of 32K without loss in
accuracy.",
archivePrefix= "arXiv",
eprint = "1708.03888",
primaryClass = "cs.CV"
}
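Note on you-2017-lars: the heart of LARS is a per-layer trust ratio that rescales the step by the layer's weight-to-gradient norm ratio. A sketch of that computation only; eta and weight_decay values are illustrative, and the full optimizer also handles momentum and warm-up.

import numpy as np

def lars_trust_ratio(w, grad, eta=0.001, weight_decay=1e-4):
    # Layers whose gradients are large relative to their weights take
    # proportionally smaller steps.
    w_norm = np.linalg.norm(w)
    g_norm = np.linalg.norm(grad) + weight_decay * w_norm
    return eta * w_norm / (g_norm + 1e-12)

# One SGD step for a single layer using the local rate (sketch):
# w -= global_lr * lars_trust_ratio(w, g) * (g + weight_decay * w)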
@article{le-2015-identity-rnn,
author = "Le, Quoc V. and Jaitly, Navdeep and Hinton, Geoffrey E.",
title = "A Simple Way To Initialize Recurrent Networks of Rectified
Linear Units",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1504.00941v2",
abstract = "Learning long term dependencies in recurrent networks is
difficult due to vanishing and exploding gradients. To
overcome this difficulty, researchers have developed
sophisticated optimization techniques and network
architectures. In this paper, we propose a simpler solution
that use recurrent neural networks composed of rectified
linear units. Key to our solution is the use of the identity
matrix or its scaled version to initialize the recurrent
weight matrix. We find that our solution is comparable to
LSTM on our four benchmarks: two toy problems involving
long-range temporal structures, a large language modeling
problem and a benchmark speech recognition problem.",
archivePrefix= "arXiv",
eprint = "1504.00941",
primaryClass = "cs.NE"
}
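Note on le-2015-identity-rnn: the proposal is purely an initialization. A sketch:

import numpy as np

def irnn_init(hidden_size, scale=1.0):
    # Recurrent weights start as the (scaled) identity and biases at
    # zero, so a ReLU RNN initially just copies its hidden state forward.
    w_hh = scale * np.eye(hidden_size)
    b_h = np.zeros(hidden_size)
    return w_hh, b_h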
@inproceedings{bi-2013-efficient-multi-label,
author = "Bi, Wei and Kwok, James T.",
title = "Efficient Multi-Label Classification with Many Labels",
year = 2013,
publisher = "JMLR.org",
booktitle = "Proceedings of the 30th International Conference on
International Conference on Machine Learning - Volume 28",
pages = "III–405–III–413",
numpages = 9,
location = "Atlanta, GA, USA",
series = "ICML’13"
}
@article{raffel-2019-t5,
author = "Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee,
Katherine and Narang, Sharan and Matena, Michael and Zhou,
Yanqi and Li, Wei and Liu, Peter J.",
title = "Exploring the Limits of Transfer Learning With a Unified
Text-To-Text Transformer",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1910.10683v2",
abstract = "Transfer learning, where a model is first pre-trained on a
data-rich task before being fine-tuned on a downstream task,
has emerged as a powerful technique in natural language
processing (NLP). The effectiveness of transfer learning has
given rise to a diversity of approaches, methodology, and
practice. In this paper, we explore the landscape of transfer
learning techniques for NLP by introducing a unified
framework that converts every language problem into a
text-to-text format. Our systematic study compares
pre-training objectives, architectures, unlabeled datasets,
transfer approaches, and other factors on dozens of language
understanding tasks. By combining the insights from our
exploration with scale and our new ``Colossal Clean Crawled
Corpus'', we achieve state-of-the-art results on many
benchmarks covering summarization, question answering, text
classification, and more. To facilitate future work on
transfer learning for NLP, we release our dataset,
pre-trained models, and code.",
archivePrefix= "arXiv",
eprint = "1910.10683",
primaryClass = "cs.LG"
}
@inproceedings{kolitsas-2018-end-to-end-el,
author = "Kolitsas, Nikolaos and Ganea, Octavian-Eugen and Hofmann,
Thomas",
title = "End-to-End Neural Entity Linking",
booktitle = "Proceedings of the 22nd Conference on Computational Natural
Language Learning",
year = 2018,
pages = "519-529",
doi = "10.18653/v1/K18-1050",
url = "https://doi.org/10.18653/v1/K18-1050",
abstract = "Entity Linking (EL) is an essential task for semantic text
understanding and information extraction. Popular methods
separately address the Mention Detection (MD) and Entity
Disambiguation (ED) stages of EL, without leveraging their
mutual dependency. We here propose the first neural
end-to-end EL system that jointly discovers and links
entities in a text document. The main idea is to consider all
possible spans as potential mentions and learn contextual
similarity scores over their entity candidates that are
useful for both MD and ED decisions. Key components are
context-aware mention embeddings, entity embeddings and a
probabilistic mention - entity map, without demanding other
engineered features. Empirically, we show that our end-to-end
method significantly outperforms popular systems on the
Gerbil platform when enough training data is
available. Conversely, if testing datasets follow different
annotation conventions compared to the training set
(e.g. queries/ tweets vs news documents), our ED model
coupled with a traditional NER system offers the best or
second best EL accuracy.",
address = "Brussels, Belgium",
month = oct,
publisher = "Association for Computational Linguistics"
}
@article{raiman-2018-deeptype,
author = "Raiman, Jonathan and Raiman, Olivier",
title = "Deeptype: Multilingual Entity Linking By Neural Type System
Evolution",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1802.01021v1",
abstract = "The wealth of structured (e.g. Wikidata) and unstructured
data about the world available today presents an incredible
opportunity for tomorrow's Artificial Intelligence. So far,
integration of these two different modalities is a difficult
process, involving many decisions concerning how best to
represent the information so that it will be captured or
useful, and hand-labeling large amounts of data. DeepType
overcomes this challenge by explicitly integrating symbolic
information into the reasoning process of a neural network
with a type system. First we construct a type system, and
second, we use it to constrain the outputs of a neural
network to respect the symbolic structure. We achieve this by
reformulating the design problem into a mixed integer
problem: create a type system and subsequently train a neural
network with it. In this reformulation discrete variables
select which parent-child relations from an ontology are
types within the type system, while continuous variables
control a classifier fit to the type system. The original
problem cannot be solved exactly, so we propose a 2-step
algorithm: 1) heuristic search or stochastic optimization
over discrete variables that define a type system informed by
an Oracle and a Learnability heuristic, 2) gradient descent
to fit classifier parameters. We apply DeepType to the
problem of Entity Linking on three standard datasets
(i.e. WikiDisamb30, CoNLL (YAGO), TAC KBP 2010) and find that
it outperforms all existing solutions by a wide margin,
including approaches that rely on a human-designed type
system or recent deep learning-based entity embeddings, while
explicitly using symbolic information lets it integrate new
entities without retraining.",
archivePrefix= "arXiv",
eprint = "1802.01021",
primaryClass = "cs.CL"
}
@inproceedings{le-2018-el-latent-relation,
title = "Improving Entity Linking by Modeling Latent Relations between
Mentions",
author = "Le, Phong and Titov, Ivan",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1148",
doi = "10.18653/v1/P18-1148",
pages = "1595-1604",
abstract = "Entity linking involves aligning textual mentions of named
entities to their corresponding entries in a knowledge
base. Entity linking systems often exploit relations between
textual mentions in a document (e.g., coreference) to decide
if the linking decisions are compatible. Unlike previous
approaches, which relied on supervised systems or heuristics
to predict these relations, we treat relations as latent
variables in our neural entity-linking model. We induce the
relations without any supervision while optimizing the
entity-linking system in an end-to-end fashion. Our
multi-relational model achieves the best reported scores on
the standard benchmark (AIDA-CoNLL) and substantially
outperforms its relation-agnostic version. Its training also
converges much faster, suggesting that the injected
structural bias helps to explain regularities in the training
data."
}
@inproceedings{ganea-2017-deep-ed,
title = "Deep Joint Entity Disambiguation with Local Neural Attention",
author = "Ganea, Octavian-Eugen and Hofmann, Thomas",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1277",
doi = "10.18653/v1/D17-1277",
pages = "2619-2629",
abstract = "We propose a novel deep learning model for joint
document-level entity disambiguation, which leverages learned
neural representations. Key components are entity embeddings,
a neural attention mechanism over local context windows, and
a differentiable joint inference stage for
disambiguation. Our approach thereby combines benefits of
deep learning with more traditional approaches such as
graphical models and probabilistic mention-entity
maps. Extensive experiments show that we are able to obtain
competitive or state-of-the-art accuracy at moderate
computational costs."
}
@article{vashishth-2020-medtype,
author = "Vashishth, Shikhar and Joshi, Rishabh and Dutt, Ritam and
Newman-Griffis, Denis and Rose, Carolyn",
title = "Medtype: Improving Medical Entity Linking With Semantic Type
Prediction",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2005.00460v1",
abstract = "Medical entity linking is the task of identifying and
standardizing concepts referred in a scientific article or
clinical record. Existing methods adopt a two-step approach
of detecting mentions and identifying a list of candidate
concepts for them. In this paper, we probe the impact of
incorporating an entity disambiguation step in existing
entity linkers. For this, we present MedType, a novel method
that leverages the surrounding context to identify the
semantic type of a mention and uses it for filtering out
candidate concepts of the wrong types. We further present two
novel large-scale, automatically-created datasets of medical
entity mentions: WIKIMED, a Wikipedia-based dataset for
cross-domain transfer learning, and PUBMEDDS, a
distantly-supervised dataset of medical entity mentions in
biomedical abstracts. Through extensive experiments across
several datasets and methods, we demonstrate that MedType
pre-trained on our proposed datasets substantially improves
medical entity linking and gives state-of-the-art
performance. We make our source code and datasets publicly
available for medical entity linking research.",
archivePrefix= "arXiv",
eprint = "2005.00460",
primaryClass = "cs.CL"
}
@article{shi-2020-sentence-level-el,
author = "Shi, Wei and Zhang, Siyuan and Zhang, Zhiwei and Cheng, Hong
and Yu, Jeffrey Xu",
title = "Joint Embedding in Named Entity Linking on Sentence Level",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2002.04936v1",
abstract = "Named entity linking is to map an ambiguous mention in
documents to an entity in a knowledge base. The named entity
linking is challenging, given the fact that there are
multiple candidate entities for a mention in a document. It
is difficult to link a mention when it appears multiple times
in a document, since there are conflicts by the contexts
around the appearances of the mention. In addition, it is
difficult since the given training dataset is small due to
the reason that it is done manually to link a mention to its
mapping entity. In the literature, there are many reported
studies among which the recent embedding methods learn
vectors of entities from the training dataset at document
level. To address these issues, we focus on how to link
entity for mentions at a sentence level, which reduces the
noises introduced by different appearances of the same
mention in a document at the expense of insufficient
information to be used. We propose a new unified embedding
method by maximizing the relationships learned from knowledge
graphs. We confirm the effectiveness of our method in our
experimental studies.",
archivePrefix= "arXiv",
eprint = "2002.04936",
primaryClass = "cs.CL"
}
@article{broscheit-2020-bert-el,
author = "Broscheit, Samuel",
title = "Investigating Entity Knowledge in Bert With Simple Neural
End-To-End Entity Linking",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2003.05473v1",
abstract = "A typical architecture for end-to-end entity linking systems
consists of three steps: mention detection, candidate
generation and entity disambiguation. In this study we
investigate the following questions: (a) Can all those steps
be learned jointly with a model for contextualized
text-representations, i.e. BERT (Devlin et al., 2019)? (b)
How much entity knowledge is already contained in pretrained
BERT? (c) Does additional entity knowledge improve BERT's
performance in downstream tasks? To this end, we propose an
extreme simplification of the entity linking setup that works
surprisingly well: simply cast it as a per token
classification over the entire entity vocabulary (over 700K
classes in our case). We show on an entity linking benchmark
that (i) this model improves the entity representations over
plain BERT, (ii) that it outperforms entity linking
architectures that optimize the tasks separately and (iii)
that it only comes second to the current state-of-the-art
that does mention detection and entity disambiguation
jointly. Additionally, we investigate the usefulness of
entity-aware token-representations in the text-understanding
benchmark GLUE, as well as the question answering benchmarks
SQUAD V2 and SWAG and also the EN-DE WMT14 machine
translation benchmark. To our surprise, we find that most of
those benchmarks do not benefit from additional entity
knowledge, except for a task with very small training data,
the RTE task in GLUE, which improves by 2 \%.",
archivePrefix= "arXiv",
eprint = "2003.05473",
primaryClass = "cs.CL"
}
@article{chen-2020-latent-entity-type,
author = "Chen, Shuang and Wang, Jinpeng and Jiang, Feng and Lin,
Chin-Yew",
title = "Improving Entity Linking By Modeling Latent Entity Type
Information",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2001.01447v1",
abstract = "Existing state of the art neural entity linking models employ
attention-based bag-of-words context model and pre-trained
entity embeddings bootstrapped from word embeddings to assess
topic level context compatibility. However, the latent entity
type information in the immediate context of the mention is
neglected, which causes the models often link mentions to
incorrect entities with incorrect type. To tackle this
problem, we propose to inject latent entity type information
into the entity embeddings based on pre-trained BERT. In
addition, we integrate a BERT-based entity similarity score
into the local context model of a state-of-the-art model to
better capture latent entity type information. Our model
significantly outperforms the state-of-the-art entity linking
models on standard benchmark (AIDA-CoNLL). Detailed
experiment analysis demonstrates that our model corrects most
of the type errors produced by the direct baseline.",
archivePrefix= "arXiv",
eprint = "2001.01447",
primaryClass = "cs.CL"
}
@article{zhu-2019-latte,
author = "Zhu, Ming and Celikkaya, Busra and Bhatia, Parminder and
Reddy, Chandan K.",
title = "Latte: Latent Type Modeling for Biomedical Entity Linking",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1911.09787v2",
abstract = "Entity linking is the task of linking mentions of named
entities in natural language text, to entities in a curated
knowledge-base. This is of significant importance in the
biomedical domain, where it could be used to semantically
annotate a large volume of clinical records and biomedical
literature, to standardized concepts described in an ontology
such as Unified Medical Language System (UMLS). We observe
that with precise type information, entity disambiguation
becomes a straightforward task. However, fine-grained type
information is usually not available in biomedical
domain. Thus, we propose LATTE, a LATent Type Entity Linking
model, that improves entity linking by modeling the latent
fine-grained type information about mentions and entities.
Unlike previous methods that perform entity linking directly
between the mentions and the entities, LATTE jointly does
entity disambiguation, and latent fine-grained type learning,
without direct supervision. We evaluate our model on two
biomedical datasets: MedMentions, a large scale public
dataset annotated with UMLS concepts, and a de-identified
corpus of dictated doctor's notes that has been annotated
with ICD concepts. Extensive experimental evaluation shows
our model achieves significant performance improvements over
several state-of-the-art techniques.",
archivePrefix= "arXiv",
eprint = "1911.09787",
primaryClass = "cs.CL"
}
@article{chen-2019-yelm,
author = "Chen, Haotian and Wadhwa, Sahil and Li, Xi David and
Zukov-Gregoric, Andrej",
title = "Yelm: End-To-End Contextualized Entity Linking",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1911.03834v1",
abstract = "We propose yet another entity linking model (YELM) which
links words to entities instead of spans. This overcomes any
difficulties associated with the selection of good candidate
mention spans and makes the joint training of mention
detection (MD) and entity disambiguation (ED) easily
possible. Our model is based on BERT and produces
contextualized word embeddings which are trained against a
joint MD and ED objective. We achieve state-of-the-art
results on several standard entity linking (EL) datasets.",
archivePrefix= "arXiv",
eprint = "1911.03834",
primaryClass = "cs.CL"
}
@article{martins-2019-joint-ner-el,
author = "Martins, Pedro Henrique and Marinho, Zita and Martins,
Andr{\'e} F. T.",
title = "Joint Learning of Named Entity Recognition and Entity
Linking",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1907.08243v1",
abstract = "Named entity recognition (NER) and entity linking (EL) are
two fundamentally related tasks, since in order to perform
EL, first the mentions to entities have to be
detected. However, most entity linking approaches disregard
the mention detection part, assuming that the correct
mentions have been previously detected. In this paper, we
perform joint learning of NER and EL to leverage their
relatedness and obtain a more robust and generalisable
system. For that, we introduce a model inspired by the
Stack-LSTM approach (Dyer et al., 2015). We observe that, in
fact, doing multi-task learning of NER and EL improves the
performance in both tasks when comparing with models trained
with individual objectives. Furthermore, we achieve results
competitive with the state-of-the-art in both NER and EL.",
archivePrefix= "arXiv",
eprint = "1907.08243",
primaryClass = "cs.CL"
}
@inproceedings{logeswaran-2019-zero-shot-el,
title = "Zero-Shot Entity Linking by Reading Entity Descriptions",
author = "Logeswaran, Lajanugen and Chang, Ming-Wei and Lee, Kenton and
Toutanova, Kristina and Devlin, Jacob and Lee, Honglak",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1335",
doi = "10.18653/v1/P19-1335",
pages = "3449-3460",
abstract = "We present the zero-shot entity linking task, where mentions
must be linked to unseen entities without in-domain labeled
data. The goal is to enable robust transfer to highly
specialized domains, and so no metadata or alias tables are
assumed. In this setting, entities are only identified by
text descriptions, and models must rely strictly on language
understanding to resolve the new entities. First, we show
that strong reading comprehension models pre-trained on large
unlabeled data can be used to generalize to unseen
entities. Second, we propose a simple and effective adaptive
pre-training strategy, which we term domain-adaptive
pre-training (DAP), to address the domain shift problem
associated with linking unseen entities in a new domain. We
present experiments on a new dataset that we construct for
this task and show that DAP improves over strong pre-training
baselines, including BERT. The data and code are available at
https://github.com/lajanugen/zeshel."
}
@inproceedings{le-2019-distant-el,
title = "Distant Learning for Entity Linking with Automatic Noise
Detection",
author = "Le, Phong and Titov, Ivan",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1400",
doi = "10.18653/v1/P19-1400",
pages = "4081-4090",
abstract = "Accurate entity linkers have been produced for domains and
languages where annotated data (i.e., texts linked to a
knowledge base) is available. However, little progress has
been made for the settings where no or very limited amounts
of labeled data are present (e.g., legal or most scientific
domains). In this work, we show how we can learn to link
mentions without having any labeled examples, only a
knowledge base and a collection of unannotated texts from the
corresponding domain. In order to achieve this, we frame the
task as a multi-instance learning problem and rely on surface
matching to create initial noisy labels. As the learning
signal is weak and our surrogate labels are noisy, we
introduce a noise detection component in our model: it lets
the model detect and disregard examples which are likely to
be noisy. Our method, jointly learning to detect noise and
link entities, greatly outperforms the surface matching
baseline. For a subset of entity categories, it even
approaches the performance of supervised learning."
}
@inproceedings{mondal-2019-triplet-network-el,
title = "Medical Entity Linking using Triplet Network",
author = "Mondal, Ishani and Purkayastha, Sukannya and Sarkar, Sudeshna
and Goyal, Pawan and Pillai, Jitesh and Bhattacharyya,
Amitava and Gattu, Mahanandeeshwar",
booktitle = "Proceedings of the 2nd Clinical Natural Language Processing
Workshop",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W19-1912",
doi = "10.18653/v1/W19-1912",
pages = "95-100",
abstract = "Entity linking (or Normalization) is an essential task in
text mining that maps the entity mentions in the medical text
to standard entities in a given Knowledge Base (KB). This
task is of great importance in the medical domain. It can
also be used for merging different medical and clinical
ontologies. In this paper, we focus on the problem of
disease linking or normalization. This task is executed in
two phases: candidate generation and candidate scoring. In
this paper, we present an approach to rank the candidate
Knowledge Base entries based on their similarity with disease
mention. We make use of the Triplet Network for candidate
ranking. While the existing methods have used carefully
generated sieves and external resources for candidate
generation, we introduce a robust and portable candidate
generation scheme that does not make use of the hand-crafted
rules. Experimental results on the standard benchmark NCBI
disease dataset demonstrate that our system outperforms the
prior methods by a significant margin."
}
@article{yang-2019-dca,
author = "Yang, Xiyuan and Gu, Xiaotao and Lin, Sheng and Tang, Siliang
and Zhuang, Yueting and Wu, Fei and Chen, Zhigang and Hu,
Guoping and Ren, Xiang",
title = "Learning Dynamic Context Augmentation for Global Entity
Linking",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1909.02117v1",
abstract = "Despite of the recent success of collective entity linking
(EL) methods, these ``global`` inference methods may yield
sub-optimal results when the ``all-mention coherence``
assumption breaks, and often suffer from high computational
cost at the inference stage, due to the complex search
space. In this paper, we propose a simple yet effective
solution, called Dynamic Context Augmentation (DCA), for
collective EL, which requires only one pass through the
mentions in a document. DCA sequentially accumulates context
information to make efficient, collective inference, and can
cope with different local EL models as a plug-and-enhance
module. We explore both supervised and reinforcement learning
strategies for learning the DCA model. Extensive experiments
show the effectiveness of our model with different learning
settings, base models, decision orders and attention
mechanisms.",
archivePrefix= "arXiv",
eprint = "1909.02117",
primaryClass = "cs.CL"
}
@inproceedings{murty-2018-hierarchical-losses,
title = "Hierarchical Losses and New Resources for Fine-grained Entity
Typing and Linking",
author = "Murty, Shikhar and Verga, Patrick and Vilnis, Luke and
Radovanovic, Irena and McCallum, Andrew",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1010",
doi = "10.18653/v1/P18-1010",
pages = "97-109",
abstract = "Extraction from raw text to a knowledge base of entities and
fine-grained types is often cast as prediction into a flat
set of entity and type labels, neglecting the rich
hierarchies over types and entities contained in curated
ontologies. Previous attempts to incorporate hierarchical
structure have yielded little benefit and are restricted to
shallow ontologies. This paper presents new methods using
real and complex bilinear mappings for integrating
hierarchical information, yielding substantial improvement
over flat predictions in entity linking and fine-grained
entity typing, and achieving new state-of-the-art results for
end-to-end models on the benchmark FIGER dataset. We also
present two new human-annotated datasets containing wide and
deep hierarchies which we will release to the community to
encourage further research in this direction:
\textit{MedMentions}, a collection of PubMed abstracts in
which 246k mentions have been mapped to the massive UMLS
ontology; and \textit{TypeNet}, which aligns Freebase types
with the WordNet hierarchy to obtain nearly 2k entity
types. In experiments on all three datasets we show
substantial gains from hierarchy-aware training."
}
@inproceedings{zhong-2018-colink,
title = "Colink: An unsupervised framework for user identity linkage",
author = "Zhong, Zexuan and Cao, Yong and Guo, Mu and Nie, Zaiqing",
booktitle = "Thirty-Second AAAI Conference on Artificial Intelligence",
year = 2018
}
@inproceedings{du-2019-extract-symptoms,
title = "Extracting Symptoms and their Status from Clinical
Conversations",
author = "Du, Nan and Chen, Kai and Kannan, Anjuli and Tran, Linh and
Chen, Yuhui and Shafran, Izhak",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1087",
doi = "10.18653/v1/P19-1087",
pages = "915-925",
abstract = "This paper describes novel models tailored for a new
application, that of extracting the symptoms mentioned in
clinical conversations along with their status. Lack of any
publicly available corpus in this privacy-sensitive domain
led us to develop our own corpus, consisting of about 3K
conversations annotated by professional medical scribes. We
propose two novel deep learning approaches to infer the
symptom names and their status: (1) a new hierarchical
span-attribute tagging (SA-T) model, trained using curriculum
learning, and (2) a variant of sequence-to-sequence model
which decodes the symptoms and their status from a few
speaker turns within a sliding window over the
conversation. This task stems from a realistic application of
assisting medical providers in capturing symptoms mentioned
by patients from their clinical conversations. To reflect
this application, we define multiple metrics. From
inter-rater agreement, we find that the task is inherently
difficult. We conduct comprehensive evaluations on several
contrasting conditions and observe that the performance of
the models ranges from an F-score of 0.5 to 0.8 depending on
the condition. Our analysis not only reveals the inherent
challenges of the task, but also provides useful directions
to improve the models."
}
@article{sarrouti-2020-sembionlqa,
title = "SemBioNLQA: A semantic biomedical question answering system
for retrieving exact and ideal answers to natural language
questions",
journal = "Artificial Intelligence in Medicine",
volume = 102,
pages = 101767,
year = 2020,
issn = "0933-3657",
doi = "https://doi.org/10.1016/j.artmed.2019.101767",
url =
"http://www.sciencedirect.com/science/article/pii/S0933365718302756",
author = "Mourad Sarrouti and Said [Ouatik El Alaoui]",
keywords = "Biomedical question answering, Information retrieval, Passage
retrieval, Natural language processing, Machine learning,
Biomedical informatics, BioASQ",
abstract = "Background and objective Question answering (QA), the
identification of short, accurate answers to users' questions
written in natural language expressions, is a longstanding
issue widely studied over the last decades in the
open-domain. However, it still remains a real challenge in
the biomedical domain, as most of the existing systems
support a limited number of question and answer types and
still require further effort to improve their precision on
the supported questions. Here, we present a semantic
biomedical QA system
named SemBioNLQA which has the ability to handle the kinds of
yes/no, factoid, list, and summary natural language
questions. Methods This paper describes the system
architecture and an evaluation of the developed end-to-end
biomedical QA system named SemBioNLQA, which consists of
question classification, document retrieval, passage
retrieval and answer extraction modules. It takes natural
language questions as input, and outputs both short precise
answers and summaries as results. The SemBioNLQA system,
dealing with four types of questions, is based on (1)
handcrafted lexico-syntactic patterns and a machine learning
algorithm for question classification, (2) PubMed search
engine and UMLS similarity for document retrieval, (3) the
BM25 model, stemmed words and UMLS concepts for passage
retrieval, and (4) UMLS metathesaurus, BioPortal synonyms,
sentiment analysis and term frequency metric for answer
extraction. Results and conclusion Compared with the current
state-of-the-art biomedical QA systems, SemBioNLQA, a fully
automated system, has the potential to deal with a large
number of question and answer types. SemBioNLQA quickly
satisfies users' information needs by returning exact answers
(e.g., “yes”, “no”, a biomedical entity name, etc.) and ideal
answers (i.e., paragraph-sized summaries of relevant
information) for yes/no, factoid and list questions, whereas
it provides only the ideal answers for summary
questions. Moreover, experimental evaluations performed on
biomedical questions and answers provided by the BioASQ
challenge especially in 2015, 2016 and 2017 (as part of our
participation), show that SemBioNLQA achieves good
performances compared with the most current state-of-the-art
systems and allows a practical and competitive alternative to
help information seekers find exact and ideal answers to
their biomedical questions. The SemBioNLQA source code is
publicly available at
https://github.com/sarrouti/sembionlqa."
}
@article{demner-fushman-2019-health-qa,
title = "Consumer health information and question answering: helping
consumers find answers to their health-related information
needs",
author = "Dina Demner-Fushman and Yassine Mrabet and Asma Ben Abacha",
journal = "Journal of the American Medical Informatics Association :
JAMIA",
year = 2019
}
@inproceedings{lin-2019-symptom-graph,
title = "Enhancing Dialogue Symptom Diagnosis with Global Attention
and Symptom Graph",
author = "Lin, Xinzhu and He, Xiahui and Chen, Qin and Tou, Huaixiao
and Wei, Zhongyu and Chen, Ting",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1508",
doi = "10.18653/v1/D19-1508",
pages = "5033-5042",
abstract = "Symptom diagnosis is a challenging yet profound problem in
natural language processing. Most previous research focuses on
investigating the standard electronic medical records for
symptom diagnosis, while the dialogues between doctors and
patients, which contain richer information, are not well
studied. In this paper, we first construct a dialogue symptom
diagnosis dataset based on an online medical forum with a
large amount of dialogues between patients and doctors. Then,
we provide some benchmark models on this dataset to boost the
research of dialogue symptom diagnosis. In order to further
enhance the performance of symptom diagnosis over dialogues,
we propose a global attention mechanism to capture more
symptom related information, and build a symptom graph to
model the associations between symptoms rather than treating
each symptom independently. Experimental results show that
both the global attention and symptom graph are effective to
boost dialogue symptom diagnosis. In particular, our proposed
model achieves the state-of-the-art performance on the
constructed dataset."
}
@inproceedings{dusek-2016-context-aware,
title = "A Context-aware Natural Language Generator for Dialogue
Systems",
author = "Du{\v{s}}ek, Ond{\v{r}}ej and Jur{\v{c}}{\'\i}{\v{c}}ek,
Filip",
booktitle = "Proceedings of the 17th Annual Meeting of the Special
Interest Group on Discourse and Dialogue",
month = sep,
year = 2016,
address = "Los Angeles",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W16-3622",
doi = "10.18653/v1/W16-3622",
pages = "185-190"
}
@inproceedings{ghosal-2019-dialogue-gcn,
title = "{D}ialogue{GCN}: A Graph Convolutional Neural Network for
Emotion Recognition in Conversation",
author = "Ghosal, Deepanway and Majumder, Navonil and Poria, Soujanya
and Chhaya, Niyati and Gelbukh, Alexander",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1015",
doi = "10.18653/v1/D19-1015",
pages = "154-164",
abstract = "Emotion recognition in conversation (ERC) has received much
attention lately from researchers due to its potential
widespread applications in diverse areas, such as
health-care, education, and human resources. In this paper,
we present Dialogue Graph Convolutional Network
(DialogueGCN), a graph neural network based approach to
ERC. We leverage self and inter-speaker dependency of the
interlocutors to model conversational context for emotion
recognition. Through the graph network, DialogueGCN addresses
context propagation issues present in the current RNN-based
methods. We empirically show that this method alleviates such
issues, while outperforming the current state of the art on a
number of benchmark emotion classification datasets."
}
@inproceedings{chen-2019-working-memory,
title = "A Working Memory Model for Task-oriented Dialog Response
Generation",
author = "Chen, Xiuyi and Xu, Jiaming and Xu, Bo",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1258",
doi = "10.18653/v1/P19-1258",
pages = "2687-2693",
abstract = "Recently, to incorporate external Knowledge Base (KB)
information, one form of world knowledge, several end-to-end
task-oriented dialog systems have been proposed. These
models, however, tend to confound the dialog history with KB
tuples and simply store them into one memory. Inspired by the
psychological studies on working memory, we propose a working
memory model (WMM2Seq) for dialog response generation. Our
WMM2Seq adopts a working memory to interact with two
separated long-term memories, which are the episodic memory
for memorizing dialog history and the semantic memory for
storing KB tuples. The working memory consists of a central
executive to attend to the aforementioned memories, and a
short-term storage system to store the {``}activated{''}
contents from the long-term memories. Furthermore, we
introduce a context-sensitive perceptual process for the
token representations of dialog history, and then feed them
into the episodic memory. Extensive experiments on two
task-oriented dialog datasets demonstrate that our WMM2Seq
significantly outperforms the state-of-the-art results in
several evaluation metrics."
}
@inproceedings{su-2019-utterance-rewriter,
title = "Improving Multi-turn Dialogue Modelling with Utterance
{R}e{W}riter",
author = "Su, Hui and Shen, Xiaoyu and Zhang, Rongzhi and Sun, Fei and
Hu, Pengwei and Niu, Cheng and Zhou, Jie",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1003",
doi = "10.18653/v1/P19-1003",
pages = "22-31",
abstract = "Recent research has achieved impressive results in
single-turn dialogue modelling. In the multi-turn setting,
however, current models are still far from satisfactory. One
major challenge is the frequently occurring coreference and
information omission in our daily conversation, making it
hard for machines to understand the real intention. In this
paper, we propose rewriting the human utterance as a
pre-process to help multi-turn dialogue modelling. Each
utterance is first rewritten to recover all coreferred and
omitted information. The next processing steps are then
performed based on the rewritten utterance. To properly train
the utterance rewriter, we collect a new dataset with human
annotations and introduce a Transformer-based utterance
rewriting architecture using the pointer network. We show the
proposed architecture achieves remarkably good performance on
the utterance rewriting task. The trained utterance rewriter
can be easily integrated into online chatbots and brings
general improvement over different domains."
}
@inproceedings{ippolito-2019-decoding-methods,
title = "Comparison of Diverse Decoding Methods from Conditional
Language Models",
author = "Ippolito, Daphne and Kriz, Reno and Sedoc, Jo{\~a}o and
Kustikova, Maria and Callison-Burch, Chris",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1365",
doi = "10.18653/v1/P19-1365",
pages = "3752-3762",
abstract = "While conditional language models have greatly improved in
their ability to output high quality natural language, many
NLP applications benefit from being able to generate a
diverse set of candidate sequences. Diverse decoding
strategies aim to, within a given-sized candidate list, cover
as much of the space of high-quality outputs as possible,
leading to improvements for tasks that rerank and combine
candidate outputs. Standard decoding methods, such as beam
search, optimize for generating high likelihood sequences
rather than diverse ones, though recent work has focused on
increasing diversity in these methods. In this work, we
perform an extensive survey of decoding-time strategies for
generating diverse outputs from a conditional language
model. In addition, we present a novel method where we
over-sample candidates, then use clustering to remove similar
sequences, thus achieving high diversity without sacrificing
quality."
}
@inproceedings{qian-2019-daml,
title = "Domain Adaptive Dialog Generation via Meta Learning",
author = "Qian, Kun and Yu, Zhou",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1253",
doi = "10.18653/v1/P19-1253",
pages = "2639-2649",
abstract = "Domain adaptation is an essential task in dialog system
building because there are so many new dialog tasks created
for different needs every day. Collecting and annotating
training data for these new tasks is costly since it involves
real user interactions. We propose a domain adaptive dialog
generation method based on meta-learning (DAML). DAML is an
end-to-end trainable dialog system model that learns from
multiple rich-resource tasks and then adapts to new domains
with minimal training samples. We train a dialog system model
using multiple rich-resource single-domain dialog data by
applying the model-agnostic meta-learning algorithm to dialog
domain. The model is capable of learning a competitive dialog
system on a new domain with only a few training examples in
an efficient manner. The two-step gradient updates in DAML
enable the model to learn general features across multiple
tasks. We evaluate our method on a simulated dialog dataset
and achieve state-of-the-art performance, which is
generalizable to new tasks."
}
@inproceedings{sankar-2019-conversation-history,
title = "Do Neural Dialog Systems Use the Conversation History
Effectively? An Empirical Study",
author = "Sankar, Chinnadhurai and Subramanian, Sandeep and Pal, Chris
and Chandar, Sarath and Bengio, Yoshua",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1004",
doi = "10.18653/v1/P19-1004",
pages = "32-37",
abstract = "Neural generative models have been become increasingly
popular when building conversational agents. They offer
flexibility, can be easily adapted to new domains, and
require minimal domain engineering. A common criticism of
these systems is that they seldom understand or use the
available dialog history effectively. In this paper, we take
an empirical approach to understanding how these models use
the available dialog history by studying the sensitivity of
the models to artificially introduced unnatural changes or
perturbations to their context at test time. We experiment
with 10 different types of perturbations on 4 multi-turn
dialog datasets and find that commonly used neural dialog
architectures like recurrent and transformer-based seq2seq
models are rarely sensitive to most perturbations such as
missing or reordering utterances, shuffling words, etc. Also,
by open-sourcing our code, we believe that it will serve as a
useful diagnostic tool for evaluating dialog systems in the
future."
}
@inproceedings{quan-2019-gecor,
title = "{GECOR}: An End-to-End Generative Ellipsis and Co-reference
Resolution Model for Task-Oriented Dialogue",
author = "Quan, Jun and Xiong, Deyi and Webber, Bonnie and Hu,
Changjian",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1462",
doi = "10.18653/v1/D19-1462",
pages = "4547-4557",
abstract = "Ellipsis and co-reference are common and ubiquitous
especially in multi-turn dialogues. In this paper, we treat
the resolution of ellipsis and co-reference in dialogue as a
problem of generating omitted or referred expressions from
the dialogue context. We therefore propose a unified
end-to-end Generative Ellipsis and CO-reference Resolution
model (GECOR) in the context of dialogue. The model can
generate a new pragmatically complete user utterance by
alternating the generation and copy mode for each user
utterance. A multi-task learning framework is further
proposed to integrate the GECOR into an end-to-end
task-oriented dialogue. In order to train both the GECOR and
the multi-task learning framework, we manually construct a
new dataset on the basis of the public dataset CamRest676
with both ellipsis and co-reference annotation. On this
dataset, intrinsic evaluations on the resolution of ellipsis
and co-reference show that the GECOR model significantly
outperforms the sequence-to-sequence (seq2seq) baseline model
in terms of EM, BLEU and F1 while extrinsic evaluations on
the downstream dialogue task demonstrate that our multi-task
learning framework with GECOR achieves a higher success rate
of task completion than TSCP, a state-of-the-art end-to-end
task-oriented dialogue model."
}
@inproceedings{zhao-2018-zsdg,
title = "Zero-Shot Dialog Generation with Cross-Domain Latent Actions",
author = "Zhao, Tiancheng and Eskenazi, Maxine",
booktitle = "Proceedings of the 19th Annual {SIG}dial Meeting on Discourse
and Dialogue",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W18-5001",
doi = "10.18653/v1/W18-5001",
pages = "1-10",
abstract = "This paper introduces zero-shot dialog generation (ZSDG), as
a step towards neural dialog systems that can instantly
generalize to new situations with minimum data. ZSDG requires
an end-to-end generative dialog system to generalize to a new
domain for which only a domain description is provided and no
training dialogs are available. Then a novel learning
framework, Action Matching, is proposed. This algorithm can
learn a cross-domain embedding space that models the
semantics of dialog responses which in turn, enables a neural
dialog generation model to generalize to new domains. We
evaluate our methods on two datasets, a new synthetic dialog
dataset, and an existing human-human multi-domain dialog
dataset. Experimental results show that our method is able to
achieve superior performance in learning dialog models that
can rapidly adapt their behavior to new domains and suggests
promising future research."
}
@article{zhao-2018-unsupervised-dg,
author = "Tiancheng Zhao and Kyusong Lee and Maxine Esk{\'{e}}nazi",
title = "Unsupervised Discrete Sentence Representation Learning for
Interpretable Neural Dialog Generation",
journal = "CoRR",
volume = "abs/1804.08069",
year = 2018,
url = "http://arxiv.org/abs/1804.08069",
archivePrefix= "arXiv",
eprint = "1804.08069",
timestamp = "Mon, 13 Aug 2018 16:46:01 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1804-08069.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{shalyminov-2019-few-shot-dg,
title = "Few-Shot Dialogue Generation Without Annotated Data: A
Transfer Learning Approach",
author = "Shalyminov, Igor and Lee, Sungjin and Eshghi, Arash and
Lemon, Oliver",
booktitle = "Proceedings of the 20th Annual SIGdial Meeting on Discourse
and Dialogue",
month = sep,
year = 2019,
address = "Stockholm, Sweden",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W19-5904",
doi = "10.18653/v1/W19-5904",
pages = "32-39",
abstract = "Learning with minimal data is one of the key challenges in
the development of practical, production-ready goal-oriented
dialogue systems. In a real-world enterprise setting where
dialogue systems are developed rapidly and are expected to
work robustly for an ever-growing variety of domains,
products, and scenarios, efficient learning from a limited
number of examples becomes indispensable. In this paper, we
introduce a technique to achieve state-of-the-art dialogue
generation performance in a few-shot setup, without using any
annotated data. We do this by leveraging background knowledge
from a larger, more highly represented dialogue source {---}
namely, the MetaLWOz dataset. We evaluate our model on the
Stanford Multi-Domain Dialogue Dataset, consisting of
human-human goal-oriented dialogues in in-car navigation,
appointment scheduling, and weather information domains. We
show that our few-shot approach achieves state-of-the-art
results on that dataset by consistently outperforming the
previous best model in terms of BLEU and Entity F1 scores,
while being more data-efficient than it by not requiring any
data annotation."
}
@inproceedings{lei-2018-sequicity,
title = "{S}equicity: Simplifying Task-oriented Dialogue Systems with
Single Sequence-to-Sequence Architectures",
author = "Lei, Wenqiang and Jin, Xisen and Kan, Min-Yen and Ren,
Zhaochun and He, Xiangnan and Yin, Dawei",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1133",
doi = "10.18653/v1/P18-1133",
pages = "1437-1447",
abstract = "Existing solutions to task-oriented dialogue systems follow
pipeline designs which introduces architectural complexity
and fragility. We propose a novel, holistic, extendable
framework based on a single sequence-to-sequence (seq2seq)
model which can be optimized with supervised or reinforcement
learning. A key contribution is that we design text spans
named belief spans to track dialogue beliefs, allowing
task-oriented dialogue systems to be modeled in a seq2seq
way. Based on this, we propose a simplistic Two Stage CopyNet
instantiation which demonstrates good scalability:
significantly reducing model complexity in terms of number of
parameters and training time by an order of magnitude. It significantly
outperforms state-of-the-art pipeline-based methods on large
datasets and retains a satisfactory entity match rate on
out-of-vocabulary (OOV) cases where pipeline-designed
competitors totally fail."
}
@article{liu-2019-nmrc-methods,
author = "Liu, Shanshan and Zhang, Xin and Zhang, Sheng and Wang, Hui
and Zhang, Weiming",
title = "Neural Machine Reading Comprehension: Methods and Trends",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1907.01118v5",
abstract = "Machine reading comprehension (MRC), which requires a machine
to answer questions based on a given context, has attracted
increasing attention with the incorporation of various
deep-learning techniques over the past few years. Although
research on MRC based on deep learning is flourishing, there
remains a lack of a comprehensive survey summarizing existing
approaches and recent trends, which motivated the work
presented in this article. Specifically, we give a thorough
review of this research field, covering different aspects
including (1) typical MRC tasks: their definitions,
differences, and representative datasets; (2) the general
architecture of neural MRC: the main modules and prevalent
approaches to each; and (3) new trends: some emerging areas
in neural MRC as well as the corresponding
challenges. Finally, considering what has been achieved so
far, the survey also envisages what the future may hold by
discussing the open issues left to be addressed.",
archivePrefix= "arXiv",
eprint = "1907.01118",
primaryClass = "cs.CL"
}
@phdthesis{chen-2018-nrc-beyond,
title = "Neural reading comprehension and beyond",
author = "Chen, Danqi",
year = 2018,
school = "Stanford University"
}
@inproceedings{trotman-2014-improve-bm25,
author = "Trotman, Andrew and Puurula, Antti and Burgess, Blake",
title = "Improvements to BM25 and Language Models Examined",
year = 2014,
isbn = 9781450330008,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2682862.2682863",
doi = "10.1145/2682862.2682863",
booktitle = "Proceedings of the 2014 Australasian Document Computing
Symposium",
pages = "58–65",
numpages = 8,
keywords = "Procrastination, Document Retrieval, Relevance Ranking",
location = "Melbourne, VIC, Australia",
series = "ADCS ’14"
}
@article{nogueira-2019-bert-re-ranking,
author = "Nogueira, Rodrigo and Cho, Kyunghyun",
title = "Passage Re-Ranking With Bert",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1901.04085v5",
abstract = "Recently, neural models pretrained on a language modeling
task, such as ELMo (Peters et al., 2017), OpenAI GPT (Radford
et al., 2018), and BERT (Devlin et al., 2018), have achieved
impressive results on various natural language processing
tasks such as question-answering and natural language
inference. In this paper, we describe a simple
re-implementation of BERT for query-based passage
re-ranking. Our system is the state of the art on the
TREC-CAR dataset and the top entry in the leaderboard of the
MS MARCO passage retrieval task, outperforming the previous
state of the art by 27 \% (relative) in MRR@10. The code to
reproduce our results is available at
https://github.com/nyu-dl/dl4marco-bert",
archivePrefix= "arXiv",
eprint = "1901.04085",
primaryClass = "cs.IR"
}
@article{bajaj-2016-ms-marco,
author = "Bajaj, Payal and Campos, Daniel and Craswell, Nick and Deng,
Li and Gao, Jianfeng and Liu, Xiaodong and Majumder, Rangan
and McNamara, Andrew and Mitra, Bhaskar and Nguyen, Tri and
Rosenberg, Mir and Song, Xia and Stoica, Alina and Tiwary,
Saurabh and Wang, Tong",
title = "Ms Marco: a Human Generated Machine Reading Comprehension
Dataset",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1611.09268v3",
abstract = "We introduce a large scale MAchine Reading COmprehension
dataset, which we name MS MARCO. The dataset comprises
1,010,916 anonymized questions---sampled from Bing's search
query logs---each with a human generated answer and 182,669
completely human rewritten generated answers. In addition,
the dataset contains 8,841,823 passages---extracted from
3,563,535 web documents retrieved by Bing---that provide the
information necessary for curating the natural language
answers. A question in the MS MARCO dataset may have multiple
answers or no answers at all. Using this dataset, we propose
three different tasks with varying levels of difficulty: (i)
predict if a question is answerable given a set of context
passages, and extract and synthesize the answer as a human
would (ii) generate a well-formed answer (if possible) based
on the context passages that can be understood with the
question and passage context, and finally (iii) rank a set of
retrieved passages given a question. The size of the dataset
and the fact that the questions are derived from real user
search queries distinguish MS MARCO from other well-known
publicly available datasets for machine reading comprehension
and question-answering. We believe that the scale and the
real-world nature of this dataset makes it attractive for
benchmarking machine reading comprehension and
question-answering models.",
archivePrefix= "arXiv",
eprint = "1611.09268",
primaryClass = "cs.CL"
}
@article{qiao-2019-bert-re-ranking,
author = "Qiao, Yifan and Xiong, Chenyan and Liu, Zhenghao and Liu,
Zhiyuan",
title = "Understanding the Behaviors of Bert in Ranking",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1904.07531v4",
abstract = "This paper studies the performances and behaviors of BERT in
ranking tasks. We explore several different ways to leverage
the pre-trained BERT and fine-tune it on two ranking tasks:
MS MARCO passage reranking and TREC Web Track ad hoc document
ranking. Experimental results on MS MARCO demonstrate the
strong effectiveness of BERT in question-answering focused
passage ranking tasks, as well as the fact that BERT is a
strong interaction-based seq2seq matching model. Experimental
results on TREC show the gaps between the BERT pre-trained on
surrounding contexts and the needs of ad hoc document
ranking. Analyses illustrate how BERT allocates its
attentions between query-document tokens in its Transformer
layers, how it prefers semantic matches between paraphrase
tokens, and how that differs with the soft match patterns
learned by a click-trained neural ranker.",
archivePrefix= "arXiv",
eprint = "1904.07531",
primaryClass = "cs.IR"
}
@article{pei-2019-re-ranking-recommendation,
author = "Pei, Changhua and Zhang, Yi and Zhang, Yongfeng and Sun, Fei
and Lin, Xiao and Sun, Hanxiao and Wu, Jian and Jiang, Peng
and Ou, Wenwu",
title = "Personalized Re-Ranking for Recommendation",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1904.06813v3",
abstract = "Ranking is a core task in recommender systems, which aims at
providing an ordered list of items to users. Typically, a
ranking function is learned from the labeled dataset to
optimize the global performance, which produces a ranking
score for each individual item. However, it may be
sub-optimal because the scoring function applies to each item
individually and does not explicitly consider the mutual
influence between items, as well as the differences of users'
preferences or intents. Therefore, we propose a personalized
re-ranking model for recommender systems. The proposed
re-ranking model can be easily deployed as a follow-up
module after any ranking algorithm, by directly using the
existing ranking feature vectors. It directly optimizes the
whole recommendation list by employing a transformer
structure to efficiently encode the information of all items
in the list. Specifically, the Transformer applies a
self-attention mechanism that directly models the global
relationships between any pair of items in the whole list. We
confirm that the performance can be further improved by
introducing pre-trained embedding to learn personalized
encoding functions for different users. Experimental results
on both offline benchmarks and real-world online e-commerce
systems demonstrate the significant improvements of the
proposed re-ranking model.",
archivePrefix= "arXiv",
eprint = "1904.06813",
primaryClass = "cs.IR"
}
@inproceedings{kratzwald-2019-rankqa,
title = "{R}ank{QA}: Neural Question Answering with Answer Re-Ranking",
author = "Kratzwald, Bernhard and Eigenmann, Anna and Feuerriegel,
Stefan",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1611",
doi = "10.18653/v1/P19-1611",
pages = "6076-6085",
abstract = "The conventional paradigm in neural question answering (QA)
for narrative content is limited to a two-stage process:
first, relevant text passages are retrieved and,
subsequently, a neural network for machine comprehension
extracts the likeliest answer. However, both stages are
largely isolated in the status quo and, hence, information
from the two phases is never properly fused. In contrast,
this work proposes RankQA: RankQA extends the conventional
two-stage process in neural QA with a third stage that
performs an additional answer re-ranking. The re-ranking
leverages different features that are directly extracted from
the QA pipeline, i.e., a combination of retrieval and
comprehension features. While our intentionally simple design
allows for an efficient, data-sparse estimation, it
nevertheless outperforms more complex QA systems by a
significant margin: in fact, RankQA achieves state-of-the-art
performance on 3 out of 4 benchmark datasets. Furthermore,
its performance is especially superior in settings where the
size of the corpus is dynamic. Here the answer re-ranking
provides an effective remedy against the underlying
noise-information trade-off due to a variable corpus size. As
a consequence, RankQA represents a novel, powerful, and thus
challenging baseline for future research in content-based
QA."
}
@article{guu-2020-realm,
author = "Guu, Kelvin and Lee, Kenton and Tung, Zora and Pasupat,
Panupong and Chang, Ming-Wei",
title = "Realm: Retrieval-Augmented Language Model Pre-Training",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2002.08909v1",
abstract = "Language model pre-training has been shown to capture a
surprising amount of world knowledge, crucial for NLP tasks
such as question answering. However, this knowledge is stored
implicitly in the parameters of a neural network, requiring
ever-larger networks to cover more facts. To capture
knowledge in a more modular and interpretable way, we augment
language model pre-training with a latent knowledge
retriever, which allows the model to retrieve and attend over
documents from a large corpus such as Wikipedia, used during
pre-training, fine-tuning and inference. For the first time,
we show how to pre-train such a knowledge retriever in an
unsupervised manner, using masked language modeling as the
learning signal and backpropagating through a retrieval step
that considers millions of documents. We demonstrate the
effectiveness of Retrieval-Augmented Language Model
pre-training (REALM) by fine-tuning on the challenging task
of Open-domain Question Answering (Open-QA). We compare
against state-of-the-art models for both explicit and
implicit knowledge storage on three popular Open-QA
benchmarks, and find that we outperform all previous methods
by a significant margin (4-16 \% absolute accuracy), while
also providing qualitative benefits such as interpretability
and modularity.",
archivePrefix= "arXiv",
eprint = "2002.08909",
primaryClass = "cs.CL"
}
@article{yang-2019-bert-ad-hoc-doc,
author = "Yang, Wei and Zhang, Haotian and Lin, Jimmy",
title = "Simple Applications of Bert for Ad Hoc Document Retrieval",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1903.10972v1",
abstract = "Following recent successes in applying BERT to question
answering, we explore simple applications to ad hoc document
retrieval. This required confronting the challenge posed by
documents that are typically longer than the length of input
BERT was designed to handle. We address this issue by
applying inference on sentences individually, and then
aggregating sentence scores to produce document
scores. Experiments on TREC microblog and newswire test
collections show that our approach is simple yet effective,
as we report the highest average precision on these datasets
by neural approaches that we are aware of.",
archivePrefix= "arXiv",
eprint = "1903.10972",
primaryClass = "cs.IR"
}
@article{kowsari-2017-hdltex,
author = "Kowsari, Kamran and Brown, Donald E. and Heidarysafa, Mojtaba
and Meimandi, Kiana Jafari and Gerber, Matthew S. and Barnes,
Laura E.",
title = "Hdltex: Hierarchical Deep Learning for Text Classification",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1709.08267v2",
abstract = "The continually increasing number of documents produced each
year necessitates ever improving information processing
methods for searching, retrieving, and organizing
text. Central to these information processing methods is
document classification, which has become an important
application for supervised learning. Recently the performance
of these traditional classifiers has degraded as the number
of documents has increased. This is because along with this
growth in the number of documents has come an increase in the
number of categories. This paper approaches this problem
differently from current document classification methods that
view the problem as multi-class classification. Instead we
perform hierarchical classification using an approach we call
Hierarchical Deep Learning for Text classification
(HDLTex). HDLTex employs stacks of deep learning
architectures to provide specialized understanding at each
level of the document hierarchy.",
archivePrefix= "arXiv",
eprint = "1709.08267",
primaryClass = "cs.LG"
}
@article{shen-2014-entity-linking-solution,
title = "Entity linking with a knowledge base: Issues, techniques, and
solutions",
author = "Shen, Wei and Wang, Jianyong and Han, Jiawei",
journal = "IEEE Transactions on Knowledge and Data Engineering",
volume = 27,
number = 2,
pages = "443-460",
year = 2014,
publisher = "IEEE"
}
@inproceedings{ehrlinger-2016-kg-definition,
title = "Towards a Definition of Knowledge Graphs",
author = "Ehrlinger, Lisa and W{\"o}{\ss}, Wolfram",
booktitle = "SEMANTiCS (Posters, Demos, SuCCESS)",
year = 2016
}
@article{shen-2005-pairwise,
title = "Ranking and reranking with perceptron",
author = "Shen, Libin and Joshi, Aravind K",
journal = "Machine Learning",
volume = 60,
number = "1-3",
pages = "73-96",
year = 2005,
publisher = "Springer"
}
@inproceedings{cao-2007-listwise,
title = "Learning to rank: from pairwise approach to listwise
approach",
author = "Cao, Zhe and Qin, Tao and Liu, Tie-Yan and Tsai, Ming-Feng
and Li, Hang",
booktitle = "Proceedings of the 24th international conference on Machine
learning",
pages = "129-136",
year = 2007
}
@inproceedings{zheng-2010-learn-link,
title = "Learning to Link Entities with Knowledge Base",
author = "Zheng, Zhicheng and Li, Fangtao and Huang, Minlie and Zhu,
Xiaoyan",
booktitle = "Human Language Technologies: The 2010 Annual Conference of
the North {A}merican Chapter of the Association for
Computational Linguistics",
month = jun,
year = 2010,
address = "Los Angeles, California",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N10-1072",
pages = "483-491"
}
@inproceedings{chen-2011-collaborative-ranking,
title = "Collaborative Ranking: A Case Study on Entity Linking",
author = "Chen, Zheng and Ji, Heng",
booktitle = "Proceedings of the 2011 Conference on Empirical Methods in
Natural Language Processing",
month = jul,
year = 2011,
address = "Edinburgh, Scotland, UK.",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D11-1071",
pages = "771-781"
}
@inproceedings{han-2011-generative-el,
title = "A Generative Entity-Mention Model for Linking Entities with
Knowledge Base",
author = "Han, Xianpei and Sun, Le",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies",
month = jun,
year = 2011,
address = "Portland, Oregon, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P11-1095",
pages = "945-954"
}
@inproceedings{ngomo-2011-limes,
title = "LIMES—a time-efficient approach for large-scale link
discovery on the web of data",
author = "Ngomo, Axel-Cyrille Ngonga and Auer, S{\"o}ren",
booktitle = "Twenty-Second International Joint Conference on Artificial
Intelligence",
year = 2011
}
@article{sil-2017-cross-lingual-el,
author = "Sil, Avirup and Kundu, Gourab and Florian, Radu and Hamza,
Wael",
title = "Neural Cross-Lingual Entity Linking",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1712.01813v1",
abstract = "A major challenge in Entity Linking (EL) is making effective
use of contextual information to disambiguate mentions to
Wikipedia that might refer to different entities in different
contexts. The problem is exacerbated in cross-lingual EL, which
involves linking mentions written in non-English documents to
entries in the English Wikipedia: to compare textual clues
across languages we need to compute similarity between
textual fragments across languages. In this paper, we propose
a neural EL model that trains fine-grained similarities and
dissimilarities between the query and candidate document from
multiple perspectives, combined with convolution and tensor
networks. Further, we show that this English-trained system
can be applied, in zero-shot learning, to other languages by
making surprisingly effective use of multi-lingual
embeddings. The proposed system yields strong empirical
results, achieving state-of-the-art performance on the
English as well as the cross-lingual Spanish and Chinese TAC
2015 datasets.",
archivePrefix= "arXiv",
eprint = "1712.01813",
primaryClass = "cs.CL"
}
@inproceedings{hoffart-2011-robust-el,
title = "Robust Disambiguation of Named Entities in Text",
author = "Hoffart, Johannes and Yosef, Mohamed Amir and Bordino, Ilaria
and F{\"u}rstenau, Hagen and Pinkal, Manfred and Spaniol,
Marc and Taneva, Bilyana and Thater, Stefan and Weikum,
Gerhard",
booktitle = "Proceedings of the 2011 Conference on Empirical Methods in
Natural Language Processing",
month = jul,
year = 2011,
address = "Edinburgh, Scotland, UK.",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D11-1072",
pages = "782-792"
}
@inproceedings{sil-2013-re-ranking-joint-ner-el,
author = "Sil, Avirup and Yates, Alexander",
title = "Re-Ranking for Joint Named-Entity Recognition and Linking",
year = 2013,
isbn = 9781450322638,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2505515.2505601",
doi = "10.1145/2505515.2505601",
booktitle = "Proceedings of the 22nd ACM International Conference on
Information \& Knowledge Management",
pages = "2369–2374",
numpages = 6,
keywords = "named entity recognition, entity linking, entity
disambiguation",
location = "San Francisco, California, USA",
series = "CIKM ’13"
}
@inproceedings{guo-2013-to-link-not-to-link,
title = "To Link or Not to Link? A Study on End-to-End Tweet Entity
Linking",
author = "Guo, Stephen and Chang, Ming-Wei and Kiciman, Emre",
booktitle = "Proceedings of the 2013 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies",
month = jun,
year = 2013,
address = "Atlanta, Georgia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N13-1122",
pages = "1020-1030"
}
@inproceedings{pu-2010-structured-entities,
author = "Pu, Ken Q. and Hassanzadeh, Oktie and Drake, Richard and
Miller, Ren\'{e}e J.",
title = "Online Annotation of Text Streams with Structured Entities",
year = 2010,
isbn = 9781450300995,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/1871437.1871446",
doi = "10.1145/1871437.1871446",
booktitle = "Proceedings of the 19th ACM International Conference on
Information and Knowledge Management",
pages = "29–38",
numpages = 10,
keywords = "annotation, text stream, online, entity",
location = "Toronto, ON, Canada",
series = "CIKM ’10"
}
@inproceedings{zhang-2011-acronym-expansion-el,
author = "Zhang, Wei and Sim, Yan Chuan and Su, Jian and Tan, Chew Lim",
title = "Entity Linking with Effective Acronym Expansion, Instance
Selection and Topic Modeling",
year = 2011,
isbn = 9781577355151,
publisher = "AAAI Press",
booktitle = "Proceedings of the Twenty-Second International Joint
Conference on Artificial Intelligence - Volume Volume Three",
pages = "1909–1914",
numpages = 6,
location = "Barcelona, Catalonia, Spain",
series = "IJCAI’11"
}
@inproceedings{milne-2008-link-with-wiki,
author = "Milne, David and Witten, Ian H.",
title = "Learning to Link with Wikipedia",
year = 2008,
isbn = 9781595939913,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/1458082.1458150",
doi = "10.1145/1458082.1458150",
booktitle = "Proceedings of the 17th ACM Conference on Information and
Knowledge Management",
pages = "509–518",
numpages = 10,
keywords = "data mining, word sense disambiguation, wikipedia, semantic
annotation",
location = "Napa Valley, California, USA",
series = "CIKM ’08"
}
@inproceedings{ratinov-2011-local-global-wiki-el,
title = "Local and Global Algorithms for Disambiguation to
{W}ikipedia",
author = "Ratinov, Lev and Roth, Dan and Downey, Doug and Anderson,
Mike",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies",
month = jun,
year = 2011,
address = "Portland, Oregon, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P11-1138",
pages = "1375-1384"
}
@inproceedings{radhakrishnan-2018-elden,
title = "{ELDEN}: Improved Entity Linking Using Densified Knowledge
Graphs",
author = "Radhakrishnan, Priya and Talukdar, Partha and Varma,
Vasudeva",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long Papers)",
month = jun,
year = 2018,
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N18-1167",
doi = "10.18653/v1/N18-1167",
pages = "1844-1853",
abstract = "Entity Linking (EL) systems aim to automatically map mentions
of an entity in text to the corresponding entity in a
Knowledge Graph (KG). Degree of connectivity of an entity in
the KG directly affects an EL system{'}s ability to correctly
link mentions in text to the entity in KG. This causes many
EL systems to perform well for entities well connected to
other entities in KG, bringing into focus the role of KG
density in EL. In this paper, we propose Entity Linking using
Densified Knowledge Graphs (ELDEN). ELDEN is an EL system
which first densifies the KG with co-occurrence statistics
from a large text corpus, and then uses the densified KG to
train entity embeddings. Entity similarity measured using
these trained entity embeddings results in improved EL. ELDEN
outperforms state-of-the-art EL systems on benchmark
datasets. Due to such densification, ELDEN performs well for
sparsely connected entities in the KG too. ELDEN{'}s approach
is simple, yet effective. We have made ELDEN{'}s code and
data publicly available."
}
@inproceedings{piccinno-2014-tagme-to-wat,
author = "Piccinno, Francesco and Ferragina, Paolo",
title = "From TagME to WAT: A New Entity Annotator",
year = 2014,
isbn = 9781450330237,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2633211.2634350",
doi = "10.1145/2633211.2634350",
booktitle = "Proceedings of the First International Workshop on Entity
Recognition \& Disambiguation",
pages = "55–62",
numpages = 8,
keywords = "graph-based algorithms, wikipedia, entity annotation, tagme",
location = "Gold Coast, Queensland, Australia",
series = "ERD ’14"
}
@inproceedings{yamada-2016-joint-learn-embedding-el,
title = "Joint Learning of the Embedding of Words and Entities for
Named Entity Disambiguation",
author = "Yamada, Ikuya and Shindo, Hiroyuki and Takeda, Hideaki and
Takefuji, Yoshiyasu",
booktitle = "Proceedings of The 20th {SIGNLL} Conference on Computational
Natural Language Learning",
month = aug,
year = 2016,
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/K16-1025",
doi = "10.18653/v1/K16-1025",
pages = "250-259"
}
@inproceedings{henzinger-2006-duplicate-web-pages,
title = "Finding near-duplicate web pages: a large-scale evaluation of
algorithms",
author = "Henzinger, Monika",
booktitle = "Proceedings of the 29th annual international ACM SIGIR
conference on Research and development in information
retrieval",
pages = "284-291",
year = 2006
}
@inproceedings{charikar-2002-simhash,
title = "Similarity estimation techniques from rounding algorithms",
author = "Charikar, Moses S",
booktitle = "Proceedings of the thiry-fourth annual ACM symposium on
Theory of computing",
pages = "380-388",
year = 2002
}
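
For reference, a minimal Python sketch of the simhash fingerprint this paper is usually cited for: per-token hash bits vote on each fingerprint bit, and near-duplicate token sets land at small Hamming distance. Token hashing via md5 is an illustrative choice, not from the paper.

import hashlib

def simhash(tokens, bits=64):
    # Each token votes on every fingerprint bit: +1 where its hash has
    # a 1, -1 where it has a 0; the sign of the tally gives the bit.
    v = [0] * bits
    for tok in tokens:
        h = int(hashlib.md5(tok.encode("utf-8")).hexdigest(), 16)
        for i in range(bits):
            v[i] += 1 if (h >> i) & 1 else -1
    return sum(1 << i for i in range(bits) if v[i] > 0)

def hamming(a, b):
    return bin(a ^ b).count("1")

# Similar token multisets produce fingerprints at small Hamming distance.
print(hamming(simhash("the cat sat on the mat".split()),
              simhash("the cat sat on a mat".split())))
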
@article{reimers-2019-sentence-bert,
author = "Reimers, Nils and Gurevych, Iryna",
title = "Sentence-Bert: Sentence Embeddings Using Siamese
Bert-Networks",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1908.10084v1",
abstract = "BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has
set a new state-of-the-art performance on sentence-pair
regression tasks like semantic textual similarity
(STS). However, it requires that both sentences are fed into
the network, which causes a massive computational overhead:
Finding the most similar pair in a collection of 10,000
sentences requires about 50 million inference computations
(~65 hours) with BERT. The construction of BERT makes it
unsuitable for semantic similarity search as well as for
unsupervised tasks like clustering. In this publication, we
present Sentence-BERT (SBERT), a modification of the
pretrained BERT network that uses siamese and triplet network
structures to derive semantically meaningful sentence
embeddings that can be compared using cosine-similarity. This
reduces the effort for finding the most similar pair from 65
hours with BERT / RoBERTa to about 5 seconds with SBERT,
while maintaining the accuracy from BERT. We evaluate SBERT
and SRoBERTa on common STS tasks and transfer learning tasks,
where it outperforms other state-of-the-art sentence
embeddings methods.",
archivePrefix= "arXiv",
eprint = "1908.10084",
primaryClass = "cs.CL"
}
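
A minimal usage sketch of the siamese-encoder idea: embed each sentence once, then compare with cosine similarity. It assumes the external sentence-transformers package and the public all-MiniLM-L6-v2 checkpoint, neither of which is part of this entry.

from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")
emb = model.encode(["How do I reset my password?",
                    "Instructions for resetting a password"])
# Embeddings are computed independently, so a 10k-sentence collection
# needs 10k encoder passes rather than ~50M pairwise passes.
cos = emb[0].dot(emb[1]) / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]))
print(round(float(cos), 3))
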
@article{guo-2017-drmm,
author = "Guo, Jiafeng and Fan, Yixing and Ai, Qingyao and Croft,
W. Bruce",
title = "A Deep Relevance Matching Model for Ad-Hoc Retrieval",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1711.08611v1",
abstract = "In recent years, deep neural networks have led to exciting
breakthroughs in speech recognition, computer vision, and
natural language processing (NLP) tasks. However, there have
been few positive results of deep models on ad-hoc retrieval
tasks. This is partially due to the fact that many important
characteristics of the ad-hoc retrieval task have not been
well addressed in deep models yet. Typically, the ad-hoc
retrieval task is formalized as a matching problem between
two pieces of text in existing work using deep models, and
treated equivalent to many NLP tasks such as paraphrase
identification, question answering and automatic
conversation. However, we argue that the ad-hoc retrieval
task is mainly about relevance matching while most NLP
matching tasks concern semantic matching, and there are some
fundamental differences between these two matching
tasks. Successful relevance matching requires proper handling
of the exact matching signals, query term importance, and
diverse matching requirements. In this paper, we propose a
novel deep relevance matching model (DRMM) for ad-hoc
retrieval. Specifically, our model employs a joint deep
architecture at the query term level for relevance
matching. By using matching histogram mapping, a feed forward
matching network, and a term gating network, we can
effectively deal with the three relevance matching factors
mentioned above. Experimental results on two representative
benchmark collections show that our model can significantly
outperform some well-known retrieval models as well as
state-of-the-art deep matching models.",
archivePrefix= "arXiv",
eprint = "1711.08611",
primaryClass = "cs.IR"
}
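
A rough NumPy sketch of the matching histogram mapping described in the abstract, assuming a log-count (LCH-style) histogram over cosine similarities; the bin count and helper names are illustrative.

import numpy as np

def matching_histograms(q_vecs, d_vecs, bins=5):
    # One histogram per query term: cosine similarities to every
    # document term, binned over [-1, 1]; log counts give the LCH
    # variant of the matching histogram mapping.
    q = q_vecs / np.linalg.norm(q_vecs, axis=1, keepdims=True)
    d = d_vecs / np.linalg.norm(d_vecs, axis=1, keepdims=True)
    sims = q.dot(d.T)                        # (|q| x |d|) similarities
    edges = np.linspace(-1.0, 1.0, bins + 1)
    hist = np.stack([np.histogram(row, bins=edges)[0] for row in sims])
    return np.log1p(hist)

q, d = np.random.randn(3, 50), np.random.randn(40, 50)
print(matching_histograms(q, d).shape)       # (3, 5): per-term features
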
@inproceedings{hui-2017-pacrr,
title = "{PACRR}: A Position-Aware Neural {IR} Model for Relevance
Matching",
author = "Hui, Kai and Yates, Andrew and Berberich, Klaus and de Melo,
Gerard",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1110",
doi = "10.18653/v1/D17-1110",
pages = "1049-1058",
abstract = "In order to adopt deep learning for information retrieval,
models are needed that can capture all relevant information
required to assess the relevance of a document to a given
user query. While previous works have successfully captured
unigram term matches, how to fully employ position-dependent
information such as proximity and term dependencies has been
insufficiently explored. In this work, we propose a novel
neural IR model named PACRR aiming at better modeling
position-dependent interactions between a query and a
document. Extensive experiments on six years{'} TREC Web
Track data confirm that the proposed model yields better
results under multiple benchmarks."
}
@article{malkov-2016-hnsw,
author = "Malkov, Yu. A. and Yashunin, D. A.",
title = "Efficient and Robust Approximate Nearest Neighbor Search
Using Hierarchical Navigable Small World Graphs",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1603.09320v4",
abstract = "We present a new approach for the approximate K-nearest
neighbor search based on navigable small world graphs with
controllable hierarchy (Hierarchical NSW, HNSW). The proposed
solution is fully graph-based, without any need for
additional search structures, which are typically used at the
coarse search stage of the most proximity graph
techniques. Hierarchical NSW incrementally builds a
multi-layer structure consisting of a hierarchical set of
proximity graphs (layers) for nested subsets of the stored
elements. The maximum layer in which an element is present is
selected randomly with an exponentially decaying probability
distribution. This allows producing graphs similar to the
previously studied Navigable Small World (NSW) structures
while additionally having the links separated by their
characteristic distance scales. Starting search from the
upper layer together with utilizing the scale separation
boosts the performance compared to NSW and allows a
logarithmic complexity scaling. Additional employment of a
heuristic for selecting proximity graph neighbors
significantly increases performance at high recall and in
case of highly clustered data. Performance evaluation has
demonstrated that the proposed general metric space search
index is able to strongly outperform previous opensource
state-of-the-art vector-only approaches. Similarity of the
algorithm to the skip list structure allows straightforward
balanced distributed implementation.",
archivePrefix= "arXiv",
eprint = "1603.09320",
primaryClass = "cs.DS"
}
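
The exponentially decaying layer assignment mentioned in the abstract reduces to one line; a small sketch, where m_l is the paper's level-normalization constant.

import math, random
from collections import Counter

def random_level(m_l=1.0):
    # Layer assignment: level = floor(-ln(U) * mL) with U ~ Uniform(0, 1],
    # so layer membership decays exponentially with height.
    return int(-math.log(1.0 - random.random()) * m_l)

# Most elements live only in layer 0; each higher layer holds
# exponentially fewer elements.
print(Counter(random_level() for _ in range(100000)))
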
@article{liu-2009-learning-to-rank,
title = "Learning to rank for information retrieval",
author = "Liu, Tie-Yan",
journal = "Foundations and trends in information retrieval",
volume = 3,
number = 3,
pages = "225-331",
year = 2009,
publisher = "Now Publishers Inc."
}
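
As one concrete instance from the pairwise family this survey covers, a hinge-style objective over (relevant, non-relevant) score pairs; the function name and margin are illustrative.

import numpy as np

def pairwise_hinge(score_rel, score_nonrel, margin=1.0):
    # A relevant document should outscore a non-relevant one for the
    # same query by at least the margin; violations are penalized.
    return float(np.maximum(0.0, margin - (score_rel - score_nonrel)).mean())

print(pairwise_hinge(np.array([2.0, 0.3]), np.array([1.5, 0.8])))
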
@article{marrero-2013-survey-ner,
author = "Marrero, M\'{o}nica and Urbano, Juli\'{a}n and
S\'{a}nchez-Cuadrado, Sonia and Morato, Jorge and
G\'{o}mez-Berb\'{\i}s, Juan Miguel",
journal = "Computer Standards \& Interfaces",
number = 5,
pages = "482-489",
title = "{Named Entity Recognition: Fallacies, Challenges and
Opportunities}",
volume = 35,
year = 2013
}
@inproceedings{dai-2018-complex-entity,
title = "Recognizing Complex Entity Mentions: A Review and Future
Directions",
author = "Dai, Xiang",
booktitle = "Proceedings of {ACL} 2018, Student Research Workshop",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-3006",
doi = "10.18653/v1/P18-3006",
pages = "37-44",
abstract = "Standard named entity recognizers can effectively recognize
entity mentions that consist of contiguous tokens and do not
overlap with each other. However, in practice, there are many
domains, such as the biomedical domain, in which there are
nested, overlapping, and discontinuous entity mentions. These
complex mentions cannot be directly recognized by
conventional sequence tagging models because they may break
the assumptions based on which sequence tagging techniques
are built. We review the existing methods which are revised
to tackle complex entity mentions and categorize them as
tokenlevel and sentence-level approaches. We then identify
the research gap, and discuss some directions that we are
exploring."
}
@article{goyal-2018-surney-ner,
title = "Recent Named Entity Recognition and Classification
techniques: A systematic review",
journal = "Computer Science Review",
volume = 29,
pages = "21-43",
year = 2018,
issn = "1574-0137",
doi = "https://doi.org/10.1016/j.cosrev.2018.06.001",
url =
"http://www.sciencedirect.com/science/article/pii/S1574013717302782",
author = "Archana Goyal and Vishal Gupta and Manish Kumar",
abstract = "Textual information is becoming available in abundance on the
web, arising the requirement of techniques and tools to
extract the meaningful information. One of such an important
information extraction task is Named Entity Recognition and
Classification. It is the problem of finding the members of
various predetermined classes, such as person, organization,
location, date/time, quantities, numbers etc. The concept of
named entity extraction was first proposed in Sixth Message
Understanding Conference in 1996. Since then, a number of
techniques have been developed by many researchers for
extracting diversity of entities from different languages and
genres of text. Still, there is a growing interest among
research community to develop more new approaches to extract
diverse named entities which are helpful in various natural
language applications. Here we present a survey of
developments and progresses made in Named Entity Recognition
and Classification research."
}
@article{wang-2018-sv-guided-softmax,
author = "Wang, Xiaobo and Wang, Shuo and Zhang, Shifeng and Fu, Tianyu
and Shi, Hailin and Mei, Tao",
title = "Support Vector Guided Softmax Loss for Face Recognition",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1812.11317v1",
abstract = "Face recognition has witnessed significant progresses due to
the advances of deep convolutional neural networks (CNNs),
the central challenge of which, is feature discrimination. To
address it, one group tries to exploit mining-based
strategies (\textit{e.g.}, hard example mining and focal
loss) to focus on the informative examples. The other group
devotes to designing margin-based loss functions
(\textit{e.g.}, angular, additive and additive angular
margins) to increase the feature margin from the perspective
of ground truth class. Both of them have been well-verified
to learn discriminative features. However, they suffer from
either the ambiguity of hard examples or the lack of
discriminative power of other classes. In this paper, we
design a novel loss function, namely support vector guided
softmax loss (SV-Softmax), which adaptively emphasizes the
mis-classified points (support vectors) to guide the
discriminative features learning. So the developed SV-Softmax
loss is able to eliminate the ambiguity of hard examples as
well as absorb the discriminative power of other classes, and
thus results in more discrimiantive features. To the best of
our knowledge, this is the first attempt to inherit the
advantages of mining-based and margin-based losses into one
framework. Experimental results on several benchmarks have
demonstrated the effectiveness of our approach over
state-of-the-arts.",
archivePrefix= "arXiv",
eprint = "1812.11317",
primaryClass = "cs.CV"
}
@inproceedings{pan-2015-unsupervised-el,
title = "Unsupervised Entity Linking with {A}bstract {M}eaning
{R}epresentation",
author = "Pan, Xiaoman and Cassidy, Taylor and Hermjakob, Ulf and Ji,
Heng and Knight, Kevin",
booktitle = "Proceedings of the 2015 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies",
month = may # "{--}" # jun,
year = 2015,
address = "Denver, Colorado",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N15-1119",
doi = "10.3115/v1/N15-1119",
pages = "1130-1139"
}
@inproceedings{banarescu-2013-amr,
title = "{A}bstract {M}eaning {R}epresentation for Sembanking",
author = "Banarescu, Laura and Bonial, Claire and Cai, Shu and
Georgescu, Madalina and Griffitt, Kira and Hermjakob, Ulf and
Knight, Kevin and Koehn, Philipp and Palmer, Martha and
Schneider, Nathan",
booktitle = "Proceedings of the 7th Linguistic Annotation Workshop and
Interoperability with Discourse",
month = aug,
year = 2013,
address = "Sofia, Bulgaria",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W13-2322",
pages = "178-186"
}
@article{wang-2015-faq-based,
author = "Wang, Zhiguo and Ittycheriah, Abraham",
title = "Faq-Based Question Answering Via Word Alignment",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1507.02628v1",
abstract = "In this paper, we propose a novel word-alignment-based method
to solve the FAQ-based question answering task. First, we
employ a neural network model to calculate question
similarity, where the word alignment between two questions is
used for extracting features. Second, we design a
bootstrap-based feature extraction method to extract a small
set of effective lexical features. Third, we propose a
learning-to-rank algorithm to train parameters more suitable
for the ranking tasks. Experimental results, conducted on
three languages (English, Spanish and Japanese), demonstrate
that the question similarity model is more effective than
baseline systems, the sparse features bring 5 \% improvements
on top-1 accuracy, and the learning-to-rank algorithm works
significantly better than the traditional method. We further
evaluate our method on the answer sentence selection
task. Our method outperforms all the previous systems on the
standard TREC data set.",
archivePrefix= "arXiv",
eprint = "1507.02628",
primaryClass = "cs.CL"
}
@inproceedings{song-2007-question-similarity,
title = "Question similarity calculation for FAQ answering",
author = "Song, Wanpeng and Feng, Min and Gu, Naijie and Wenyin, Liu",
booktitle = "Third International Conference on Semantics, Knowledge and
Grid (SKG 2007)",
pages = "298-301",
year = 2007,
organization = "IEEE"
}
@inproceedings{bhardwaj-2016-faq,
title = "Question answering system for frequently asked questions",
author = "Bhardwaj, Divyanshu and Pakray, Partha and Bentham, Jereemi
and Saha, Saurav and Mizoram, NIT and Gelbukh, Alexander",
booktitle = "of the Final Workshop 7 December 2016, Naples",
pages = 129,
year = 2016
}
@article{minaee-2017-similarity-qa,
author = "Minaee, Shervin and Liu, Zhu",
title = "Automatic Question-Answering Using a Deep Similarity Neural
Network",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1708.01713v1",
abstract = "Automatic question-answering is a classical problem in
natural language processing, which aims at designing systems
that can automatically answer a question, in the same way as
a human does. In this work, we propose a deep learning based
model for automatic question-answering. First the questions
and answers are embedded using neural probabilistic
modeling. Then a deep similarity neural network is trained to
find the similarity score of a pair of answer and
question. Then for each question, the best answer is found as
the one with the highest similarity score. We first train
this model on a large-scale public question-answering
database, and then fine-tune it to transfer to the
customer-care chat data. We have also tested our framework on
a public question-answering database and achieved very good
performance.",
archivePrefix= "arXiv",
eprint = "1708.01713",
primaryClass = "cs.CL"
}
@article{sharma-2018-qa-system,
title = "Deep Learning Approaches for Question Answering System",
journal = "Procedia Computer Science",
volume = 132,
pages = "785-794",
year = 2018,
note = "International Conference on Computational Intelligence and
Data Science",
issn = "1877-0509",
doi = "https://doi.org/10.1016/j.procs.2018.05.090",
url =
"http://www.sciencedirect.com/science/article/pii/S1877050918308226",
author = "Yashvardhan Sharma and Sahil Gupta",
keywords = "coattention, deep learning, memory nets, neural networks,
question answering, word vectors",
abstract = "Question Answering (QA) System is very useful as most of the
deep learning related problems can be modeled as a question
answering problem. Consequently, the field is one of the most
researched fields in computer science today. The last few
years have seen considerable developments and improvement in
the state of the art, much of which can be credited to
the rise of Deep Learning. In this paper, a discussion about
various approaches starting from the basic NLP and algorithms
based approach has been done and the paper eventually builds
towards the recently proposed methods of Deep
Learning. Implementation details and various tweaks in the
algorithms that produced better results have also been
discussed. The evaluation of the proposed models was done on
twenty tasks of Facebook's bAbI dataset."
}
@inproceedings{lai-2018-answer-selection,
title = "A Review on Deep Learning Techniques Applied to Answer
Selection",
author = "Lai, Tuan Manh and Bui, Trung and Li, Sheng",
booktitle = "Proceedings of the 27th International Conference on
Computational Linguistics",
month = aug,
year = 2018,
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1181",
pages = "2132-2144",
abstract = "Given a question and a set of candidate answers, answer
selection is the task of identifying which of the candidates
answers the question correctly. It is an important problem in
natural language processing, with applications in many
areas. Recently, many deep learning based methods have been
proposed for the task. They produce impressive performance
without relying on any feature engineering or expensive
external resources. In this paper, we aim to provide a
comprehensive review on deep learning methods applied to
answer selection."
}
@article{feng-2015-answer-selection,
author = "Feng, Minwei and Xiang, Bing and Glass, Michael R. and Wang,
Lidan and Zhou, Bowen",
title = "Applying Deep Learning To Answer Selection: a Study and an
Open Task",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1508.01585v2",
abstract = "We apply a general deep learning framework to address the
non-factoid question answering task. Our approach does not
rely on any linguistic tools and can be applied to different
languages or domains. Various architectures are presented and
compared. We create and release a QA corpus and setup a new
QA task in the insurance domain. Experimental results
demonstrate superior performance compared to the baseline
methods and various technologies give further
improvements. For this highly challenging task, the top-1
accuracy can reach up to 65.3 \% on a test set, which
indicates a great potential for practical use.",
archivePrefix= "arXiv",
eprint = "1508.01585",
primaryClass = "cs.CL"
}
@article{tan-2015-lstm-answer-selection,
author = "Tan, Ming and Santos, Cicero dos and Xiang, Bing and Zhou,
Bowen",
title = "Lstm-Based Deep Learning Models for Non-Factoid Answer
Selection",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1511.04108v4",
abstract = "In this paper, we apply a general deep learning (DL)
framework for the answer selection task, which does not
depend on manually defined features or linguistic tools. The
basic framework is to build the embeddings of questions and
answers based on bidirectional long short-term memory
(biLSTM) models, and measure their closeness by cosine
similarity. We further extend this basic model in two
directions. One direction is to define a more composite
representation for questions and answers by combining
convolutional neural network with the basic framework. The
other direction is to utilize a simple but efficient
attention mechanism in order to generate the answer
representation according to the question context. Several
variations of models are provided. The models are examined by
two datasets, including TREC-QA and InsuranceQA. Experimental
results demonstrate that the proposed models substantially
outperform several strong baselines.",
archivePrefix= "arXiv",
eprint = "1511.04108",
primaryClass = "cs.CL"
}
@inproceedings{wang-2016-inner-attention-answer-selection,
title = "Inner Attention based Recurrent Neural Networks for Answer
Selection",
author = "Wang, Bingning and Liu, Kang and Zhao, Jun",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = 2016,
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P16-1122",
doi = "10.18653/v1/P16-1122",
pages = "1288-1297"
}
@article{wang-2016-compare-aggregate,
author = "Wang, Shuohang and Jiang, Jing",
title = "A Compare-Aggregate Model for Matching Text Sequences",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1611.01747v1",
abstract = "Many NLP tasks including machine comprehension, answer
selection and text entailment require the comparison between
sequences. Matching the important units between sequences is
a key to solve these problems. In this paper, we present a
general ``compare-aggregate'' framework that performs
word-level matching followed by aggregation using
Convolutional Neural Networks. We particularly focus on the
different comparison functions we can use to match two
vectors. We use four different datasets to evaluate the
model. We find that some simple comparison functions based on
element-wise operations can work better than standard neural
network and neural tensor network.",
archivePrefix= "arXiv",
eprint = "1611.01747",
primaryClass = "cs.CL"
}
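
A sketch of the element-wise comparison functions this abstract highlights (the paper's SUB and MULT variants), producing per-word comparison vectors that a CNN would then aggregate; shapes are illustrative.

import numpy as np

def compare_sub(a, b):
    # SUB comparison: element-wise squared difference of aligned vectors.
    return (a - b) ** 2

def compare_mult(a, b):
    # MULT comparison: element-wise product of aligned vectors.
    return a * b

a = np.random.rand(6, 100)     # sentence-1 word vectors
b = np.random.rand(6, 100)     # attention-aligned sentence-2 vectors
t = np.concatenate([compare_sub(a, b), compare_mult(a, b)], axis=-1)
print(t.shape)                  # per-word comparison features for the CNN
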
@article{parikh-2016-decomposable-attention,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160601933P",
archivePrefix= "arXiv",
author = "{Parikh}, A.~P. and {T{\"a}ckstr{\"o}m}, O. and {Das}, D. and
{Uszkoreit}, J.",
eprint = "1606.01933",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL",
title = "{A Decomposable Attention Model for Natural Language
Inference}",
year = 2016
}
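
A NumPy sketch of the attend step in this model: score all token pairs, then softly align each sentence to the other with row/column softmaxes. The learned feed-forward transforms from the paper are omitted here for brevity.

import numpy as np

def softmax(x, axis):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def soft_align(a, b):
    # Unnormalized alignment scores between all token pairs, then each
    # side is softly aligned to (summarized by) the other.
    e = a.dot(b.T)                         # (len_a x len_b) scores
    beta = softmax(e, axis=1).dot(b)      # b summarized for each a_i
    alpha = softmax(e, axis=0).T.dot(a)   # a summarized for each b_j
    return beta, alpha

a, b = np.random.rand(5, 8), np.random.rand(7, 8)
beta, alpha = soft_align(a, b)
print(beta.shape, alpha.shape)             # (5, 8) (7, 8)
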
@article{wang-2017-bimpm,
author = "Zhiguo Wang and Wael Hamza and Radu Florian",
title = "Bilateral Multi-Perspective Matching for Natural Language
Sentences",
journal = "CoRR",
volume = "abs/1702.03814",
year = 2017,
url = "http://arxiv.org/abs/1702.03814",
archivePrefix= "arXiv",
eprint = "1702.03814",
timestamp = "Mon, 13 Aug 2018 16:47:19 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/WangHF17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
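
A sketch of the multi-perspective cosine at the heart of BiMPM's matching operations: each row of a weight matrix re-weights both vectors before a cosine is taken. The number of perspectives is illustrative.

import numpy as np

def multi_perspective_cosine(v1, v2, W):
    # One cosine per perspective: row k of W re-weights the dimensions
    # of both vectors before their cosine similarity is computed.
    p1, p2 = W * v1, W * v2
    num = (p1 * p2).sum(axis=1)
    den = np.linalg.norm(p1, axis=1) * np.linalg.norm(p2, axis=1)
    return num / den

v1, v2 = np.random.rand(100), np.random.rand(100)
W = np.random.rand(8, 100)          # 8 perspectives
print(multi_perspective_cosine(v1, v2, W).shape)   # (8,)
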
@inproceedings{wang-2016-lexical-decomposition-composition,
title = "Sentence Similarity Learning by Lexical Decomposition and
Composition",
author = "Wang, Zhiguo and Mi, Haitao and Ittycheriah, Abraham",
booktitle = "Proceedings of {COLING} 2016, the 26th International
Conference on Computational Linguistics: Technical Papers",
month = dec,
year = 2016,
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://www.aclweb.org/anthology/C16-1127",
pages = "1340-1349",
abstract = "Most conventional sentence similarity methods only focus on
similar parts of two input sentences, and simply ignore the
dissimilar parts, which usually give us some clues and
semantic meanings about the sentences. In this work, we
propose a model to take into account both the similarities
and dissimilarities by decomposing and composing lexical
semantics over sentences. The model represents each word as a
vector, and calculates a semantic matching vector for each
word based on all words in the other sentence. Then, each
word vector is decomposed into a similar component and a
dissimilar component based on the semantic matching
vector. After this, a two-channel CNN model is employed to
capture features by composing the similar and dissimilar
components. Finally, a similarity score is estimated over the
composed feature vectors. Experimental results show that our
model gets the state-of-the-art performance on the answer
sentence selection task, and achieves a comparable result on
the paraphrase identification task."
}
@article{chen-2016-esim,
author = "{Chen}, Qian and {Zhu}, Xiaodan and {Ling}, Zhenhua and
{Wei}, Si and {Jiang}, Hui and {Inkpen}, Diana",
title = "{Enhanced LSTM for Natural Language Inference}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2016,
month = sep,
eid = "arXiv:1609.06038",
pages = "arXiv:1609.06038",
archivePrefix= "arXiv",
eprint = "1609.06038",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160906038C",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
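
A one-function sketch of ESIM's local inference enhancement: the encoded vectors are concatenated with their soft-aligned counterparts plus their element-wise difference and product; dimensions are illustrative.

import numpy as np

def enhance(a, a_aligned):
    # Local inference enhancement: [a; a~; a - a~; a * a~], fed to the
    # composition layer.
    return np.concatenate([a, a_aligned, a - a_aligned, a * a_aligned],
                          axis=-1)

a = np.random.rand(6, 300)            # encoded premise tokens
a_tilde = np.random.rand(6, 300)      # hypothesis soft-aligned to premise
print(enhance(a, a_tilde).shape)       # (6, 1200)
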
@inproceedings{shen-2017-inter-weighted-alignment,
title = "Inter-Weighted Alignment Network for Sentence Pair Modeling",
author = "Shen, Gehui and Yang, Yunlun and Deng, Zhi-Hong",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1122",
doi = "10.18653/v1/D17-1122",
pages = "1179-1189",
abstract = "Sentence pair modeling is a crucial problem in the field of
natural language processing. In this paper, we propose a
model to measure the similarity of a sentence pair focusing
on the interaction information. We utilize the word level
similarity matrix to discover fine-grained alignment of two
sentences. It should be emphasized that each word in a
sentence has a different importance from the perspective of
semantic composition, so we exploit two novel and efficient
strategies to explicitly calculate a weight for each
word. Although the proposed model only uses a sequential LSTM
for sentence modeling without any external resource such as
syntactic parser tree and additional lexicon features,
experimental results show that our model achieves
state-of-the-art performance on three datasets of two tasks."
}
@article{tay-2017-compare-compress-propagate,
author = "Tay, Yi and Tuan, Luu Anh and Hui, Siu Cheung",
title = "Compare, Compress and Propagate: Enhancing Neural
Architectures With Alignment Factorization for Natural
Language Inference",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1801.00102v2",
abstract = "This paper presents a new deep learning architecture for
Natural Language Inference (NLI). Firstly, we introduce a new
architecture where alignment pairs are compared, compressed
and then propagated to upper layers for enhanced
representation learning. Secondly, we adopt factorization
layers for efficient and expressive compression of alignment
vectors into scalar features, which are then used to augment
the base word representations. The design of our approach is
aimed to be conceptually simple, compact and yet powerful. We
conduct experiments on three popular benchmarks, SNLI,
MultiNLI and SciTail, achieving competitive performance on
all. A lightweight parameterization of our model also enjoys
a $\approx 3$ times reduction in parameter size compared to
the existing state-of-the-art models, e.g., ESIM and DIIN,
while maintaining competitive performance. Additionally,
visual analysis shows that our propagated features are highly
interpretable.",
archivePrefix= "arXiv",
eprint = "1801.00102",
primaryClass = "cs.CL"
}
@article{gong-2017-diin,
author = "Yichen Gong and Heng Luo and Jian Zhang",
title = "Natural Language Inference over Interaction Space",
journal = "CoRR",
volume = "abs/1709.04348",
year = 2017,
url = "http://arxiv.org/abs/1709.04348",
archivePrefix= "arXiv",
eprint = "1709.04348",
timestamp = "Mon, 13 Aug 2018 16:47:34 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1709-04348",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{tay-2018-multi-cast-attention,
author = "Tay, Yi and Tuan, Luu Anh and Hui, Siu Cheung",
title = "Multi-Cast Attention Networks for Retrieval-Based Question
Answering and Response Prediction",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1806.00778v1",
abstract = "Attention is typically used to select informative sub-phrases
that are used for prediction. This paper investigates the
novel use of attention as a form of feature augmentation,
i.e, casted attention. We propose Multi-Cast Attention
Networks (MCAN), a new attention mechanism and general model
architecture for a potpourri of ranking tasks in the
conversational modeling and question answering domains. Our
approach performs a series of soft attention operations, each
time casting a scalar feature upon the inner word
embeddings. The key idea is to provide a real-valued hint
(feature) to a subsequent encoder layer and is targeted at
improving the representation learning process. There are
several advantages to this design, e.g., it allows an
arbitrary number of attention mechanisms to be casted,
allowing for multiple attention types (e.g., co-attention,
intra-attention) and attention variants (e.g.,
alignment-pooling, max-pooling, mean-pooling) to be executed
simultaneously. This not only eliminates the costly need to
tune the nature of the co-attention layer, but also provides
greater extents of explainability to practitioners. Via
extensive experiments on four well-known benchmark datasets,
we show that MCAN achieves state-of-the-art performance. On
the Ubuntu Dialogue Corpus, MCAN outperforms existing
state-of-the-art models by $9\%$. MCAN also achieves the best
performing score to date on the well-studied TrecQA dataset.",
archivePrefix= "arXiv",
eprint = "1806.00778",
primaryClass = "cs.CL"
}
@inproceedings{tay-2018-csran,
title = "Co-Stack Residual Affinity Networks with Multi-level
Attention Refinement for Matching Text Sequences",
author = "Tay, Yi and Luu, Anh Tuan and Hui, Siu Cheung",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1479",
doi = "10.18653/v1/D18-1479",
pages = "4492-4502",
abstract = "Learning a matching function between two text sequences is a
long standing problem in NLP research. This task enables many
potential applications such as question answering and
paraphrase identification. This paper proposes Co-Stack
Residual Affinity Networks (CSRAN), a new and universal
neural architecture for this problem. CSRAN is a deep
architecture, involving stacked (multi-layered) recurrent
encoders. Stacked/Deep architectures are traditionally
difficult to train, due to the inherent weaknesses such as
difficulty with feature propagation and vanishing
gradients. CSRAN incorporates two novel components to take
advantage of the stacked architecture. Firstly, it introduces
a new bidirectional alignment mechanism that learns affinity
weights by fusing sequence pairs across stacked
hierarchies. Secondly, it leverages a multi-level attention
refinement component between stacked recurrent layers. The
key intuition is that, by leveraging information across all
network hierarchies, we can not only improve gradient flow
but also improve overall performance. We conduct extensive
experiments on six well-studied text sequence matching
datasets, achieving state-of-the-art performance on all."
}
@inproceedings{tan-2018-multiway-attention-mwan,
title = "Multiway Attention Networks for Modeling Sentence Pairs",
author = "Chuanqi Tan and Furu Wei and Wenhui Wang and Weifeng Lv and
Ming Zhou",
booktitle = "Proceedings of the Twenty-Seventh International Joint
Conference on Artificial Intelligence, {IJCAI-18}",
publisher = "International Joint Conferences on Artificial Intelligence
Organization",
pages = "4411-4417",
year = 2018,
month = jul,
doi = "10.24963/ijcai.2018/613",
url = "https://doi.org/10.24963/ijcai.2018/613"
}
@article{kim-2018-semantic-sentence-matching,
author = "Kim, Seonhoon and Kang, Inho and Kwak, Nojun",
title = "Semantic Sentence Matching With Densely-Connected Recurrent
and Co-Attentive Information",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1805.11360v2",
abstract = "Sentence matching is widely used in various natural language
tasks such as natural language inference, paraphrase
identification, and question answering. For these tasks,
understanding logical and semantic relationship between two
sentences is required but it is yet challenging. Although
attention mechanism is useful to capture the semantic
relationship and to properly align the elements of two
sentences, previous methods of attention mechanism simply use
a summation operation which does not retain original features
enough. Inspired by DenseNet, a densely connected
convolutional network, we propose a densely-connected
co-attentive recurrent neural network, each layer of which
uses concatenated information of attentive features as well
as hidden features of all the preceding recurrent layers. It
enables preserving the original and the co-attentive feature
information from the bottommost word embedding layer to the
uppermost recurrent layer. To alleviate the problem of an
ever-increasing size of feature vectors due to dense
concatenation operations, we also propose to use an
autoencoder after dense concatenation. We evaluate our
proposed architecture on highly competitive benchmark
datasets related to sentence matching. Experimental results
show that our architecture, which retains recurrent and
attentive features, achieves state-of-the-art performances
for most of the tasks.",
archivePrefix= "arXiv",
eprint = "1805.11360",
primaryClass = "cs.CL"
}
@inproceedings{pan-2018-discourse-marker,
title = "Discourse Marker Augmented Network with Reinforcement
Learning for Natural Language Inference",
author = "Pan, Boyuan and Yang, Yazheng and Zhao, Zhou and Zhuang,
Yueting and Cai, Deng and He, Xiaofei",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1091",
doi = "10.18653/v1/P18-1091",
pages = "989-999",
abstract = "Natural Language Inference (NLI), also known as Recognizing
Textual Entailment (RTE), is one of the most important
problems in natural language processing. It requires to infer
the logical relationship between two given sentences. While
current approaches mostly focus on the interaction
architectures of the sentences, in this paper, we propose to
transfer knowledge from some important discourse markers to
augment the quality of the NLI model. We observe that people
usually use some discourse markers such as {``}so{''} or
{``}but{''} to represent the logical relationship between two
sentences. These words potentially have deep connections with
the meanings of the sentences, thus can be utilized to help
improve the representations of them. Moreover, we use
reinforcement learning to optimize a new objective function
with a reward defined by the property of the NLI datasets to
make full use of the labels information. Experiments show
that our method achieves the state-of-the-art performance on
several large-scale datasets."
}
@article{zhang-2018-explicit-contextual-semantics,
author = "Zhang, Zhuosheng and Wu, Yuwei and Li, Zuchao and Zhao, Hai",
title = "Explicit Contextual Semantics for Text Comprehension",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1809.02794v3",
abstract = "Who did what to whom is a major focus in natural language
understanding, which is right the aim of semantic role
labeling (SRL) task. Despite of sharing a lot of processing
characteristics and even task purpose, it is surprisingly
that jointly considering these two related tasks was never
formally reported in previous work. Thus this paper makes the
first attempt to let SRL enhance text comprehension and
inference through specifying verbal predicates and their
corresponding semantic roles. In terms of deep learning
models, our embeddings are enhanced by explicit contextual
semantic role labels for more fine-grained semantics. We show
that the salient labels can be conveniently added to existing
models and significantly improve deep learning models in
challenging text comprehension tasks. Extensive experiments
on benchmark machine reading comprehension and inference
datasets verify that the proposed semantic learning helps our
system reach new state-of-the-art over strong baselines which
have been enhanced by well pretrained language models from
the latest progress.",
archivePrefix= "arXiv",
eprint = "1809.02794",
primaryClass = "cs.CL"
}
@article{leal-taixe-2016-siamese-cnn,
author = "Leal-Taix{\'e}, Laura and Ferrer, Cristian Canton and
Schindler, Konrad",
title = "Learning By Tracking: Siamese Cnn for Robust Target
Association",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1604.07866v3",
abstract = "This paper introduces a novel approach to the task of data
association within the context of pedestrian tracking, by
introducing a two-stage learning scheme to match pairs of
detections. First, a Siamese convolutional neural network
(CNN) is trained to learn descriptors encoding local
spatio-temporal structures between the two input image
patches, aggregating pixel values and optical flow
information. Second, a set of contextual features derived
from the position and size of the compared input patches are
combined with the CNN output by means of a gradient boosting
classifier to generate the final matching probability. This
learning approach is validated by using a linear programming
based multi-person tracker showing that even a simple and
efficient tracker may outperform much more complex models
when fed with our learned matching probabilities. Results on
publicly available sequences show that our method meets
state-of-the-art standards in multiple people tracking.",
archivePrefix= "arXiv",
eprint = "1604.07866",
primaryClass = "cs.LG"
}
@inproceedings{mueller-2016-siamese-lstm,
title = "Siamese recurrent architectures for learning sentence
similarity",
author = "Mueller, Jonas and Thyagarajan, Aditya",
booktitle = "thirtieth AAAI conference on artificial intelligence",
year = 2016
}
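
The similarity head of this siamese architecture is compact enough to show directly: the exponential of the negative Manhattan distance between the twin LSTMs' final states, which lies in (0, 1]. A sketch:

import numpy as np

def malstm_similarity(h_a, h_b):
    # Similarity over the twin LSTMs' final hidden states: exp(-L1).
    return float(np.exp(-np.abs(h_a - h_b).sum()))

print(malstm_similarity(np.array([0.2, 0.9]), np.array([0.1, 1.0])))
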
@article{conneau-2017-infer-sent,
author = "Conneau, Alexis and Kiela, Douwe and Schwenk, Holger and
Barrault, Loic and Bordes, Antoine",
title = "Supervised Learning of Universal Sentence Representations
From Natural Language Inference Data",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1705.02364v5",
abstract = "Many modern NLP systems rely on word embeddings, previously
trained in an unsupervised manner on large corpora, as base
features. Efforts to obtain embeddings for larger chunks of
text, such as sentences, have however not been so
successful. Several attempts at learning unsupervised
representations of sentences have not reached satisfactory
enough performance to be widely adopted. In this paper, we
show how universal sentence representations trained using the
supervised data of the Stanford Natural Language Inference
datasets can consistently outperform unsupervised methods
like SkipThought vectors on a wide range of transfer
tasks. Much like how computer vision uses ImageNet to obtain
features, which can then be transferred to other tasks, our
work tends to indicate the suitability of natural language
inference for transfer learning to other NLP tasks. Our
encoder is publicly available.",
archivePrefix= "arXiv",
eprint = "1705.02364",
primaryClass = "cs.CL"
}
@article{nie-2017-sse,
author = "Nie, Yixin and Bansal, Mohit",
title = "Shortcut-Stacked Sentence Encoders for Multi-Domain
Inference",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1708.02312v2",
abstract = "We present a simple sequential sentence encoder for
multi-domain natural language inference. Our encoder is based
on stacked bidirectional LSTM-RNNs with shortcut connections
and fine-tuning of word embeddings. The overall supervised
model uses the above encoder to encode two input sentences
into two vectors, and then uses a classifier over the vector
combination to label the relationship between these two
sentences as that of entailment, contradiction, or
neutral. Our Shortcut-Stacked sentence encoders achieve strong
improvements over existing encoders on matched and mismatched
multi-domain natural language inference (top non-ensemble
single-model result in the EMNLP RepEval 2017 Shared Task
(Nangia et al., 2017)). Moreover, they achieve the new
state-of-the-art encoding result on the original SNLI dataset
(Bowman et al., 2015).",
archivePrefix= "arXiv",
eprint = "1708.02312",
primaryClass = "cs.CL"
}
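
A sketch of the vector combination commonly fed to the relationship classifier in this encoder family (concatenation, absolute difference, element-wise product); the exact feature set shown here is an assumption for illustration.

import numpy as np

def pair_features(u, v):
    # Combination over the two sentence vectors before the 3-way
    # (entailment / contradiction / neutral) classifier.
    return np.concatenate([u, v, np.abs(u - v), u * v])

u, v = np.random.rand(1024), np.random.rand(1024)
print(pair_features(u, v).shape)       # (4096,)
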
@inproceedings{zhou-2016-multi-view,
title = "Multi-view Response Selection for Human-Computer
Conversation",
author = "Zhou, Xiangyang and Dong, Daxiang and Wu, Hua and Zhao, Shiqi
and Yu, Dianhai and Tian, Hao and Liu, Xuan and Yan, Rui",
booktitle = "Proceedings of the 2016 Conference on Empirical Methods in
Natural Language Processing",
month = nov,
year = 2016,
address = "Austin, Texas",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D16-1036",
doi = "10.18653/v1/D16-1036",
pages = "372-381"
}
@inproceedings{lan-2018-neural-network,
title = "Neural Network Models for Paraphrase Identification, Semantic
Textual Similarity, Natural Language Inference, and Question
Answering",
author = "Lan, Wuwei and Xu, Wei",
booktitle = "Proceedings of the 27th International Conference on
Computational Linguistics",
month = aug,
year = 2018,
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1328",
pages = "3890-3902",
abstract = "In this paper, we analyze several neural network designs (and
their variations) for sentence pair modeling and compare
their performance extensively across eight datasets,
including paraphrase identification, semantic textual
similarity, natural language inference, and question
answering tasks. Although most of these models have claimed
state-of-the-art performance, the original papers often
reported on only one or two selected datasets. We provide a
systematic study and show that (i) encoding contextual
information by LSTM and inter-sentence interactions are
critical, (ii) Tree-LSTM does not help as much as previously
claimed but surprisingly improves performance on Twitter
datasets, (iii) the Enhanced Sequential Inference Model is
the best so far for larger datasets, while the Pairwise Word
Interaction Model achieves the best performance when less
data is available. We release our implementations as an
open-source toolkit."
}
@inproceedings{zhou-2018-dam,
title = "Multi-Turn Response Selection for Chatbots with Deep
Attention Matching Network",
author = "Zhou, Xiangyang and Li, Lu and Dong, Daxiang and Liu, Yi and
Chen, Ying and Zhao, Wayne Xin and Yu, Dianhai and Wu, Hua",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1103",
doi = "10.18653/v1/P18-1103",
pages = "1118-1127",
abstract = "Human generates responses relying on semantic and functional
dependencies, including coreference relation, among dialogue
elements and their context. In this paper, we investigate
matching a response with its multi-turn context using
dependency information based entirely on attention. Our
solution is inspired by the recently proposed Transformer in
machine translation (Vaswani et al., 2017) and we extend the
attention mechanism in two ways. First, we construct
representations of text segments at different granularities
solely with stacked self-attention. Second, we try to extract
the truly matched segment pairs with attention across the
context and response. We jointly introduce those two kinds of
attention in one uniform neural network. Experiments on two
large-scale multi-turn response selection tasks show that our
proposed model significantly outperforms the state-of-the-art
models."
}
@inproceedings{rao-2019-hcan,
title = "Bridging the Gap between Relevance Matching and Semantic
Matching for Short Text Similarity Modeling",
author = "Rao, Jinfeng and Liu, Linqing and Tay, Yi and Yang, Wei and
Shi, Peng and Lin, Jimmy",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1540",
doi = "10.18653/v1/D19-1540",
pages = "5370-5381",
abstract = "A core problem of information retrieval (IR) is relevance
matching, which is to rank documents by relevance to a
user{'}s query. On the other hand, many NLP problems, such as
question answering and paraphrase identification, can be
considered variants of semantic matching, which is to measure
the semantic distance between two pieces of short
texts. While at a high level both relevance and semantic
matching require modeling textual similarity, many existing
techniques for one cannot be easily adapted to the other. To
bridge this gap, we propose a novel model, HCAN (Hybrid
Co-Attention Network), that comprises (1) a hybrid encoder
module that includes ConvNet-based and LSTM-based encoders,
(2) a relevance matching module that measures soft term
matches with importance weighting at multiple granularities,
and (3) a semantic matching module with co-attention
mechanisms that capture context-aware semantic
relatedness. Evaluations on multiple IR and NLP benchmarks
demonstrate state-of-the-art effectiveness compared to
approaches that do not exploit pretraining on external
data. Extensive ablation studies suggest that relevance and
semantic matching signals are complementary across many
problem settings, regardless of the choice of underlying
encoders."
}
@article{cui-2018-cmrc-2018,
author = "Cui, Yiming and Liu, Ting and Che, Wanxiang and Xiao, Li and
Chen, Zhipeng and Ma, Wentao and Wang, Shijin and Hu,
Guoping",
title = "A Span-Extraction Dataset for Chinese Machine Reading
Comprehension",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1810.07366v2",
abstract = "Machine Reading Comprehension (MRC) has become enormously
popular recently and has attracted a lot of
attention. However, the existing reading comprehension
datasets are mostly in English. In this paper, we introduce a
Span-Extraction dataset for Chinese machine reading
comprehension to add language diversities in this area. The
dataset is composed of nearly 20,000 real questions annotated
on Wikipedia paragraphs by human experts. We also annotated a
challenge set which contains the questions that need
comprehensive understanding and multi-sentence inference
throughout the context. We present several baseline systems
as well as anonymous submissions for demonstrating the
difficulties in this dataset. With the release of the
dataset, we hosted the Second Evaluation Workshop on Chinese
Machine Reading Comprehension (CMRC 2018). We hope the
release of the dataset could further accelerate the Chinese
machine reading comprehension research. Resources are
available: https://github.com/ymcui/cmrc2018",
archivePrefix= "arXiv",
eprint = "1810.07366",
primaryClass = "cs.CL"
}
@article{cui-2020-cmrc-2019,
author = "Cui, Yiming and Liu, Ting and Yang, Ziqing and Chen, Zhipeng
and Ma, Wentao and Che, Wanxiang and Wang, Shijin and Hu,
Guoping",
title = "A Sentence Cloze Dataset for Chinese Machine Reading
Comprehension",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2004.03116v1",
abstract = "Owing to the continuous contributions by the Chinese NLP
community, more and more Chinese machine reading
comprehension datasets become available, and they have been
pushing Chinese MRC research forward. To add diversity in
this area, in this paper, we propose a new task called
Sentence Cloze-style Machine Reading Comprehension
(SC-MRC). The proposed task aims to fill the right candidate
sentence into the passage that has several blanks. Moreover,
to add more difficulties, we also made fake candidates that
are similar to the correct ones, which requires the machine
to judge their correctness in the context. The proposed
dataset contains over 100K blanks (questions) within over 10K
passages, which was originated from Chinese narrative
stories. To evaluate the dataset, we implement several
baseline systems based on pre-trained models, and the results
show that the state-of-the-art model still underperforms
human performance by a large margin. We hope the release of
the dataset could further accelerate the machine reading
comprehension research. Resources available:
https://github.com/ymcui/cmrc2019",
archivePrefix= "arXiv",
eprint = "2004.03116",
primaryClass = "cs.CL"
}
@article{munkhdalai-2016-neural-tree-indexers,
author = "Munkhdalai, Tsendsuren and Yu, Hong",
title = "Neural Tree Indexers for Text Understanding",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1607.04492v2",
abstract = "Recurrent neural networks (RNNs) process input text
sequentially and model the conditional transition between
word tokens. In contrast, the advantages of recursive
networks include that they explicitly model the
compositionality and the recursive structure of natural
language. However, the current recursive architecture is
limited by its dependence on syntactic tree. In this paper,
we introduce a robust syntactic parsing-independent tree
structured model, Neural Tree Indexers (NTI) that provides a
middle ground between the sequential RNNs and the syntactic
tree-based recursive models. NTI constructs a full n-ary tree
by processing the input text with its node function in a
bottom-up fashion. An attention mechanism can then be applied
to both structure and node function. We implemented and
evaluated a binary-tree model of NTI, showing the model
achieved the state-of-the-art performance on three different
NLP tasks: natural language inference, answer sentence
selection, and sentence classification, outperforming
state-of-the-art recurrent and recursive neural networks.",
archivePrefix= "arXiv",
eprint = "1607.04492",
primaryClass = "cs.CL"
}
@inproceedings{grandvalet-2004-entropy-minimization,
author = "Grandvalet, Yves and Bengio, Yoshua",
title = "Semi-Supervised Learning by Entropy Minimization",
year = 2004,
publisher = "MIT Press",
address = "Cambridge, MA, USA",
booktitle = "Proceedings of the 17th International Conference on Neural
Information Processing Systems",
pages = "529-536",
numpages = 8,
location = "Vancouver, British Columbia, Canada",
series = "NIPS’04"
}
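
The unlabeled-data term of this semi-supervised objective reduces to a few lines; a sketch assuming rows of predicted class probabilities, with the weighting coefficient left to the caller.

import numpy as np

def entropy_penalty(probs, eps=1e-12):
    # Mean Shannon entropy of predicted class distributions on the
    # unlabeled examples; adding lambda * this term to the supervised
    # loss encourages confident (low-entropy) predictions.
    return float(-(probs * np.log(probs + eps)).sum(axis=1).mean())

print(entropy_penalty(np.array([[0.9, 0.1],
                                [0.5, 0.5]])))   # confident vs. uncertain
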
@inproceedings{duan-2017-qg-for-qa,
title = "Question Generation for Question Answering",
author = "Duan, Nan and Tang, Duyu and Chen, Peng and Zhou, Ming",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1090",
doi = "10.18653/v1/D17-1090",
pages = "866-874",
abstract = "This paper presents how to generate questions from given
passages using neural networks, where large scale QA pairs
are automatically crawled and processed from Community-QA
website, and used as training data. The contribution of the
paper is 2-fold: First, two types of question generation
approaches are proposed, one is a retrieval-based method
using convolution neural network (CNN), the other is a
generation-based method using recurrent neural network (RNN);
Second, we show how to leverage the generated questions to
improve existing question answering systems. We evaluate our
question generation method for the answer sentence selection
task on three benchmark datasets, including SQuAD, MS MARCO,
and WikiQA. Experimental results show that, by using
generated questions as an extra signal, significant QA
improvement can be achieved."
}
@inproceedings{hadsell-2006-contrastive-loss,
title = "Dimensionality reduction by learning an invariant mapping",
author = "Hadsell, Raia and Chopra, Sumit and LeCun, Yann",
booktitle = "2006 IEEE Computer Society Conference on Computer Vision and
Pattern Recognition (CVPR'06)",
volume = 2,
pages = "1735-1742",
year = 2006,
organization = "IEEE"
}
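
The pairwise objective introduced in this paper is easy to state in code; a sketch using the y = 1 for similar convention and omitting the paper's constant factor of 1/2.

import numpy as np

def contrastive_loss(dist, y, margin=1.0):
    # Similar pairs (y = 1) are pulled together; dissimilar pairs
    # (y = 0) are pushed apart until they clear the margin.
    return float(np.mean(y * dist ** 2 +
                         (1 - y) * np.maximum(0.0, margin - dist) ** 2))

print(contrastive_loss(np.array([0.2, 1.4]), np.array([1, 0])))
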
@article{wieting-2019-no-training-required,
author = "Wieting, John and Kiela, Douwe",
title = "No Training Required: Exploring Random Encoders for Sentence
Classification",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1901.10444v1",
abstract = "We explore various methods for computing sentence
representations from pre-trained word embeddings without any
training, i.e., using nothing but random
parameterizations. Our aim is to put sentence embeddings on
more solid footing by 1) looking at how much modern sentence
embeddings gain over random methods---as it turns out,
surprisingly little; and by 2) providing the field with more
appropriate baselines going forward---which are, as it turns
out, quite strong. We also make important observations about
proper experimental protocol for sentence classification
evaluation, together with recommendations for future
research.",
archivePrefix= "arXiv",
eprint = "1901.10444",
primaryClass = "cs.CL"
}
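
In the spirit of the random baselines this abstract describes, a sketch that projects (stand-in) pre-trained word vectors through a fixed random matrix and max-pools, with no training anywhere; dimensions and the initialization range are illustrative.

import numpy as np

def random_projection_encoder(word_vecs, dim_out=1024, seed=0):
    # Bag-of-random-embedding-projections style baseline: a fixed
    # random matrix projects pre-trained word vectors, and max pooling
    # over tokens yields the sentence vector. Nothing is trained.
    rng = np.random.default_rng(seed)
    d_in = word_vecs.shape[1]
    scale = 1.0 / np.sqrt(d_in)
    W = rng.uniform(-scale, scale, size=(d_in, dim_out))
    return word_vecs.dot(W).max(axis=0)

sent = np.random.rand(7, 300)          # stand-in for pre-trained embeddings
print(random_projection_encoder(sent).shape)    # (1024,)
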
@inproceedings{mohtarami-2016-sls-semeval-task,
title = "{SLS} at {S}em{E}val-2016 Task 3: Neural-based Approaches for
Ranking in Community Question Answering",
author = "Mohtarami, Mitra and Belinkov, Yonatan and Hsu, Wei-Ning and
Zhang, Yu and Lei, Tao and Bar, Kfir and Cyphers, Scott and
Glass, Jim",
booktitle = "Proceedings of the 10th International Workshop on Semantic
Evaluation ({S}em{E}val-2016)",
month = jun,
year = 2016,
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/S16-1128",
doi = "10.18653/v1/S16-1128",
pages = "828-835"
}
@inproceedings{romeo-2016-neural-attention,
title = "Neural Attention for Learning to Rank Questions in Community
Question Answering",
author = "Romeo, Salvatore and Da San Martino, Giovanni and
Barr{\'o}n-Cede{\~n}o, Alberto and Moschitti, Alessandro and
Belinkov, Yonatan and Hsu, Wei-Ning and Zhang, Yu and
Mohtarami, Mitra and Glass, James",
booktitle = "Proceedings of {COLING} 2016, the 26th International
Conference on Computational Linguistics: Technical Papers",
month = dec,
year = 2016,
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://www.aclweb.org/anthology/C16-1163",
pages = "1734-1745",
abstract = "In real-world data, e.g., from Web forums, text is often
contaminated with redundant or irrelevant content, which
leads to introducing noise in machine learning algorithms. In
this paper, we apply Long Short-Term Memory networks with an
attention mechanism, which can select important parts of text
for the task of similar question retrieval from community
Question Answering (cQA) forums. In particular, we use the
attention weights for both selecting entire sentences and
their subparts, i.e., word/chunk, from shallow syntactic
trees. More interestingly, we apply tree kernels to the
filtered text representations, thus exploiting the implicit
features of the subtree space for learning question
reranking. Our results show that the attention-based pruning
allows for achieving the top position in the cQA challenge of
SemEval 2016, with a relatively large gap from the other
participants while greatly decreasing running time."
}
@inproceedings{nassif-2016-learning-semantic-relatedness,
title = "Learning Semantic Relatedness in Community Question Answering
Using Neural Models",
author = "Nassif, Henry and Mohtarami, Mitra and Glass, James",
booktitle = "Proceedings of the 1st Workshop on Representation Learning
for {NLP}",
month = aug,
year = 2016,
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W16-1616",
doi = "10.18653/v1/W16-1616",
pages = "137-147"
}
@inproceedings{nakov-2016-semeval-task,
title = "{S}em{E}val-2016 Task 3: Community Question Answering",
author = "Nakov, Preslav and M{\`a}rquez, Llu{\'\i}s and Moschitti,
Alessandro and Magdy, Walid and Mubarak, Hamdy and Freihat,
Abed Alhakim and Glass, Jim and Randeree, Bilal",
booktitle = "Proceedings of the 10th International Workshop on Semantic
Evaluation ({S}em{E}val-2016)",
month = jun,
year = 2016,
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/S16-1083",
doi = "10.18653/v1/S16-1083",
pages = "525-545"
}
@inproceedings{belinkov-2015-vectorslu,
title = "{V}ector{SLU}: A Continuous Word Vector Approach to Answer
Selection in Community Question Answering Systems",
author = "Belinkov, Yonatan and Mohtarami, Mitra and Cyphers, Scott and
Glass, James",
booktitle = "Proceedings of the 9th International Workshop on Semantic
Evaluation ({S}em{E}val 2015)",
month = jun,
year = 2015,
address = "Denver, Colorado",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/S15-2048",
doi = "10.18653/v1/S15-2048",
pages = "282-287"
}
@inproceedings{nakov-2015-semeval-task,
title = "{S}em{E}val-2015 Task 3: Answer Selection in Community
Question Answering",
author = "Nakov, Preslav and M{\`a}rquez, Llu{\'\i}s and Magdy, Walid
and Moschitti, Alessandro and Glass, Jim and Randeree, Bilal",
booktitle = "Proceedings of the 9th International Workshop on Semantic
Evaluation ({S}em{E}val 2015)",
month = jun,
year = 2015,
address = "Denver, Colorado",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/S15-2047",
doi = "10.18653/v1/S15-2047",
pages = "269-281"
}
@article{loshchilov-2017-adamw,
author = "Loshchilov, Ilya and Hutter, Frank",
title = "Decoupled Weight Decay Regularization",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1711.05101v3",
abstract = "L$_2$ regularization and weight decay regularization are
equivalent for standard stochastic gradient descent (when
rescaled by the learning rate), but as we demonstrate this is
\emph{not} the case for adaptive gradient algorithms, such as
Adam. While common implementations of these algorithms employ
L$_2$ regularization (often calling it ``weight decay'' in
what may be misleading due to the inequivalence we expose),
we propose a simple modification to recover the original
formulation of weight decay regularization by
\emph{decoupling} the weight decay from the optimization
steps taken w.r.t. the loss function. We provide empirical
evidence that our proposed modification (i) decouples the
optimal choice of weight decay factor from the setting of the
learning rate for both standard SGD and Adam and (ii)
substantially improves Adam's generalization performance,
allowing it to compete with SGD with momentum on image
classification datasets (on which it was previously typically
outperformed by the latter). Our proposed decoupled weight
decay has already been adopted by many researchers, and the
community has implemented it in TensorFlow and PyTorch; the
complete source code for our experiments is available at
https://github.com/loshchil/AdamW-and-SGDW",
archivePrefix= "arXiv",
eprint = "1711.05101",
primaryClass = "cs.LG"
}
@article{wang-2014-hashining-similarity-search,
author = "Wang, Jingdong and Shen, Heng Tao and Song, Jingkuan and Ji,
Jianqiu",
title = "Hashing for Similarity Search: a Survey",
journal = "CoRR",
year = 2014,
url = "http://arxiv.org/abs/1408.2927v1",
abstract = "Similarity search (nearest neighbor search) is a problem of
pursuing the data items whose distances to a query item are
the smallest from a large database. Various methods have
been developed to address this problem, and recently a lot of
efforts have been devoted to approximate search. In this
paper, we present a survey on one of the main solutions,
hashing, which has been widely studied since the pioneering
work locality sensitive hashing. We divide the hashing
algorithms into two main categories: locality sensitive
hashing, which designs hash functions without exploring the
data distribution, and learning to hash, which learns hash
functions according to the data distribution, and review them
from various aspects, including hash function design and
distance measure and search scheme in the hash coding space.",
archivePrefix= "arXiv",
eprint = "1408.2927",
primaryClass = "cs.DS"
}
@inproceedings{yang-2008-ilp,
title = "An Entity-Mention Model for Coreference Resolution with
Inductive Logic Programming",
author = "Yang, Xiaofeng and Su, Jian and Lang, Jun and Tan, Chew Lim
and Liu, Ting and Li, Sheng",
booktitle = "Proceedings of ACL-08: HLT",
month = jun,
year = 2008,
address = "Columbus, Ohio",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P08-1096",
pages = "843-851"
}
@article{cui-2019-kbqa,
author = "Cui, Wanyun and Xiao, Yanghua and Wang, Haixun and Song,
Yangqiu and Hwang, Seung-won and Wang, Wei",
title = "Kbqa: Learning Question Answering Over Qa Corpora and
Knowledge Bases",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1903.02419v1",
abstract = "Question answering (QA) has become a popular way for humans
to access billion-scale knowledge bases. Unlike web search,
QA over a knowledge base gives out accurate and concise
results, provided that natural language questions can be
understood and mapped precisely to structured queries over
the knowledge base. The challenge, however, is that a human
can ask one question in many different ways. Previous
approaches have natural limits due to their representations:
rule based approaches only understand a small set of
``canned`` questions, while keyword based or synonym based
approaches cannot fully understand the questions. In this
paper, we design a new kind of question representation:
templates, over a billion scale knowledge base and a million
scale QA corpora. For example, for questions about a city's
population, we learn templates such as What's the population
of $city?, How many people are there in $city?. We learned 27
million templates for 2782 intents. Based on these templates,
our QA system KBQA effectively supports binary factoid
questions, as well as complex questions which are composed of
a series of binary factoid questions. Furthermore, we expand
predicates in RDF knowledge base, which boosts the coverage
of knowledge base by 57 times. Our QA system beats all other
state-of-the-art works on both effectiveness and efficiency over
QALD benchmarks.",
archivePrefix= "arXiv",
eprint = "1903.02419",
primaryClass = "cs.CL"
}
@article{bordes-2014-open-qa,
author = "Bordes, Antoine and Weston, Jason and Usunier, Nicolas",
title = "Open Question Answering With Weakly Supervised Embedding
Models",
journal = "CoRR",
year = 2014,
url = "http://arxiv.org/abs/1404.4326v1",
abstract = "Building computers able to answer questions on any subject is
a long standing goal of artificial intelligence. Promising
progress has recently been achieved by methods that learn to
map questions to logical forms or database queries. Such
approaches can be effective, but at the cost of either large
amounts of human-labeled data or lexicons and
grammars tailored by practitioners. In this paper, we instead
take the radical approach of learning to map questions to
vectorial feature representations. By mapping answers into
the same space one can query any knowledge base independent
of its schema, without requiring any grammar or lexicon. Our
method is trained with a new optimization procedure combining
stochastic gradient descent followed by a fine-tuning step
using the weak supervision provided by blending automatically
and collaboratively generated resources. We empirically
demonstrate that our model can capture meaningful signals
from its noisy supervision leading to major improvements over
paralex, the only existing method able to be trained on
similar weakly labeled data.",
archivePrefix= "arXiv",
eprint = "1404.4326",
primaryClass = "cs.CL"
}
@inproceedings{zhao-2011-auto-qg,
title = "Automatically Generating Questions from Queries for
Community-based Question Answering",
author = "Zhao, Shiqi and Wang, Haifeng and Li, Chao and Liu, Ting and
Guan, Yi",
booktitle = "Proceedings of 5th International Joint Conference on Natural
Language Processing",
month = nov,
year = 2011,
address = "Chiang Mai, Thailand",
publisher = "Asian Federation of Natural Language Processing",
url = "https://www.aclweb.org/anthology/I11-1104",
pages = "929-937"
}
@article{yuan-2017-neural-qg,
author = "Yuan, Xingdi and Wang, Tong and Gulcehre, Caglar and Sordoni,
Alessandro and Bachman, Philip and Subramanian, Sandeep and
Zhang, Saizheng and Trischler, Adam",
title = "Machine Comprehension By Text-To-Text Neural Question
Generation",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1705.02012v2",
abstract = "We propose a recurrent neural model that generates
natural-language questions from documents, conditioned on
answers. We show how to train the model using a combination
of supervised and reinforcement learning. After teacher
forcing for standard maximum likelihood training, we
fine-tune the model using policy gradient techniques to
maximize several rewards that measure question quality. Most
notably, one of these rewards is the performance of a
question-answering system. We motivate question generation as
a means to improve the performance of question answering
systems. Our model is trained and evaluated on the recent
question-answering dataset SQuAD.",
archivePrefix= "arXiv",
eprint = "1705.02012",
primaryClass = "cs.CL"
}
@article{subramanian-2017-neural-qg,
author = "Subramanian, Sandeep and Wang, Tong and Yuan, Xingdi and
Zhang, Saizheng and Bengio, Yoshua and Trischler, Adam",
title = "Neural Models for Key Phrase Detection and Question
Generation",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1706.04560v3",
abstract = "We propose a two-stage neural model to tackle question
generation from documents. First, our model estimates the
probability that word sequences in a document are ones that a
human would pick when selecting candidate answers by training
a neural key-phrase extractor on the answers in a
question-answering corpus. Predicted key phrases then act as
target answers and condition a sequence-to-sequence
question-generation model with a copy mechanism.
Empirically, our key-phrase extraction model significantly
outperforms an entity-tagging baseline and existing
rule-based approaches. We further demonstrate that our
question generation system formulates fluent, answerable
questions from key phrases. This two-stage system could be
used to augment or generate reading comprehension datasets,
which may be leveraged to improve machine reading systems or
in educational settings.",
archivePrefix= "arXiv",
eprint = "1706.04560",
primaryClass = "cs.CL"
}
@inproceedings{rao-2019-gan-qg,
title = "{A}nswer-based {A}dversarial {T}raining for {G}enerating
{C}larification {Q}uestions",
author = "Rao, Sudha and Daum{\'e} III, Hal",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long and Short
Papers)",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N19-1013",
doi = "10.18653/v1/N19-1013",
pages = "143-155",
abstract = "We present an approach for generating clarification questions
with the goal of eliciting new information that would make
the given textual context more complete. We propose that
modeling hypothetical answers (to clarification questions) as
latent variables can guide our approach into generating more
useful clarification questions. We develop a Generative
Adversarial Network (GAN) where the generator is a
sequence-to-sequence model and the discriminator is a utility
function that models the value of updating the context with
the answer to the clarification question. We evaluate on two
datasets, using both automatic metrics and human judgments of
usefulness, specificity and relevance, showing that our
approach outperforms both a retrieval-based model and
ablations that exclude the utility model and the adversarial
training."
}
@inproceedings{heilman-2010-good-question,
title = "Good question! statistical ranking for question generation",
author = "Heilman, Michael and Smith, Noah A",
booktitle = "Human Language Technologies: The 2010 Annual Conference of
the North American Chapter of the Association for
Computational Linguistics",
pages = "609-617",
year = 2010,
organization = "Association for Computational Linguistics"
}
@article{tang-2017-qa-qg-dual-task,
author = "Tang, Duyu and Duan, Nan and Qin, Tao and Yan, Zhao and Zhou,
Ming",
title = "Question Answering and Question Generation As Dual Tasks",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1706.02027v2",
abstract = "We study the problem of joint question answering (QA) and
question generation (QG) in this paper. Our intuition is
that QA and QG have intrinsic connections and these two tasks
could improve each other. On one side, the QA model judges
whether the generated question of a QG model is relevant to
the answer. On the other side, the QG model provides the
probability of generating a question given the answer, which
is useful evidence that in turn facilitates QA. In this
paper we regard QA and QG as dual tasks. We propose a
training framework that trains the models of QA and QG
simultaneously, and explicitly leverages their probabilistic
correlation to guide the training process of both models. We
implement a QG model based on sequence-to-sequence learning,
and a QA model based on recurrent neural network. As all the
components of the QA and QG models are differentiable, all
the parameters involved in these two models could be
conventionally learned with back propagation. We conduct
experiments on three datasets. Empirical results show that
our training framework improves both QA and QG tasks. The
improved QA model performs comparably with strong baseline
approaches on all three datasets.",
archivePrefix= "arXiv",
eprint = "1706.02027",
primaryClass = "cs.CL"
}
@article{wang-2017-joint-qa-qg,
author = "Wang, Tong and Yuan, Xingdi and Trischler, Adam",
title = "A Joint Model for Question Answering and Question Generation",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1706.01450v1",
abstract = "We propose a generative machine comprehension model that
learns jointly to ask and answer questions based on
documents. The proposed model uses a sequence-to-sequence
framework that encodes the document and generates a question
(answer) given an answer (question). Significant improvement
in model performance is observed empirically on the SQuAD
corpus, confirming our hypothesis that the model benefits
from jointly learning to perform both tasks. We believe the
joint model's novelty offers a new perspective on machine
comprehension beyond architectural engineering, and serves as
a first step towards autonomous information seeking.",
archivePrefix= "arXiv",
eprint = "1706.01450",
primaryClass = "cs.CL"
}
@article{yang-2017-qa-dan,
author = "Yang, Zhilin and Hu, Junjie and Salakhutdinov, Ruslan and
Cohen, William W.",
title = "Semi-Supervised Qa With Generative Domain-Adaptive Nets",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1702.02206v2",
abstract = "We study the problem of semi-supervised question
answering: utilizing unlabeled text to boost the
performance of question answering models. We propose a novel
training framework, the Generative Domain-Adaptive Nets. In
this framework, we train a generative model to generate
questions based on the unlabeled text, and combine
model-generated questions with human-generated questions for
training question answering models. We develop novel domain
adaptation algorithms, based on reinforcement learning, to
alleviate the discrepancy between the model-generated data
distribution and the human-generated data
distribution. Experiments show that our proposed framework
obtains substantial improvement from unlabeled text.",
archivePrefix= "arXiv",
eprint = "1702.02206",
primaryClass = "cs.CL"
}
@article{jegou-2011-product-quantization,
author = "Jegou, Herve and Douze, Matthijs and Schmid, Cordelia",
title = "Product Quantization for Nearest Neighbor Search",
year = 2011,
issue_date = "January 2011",
publisher = "IEEE Computer Society",
address = "USA",
volume = 33,
number = 1,
issn = "0162-8828",
url = "https://doi.org/10.1109/TPAMI.2010.57",
doi = "10.1109/TPAMI.2010.57",
journal = "IEEE Trans. Pattern Anal. Mach. Intell.",
month = jan,
pages = "117–128",
numpages = 12,
keywords = "High-dimensional indexing, image indexing, very large
databases, High-dimensional indexing, image indexing, very
large databases, approximate search., approximate search."
}
@article{tay-2018-csran,
author = "Tay, Yi and Tuan, Luu Anh and Hui, Siu Cheung",
title = "Co-Stack Residual Affinity Networks With Multi-Level
Attention Refinement for Matching Text Sequences",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1810.02938v1",
abstract = "Learning a matching function between two text sequences is a
long standing problem in NLP research. This task enables many
potential applications such as question answering and
paraphrase identification. This paper proposes Co-Stack
Residual Affinity Networks (CSRAN), a new and universal
neural architecture for this problem. CSRAN is a deep
architecture, involving stacked (multi-layered) recurrent
encoders. Stacked/Deep architectures are traditionally
difficult to train, due to the inherent weaknesses such as
difficulty with feature propagation and vanishing
gradients. CSRAN incorporates two novel components to take
advantage of the stacked architecture. Firstly, it introduces
a new bidirectional alignment mechanism that learns affinity
weights by fusing sequence pairs across stacked
hierarchies. Secondly, it leverages a multi-level attention
refinement component between stacked recurrent layers. The
key intuition is that, by leveraging information across all
network hierarchies, we can not only improve gradient flow
but also improve overall performance. We conduct extensive
experiments on six well-studied text sequence matching
datasets, achieving state-of-the-art performance on all.",
archivePrefix= "arXiv",
eprint = "1810.02938",
primaryClass = "cs.CL"
}
@inproceedings{he-2016-pairwise-word-interaction,
title = "Pairwise Word Interaction Modeling with Deep Neural Networks
for Semantic Similarity Measurement",
author = "He, Hua and Lin, Jimmy",
booktitle = "Proceedings of the 2016 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies",
month = jun,
year = 2016,
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N16-1108",
doi = "10.18653/v1/N16-1108",
pages = "937-948"
}
@article{zhang-2020-soft-masked-bert,
author = "Zhang, Shaohua and Huang, Haoran and Liu, Jicong and Li,
Hang",
title = "Spelling Error Correction With Soft-Masked Bert",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2005.07421v1",
abstract = "Spelling error correction is an important yet challenging
task because a satisfactory solution of it essentially needs
human-level language understanding ability. Without loss of
generality we consider Chinese spelling error correction
(CSC) in this paper. A state-of-the-art method for the task
selects a character from a list of candidates for correction
(including non-correction) at each position of the sentence
on the basis of BERT, the language representation model. The
accuracy of the method can be sub-optimal, however, because
BERT does not have sufficient capability to detect whether
there is an error at each position, apparently due to the way
of pre-training it using masked language modeling. In this
work, we propose a novel neural architecture to address the
aforementioned issue, which consists of a network for error
detection and a network for error correction based on BERT,
with the former being connected to the latter with what we
call soft-masking technique. Our method of using
`Soft-Masked BERT' is general, and it may be employed in
other language detection-correction problems. Experimental
results on two datasets demonstrate that the performance of
our proposed method is significantly better than the
baselines including the one solely based on BERT.",
archivePrefix= "arXiv",
eprint = "2005.07421",
primaryClass = "cs.CL"
}
@inproceedings{sarikaya-2016-cortana,
title = "An overview of end-to-end language understanding and dialog
management for personal digital assistants",
author = "Sarikaya, Ruhi and Crook, Paul A and Marin, Alex and Jeong,
Minwoo and Robichaud, Jean-Philippe and Celikyilmaz, Asli and
Kim, Young-Bum and Rochette, Alexandre and Khan, Omar Zia and
Liu, Xiaohu and others",
booktitle = "2016 ieee spoken language technology workshop (slt)",
pages = "391-397",
year = 2016,
organization = "IEEE"
}
@article{williams-2017-hcn,
author = "Williams, Jason D. and Asadi, Kavosh and Zweig, Geoffrey",
title = "Hybrid Code Networks: Practical and Efficient End-To-End
Dialog Control With Supervised and Reinforcement Learning",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1702.03274v2",
abstract = "End-to-end learning of recurrent neural networks (RNNs) is an
attractive solution for dialog systems; however, current
techniques are data-intensive and require thousands of
dialogs to learn simple behaviors. We introduce Hybrid Code
Networks (HCNs), which combine an RNN with domain-specific
knowledge encoded as software and system action
templates. Compared to existing end-to-end approaches, HCNs
considerably reduce the amount of training data required,
while retaining the key benefit of inferring a latent
representation of dialog state. In addition, HCNs can be
optimized with supervised learning, reinforcement learning,
or a mixture of both. HCNs attain state-of-the-art
performance on the bAbI dialog dataset, and outperform two
commercially deployed customer-facing dialog systems.",
archivePrefix= "arXiv",
eprint = "1702.03274",
primaryClass = "cs.AI"
}
@article{anh-2017-hybrid-bi-lstm-crf,
author = "Anh, L. T. and Arkhipov, M. Y. and Burtsev, M. S.",
title = "Application of a Hybrid Bi-Lstm-Crf Model To the Task of
Russian Named Entity Recognition",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1709.09686v2",
abstract = "Named Entity Recognition (NER) is one of the most common
tasks of the natural language processing. The purpose of NER
is to find and classify tokens in text documents into
predefined categories called tags, such as person names,
quantity expressions, percentage expressions, names of
locations, organizations, as well as expression of time,
currency and others. Although a number of approaches have
been proposed for this task in the Russian language, there is
still substantial potential for better
solutions. In this work, we studied several deep neural
network models starting from vanilla Bi-directional Long
Short-Term Memory (Bi-LSTM) then supplementing it with
Conditional Random Fields (CRF) as well as highway networks
and finally adding external word embeddings. All models were
evaluated across three datasets: Gareev's dataset,
Person-1000, FactRuEval-2016. We found that extension of
Bi-LSTM model with CRF significantly increased the quality of
predictions. Encoding input tokens with external word
embeddings reduced training time and allowed us to achieve state
of the art for the Russian NER task.",
archivePrefix= "arXiv",
eprint = "1709.09686",
primaryClass = "cs.CL"
}
@article{le-2019-deep-neural,
author = "Lê, Thế Anh",
year = 2019,
month        = feb,
title        = "A Deep Neural Network Model for the Task of Named Entity
                 Recognition",
volume = 9,
journal = "International Journal of Machine Learning and Computing",
doi = "10.18178/ijmlc.2019.9.1.758"
}
@inproceedings{le-2020-sla-to-sbd,
author = "Le, The Anh",
title = "Sequence Labeling Approach to the Task of Sentence Boundary
Detection",
year = 2020,
isbn = 9781450376310,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/3380688.3380703",
doi = "10.1145/3380688.3380703",
booktitle = "Proceedings of the 4th International Conference on Machine
Learning and Soft Computing",
pages = "144–148",
numpages = 5,
keywords = "voice-enabled chatbot, sequence labeling, Sentence boundary
detection",
location = "Haiphong City, Viet Nam",
series = "ICMLSC 2020"
}
@article{gao-2018-neural-conversation,
author = "Gao, Jianfeng and Galley, Michel and Li, Lihong",
title = "Neural Approaches To Conversational Ai",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1809.08267v3",
abstract = "The present paper surveys neural approaches to conversational
AI that have been developed in the last few years. We group
conversational systems into three categories: (1) question
answering agents, (2) task-oriented dialogue agents, and (3)
chatbots. For each category, we present a review of
state-of-the-art neural approaches, draw the connection
between them and traditional approaches, and discuss the
progress that has been made and challenges still being faced,
using specific systems and models as case studies.",
archivePrefix= "arXiv",
eprint = "1809.08267",
primaryClass = "cs.CL"
}
@inproceedings{kurata-2016-sentence-level-slot-filling,
title = "Leveraging Sentence-level Information with Encoder {LSTM} for
Semantic Slot Filling",
author = "Kurata, Gakuto and Xiang, Bing and Zhou, Bowen and Yu, Mo",
booktitle = "Proceedings of the 2016 Conference on Empirical Methods in
Natural Language Processing",
month = nov,
year = 2016,
address = "Austin, Texas",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D16-1223",
doi = "10.18653/v1/D16-1223",
pages = "2077-2083"
}
@article{jaech-2016-da-for-nlu,
author = "Jaech, Aaron and Heck, Larry and Ostendorf, Mari",
title = "Domain Adaptation of Recurrent Neural Networks for Natural
Language Understanding",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1604.00117v2",
abstract = "The goal of this paper is to use multi-task learning to
efficiently scale slot filling models for natural language
understanding to handle multiple target tasks or domains. The
key to scalability is reducing the amount of training data
needed to learn a model for a new task. The proposed
multi-task model delivers better performance with less data
by leveraging patterns that it learns from the other
tasks. The approach supports an open vocabulary, which allows
the models to generalize to unseen words, which is
particularly important when very little training data is
used. A newly collected crowd-sourced data set, covering four
different domains, is used to demonstrate the effectiveness
of the domain adaptation and open vocabulary techniques.",
archivePrefix= "arXiv",
eprint = "1604.00117",
primaryClass = "cs.CL"
}
@inproceedings{tafforeau-2016-multitask-slu,
title = "Joint Syntactic and Semantic Analysis with a Multitask Deep
Learning Framework for Spoken Language Understanding",
author = "J{\'e}r{\'e}mie Tafforeau and Fr{\'e}d{\'e}ric B{\'e}chet and
Thierry Arti{\`e}res and Beno{\^i}t Favre",
booktitle = "INTERSPEECH",
year = 2016
}
@InProceedings{hakkani-tur-2016-joint-semantic-frame-parsing,
author = "Hakkani-Tür, Dilek and Tur, Gokhan and Celikyilmaz, Asli and
Chen, Yun-Nung Vivian and Gao, Jianfeng and Deng, Li and
Wang, Ye-Yi",
title = "Multi-Domain Joint Semantic Frame Parsing using
Bi-directional RNN-LSTM",
booktitle = "Proceedings of The 17th Annual Meeting of the International
Speech Communication Association (INTERSPEECH 2016)",
year = 2016,
month = "June",
abstract = "Sequence-to-sequence deep learning has recently emerged as a
new paradigm in supervised learning for spoken language
understanding. However, most of the previous studies explored
this framework for building single domain models for each
task, such as slot filling or domain classification,
comparing deep learning based approaches with conventional
ones like conditional random fields. This paper proposes a
holistic multi-domain, multi-task (i.e. slot filling, domain
and intent detection) modeling approach to estimate complete
semantic frames for all user utterances addressed to a
conversational system, demonstrating the distinctive power of
deep learning methods, namely bi-directional recurrent neural
network (RNN) with long-short term memory (LSTM) cells
(RNN-LSTM) to handle such complexity. The contributions of
the presented work are three-fold: (i) we propose an RNN-LSTM
architecture for joint modeling of slot filling, intent
determination, and domain classification; (ii) we build a
joint multi-domain model enabling multi-task deep learning
where the data from each domain reinforces each other; (iii)
we investigate alternative architectures for modeling lexical
context in spoken language understanding. In addition to the
simplicity of the single model framework, experimental
results show the power of such an approach on Microsoft
Cortana real user data over alternative methods based on
single domain/task deep learning.",
publisher = "ISCA",
url          = "https://www.microsoft.com/en-us/research/publication/multijoint/",
edition = "Proceedings of The 17th Annual Meeting of the International
Speech Communication Association (INTERSPEECH 2016)"
}
@article{liu-2016-joint-intent-detection-slot-filling,
author = "Liu, Bing and Lane, Ian",
title = "Attention-Based Recurrent Neural Network Models for Joint
Intent Detection and Slot Filling",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1609.01454v1",
abstract = "Attention-based encoder-decoder neural network models have
recently shown promising results in machine translation and
speech recognition. In this work, we propose an
attention-based neural network model for joint intent
detection and slot filling, both of which are critical steps
for many speech understanding and dialog systems. Unlike in
machine translation and speech recognition, alignment is
explicit in slot filling. We explore different strategies in
incorporating this alignment information into the
encoder-decoder framework. Learning from the attention
mechanism in encoder-decoder model, we further propose
introducing attention to the alignment-based RNN models. Such
attentions provide additional information to the intent
classification and slot label prediction. Our independent
task models achieve state-of-the-art intent detection error
rate and slot filling F1 score on the benchmark ATIS
task. Our joint training model further obtains 0.56 \%
absolute (23.8 \% relative) error reduction on intent
detection and 0.23 \% absolute gain on slot filling over the
independent task models.",
archivePrefix= "arXiv",
eprint = "1609.01454",
primaryClass = "cs.CL"
}
@inproceedings{hori-2016-contextual-slu,
title = "Context-Sensitive and Role-Dependent Spoken Language
Understanding Using Bidirectional and Attention LSTMs",
author = "Chiori Hori and Takaaki Hori and Shinji Watanabe and John
R. Hershey",
booktitle = "INTERSPEECH",
year = 2016
}
@INPROCEEDINGS{bhargava-2013-easy-slot-detection,
author = "A. {Bhargava} and A. {Celikyilmaz} and D. {Hakkani-Tür} and
R. {Sarikaya}",
booktitle = "2013 IEEE International Conference on Acoustics, Speech and
Signal Processing",
title = "Easy contextual intent prediction and slot detection",
year = 2013,
pages = "8337-8341"
}
@InProceedings{chen-2016-mm-for-slu,
author = "Chen, Yun-Nung Vivian and Hakkani-Tür, Dilek and Tur, Gokhan
and Gao, Jianfeng and Deng, Li",
title = "End-to-End Memory Networks with Knowledge Carryover for
Multi-Turn Spoken Language Understanding",
booktitle = "Proceedings of The 17th Annual Meeting of the International
Speech Communication Association (INTERSPEECH 2016)",
year = 2016,
month = "June",
abstract = "Spoken language understanding (SLU) is a core component of a
spoken dialogue system. In the traditional architecture of
dialogue systems, the SLU component treats each utterance
independently of the others, and then the following components
aggregate the multi-turn information in the separate
phases. However, there are two challenges: 1) errors from
previous turns may be propagated and then degrade the
performance of the current turn; 2) knowledge mentioned in
the long history may not be carried into the current
turn. This paper addresses the above issues by proposing an
architecture using end-to-end memory networks to model
knowledge carryover in multi-turn conversations, where
utterances encoded with intents and slots can be stored as
embeddings in the memory and the decoding phase applies an
attention model to leverage previously stored semantics for
intent prediction and slot tagging simultaneously. The
experiments on Microsoft Cortana conversational data show
that the proposed memory network architecture can effectively
extract salient semantics for modeling knowledge carryover in
the multi-turn conversations and outperform the results using
the state-of-the-art recurrent neural network framework (RNN)
designed for single-turn SLU.",
publisher = "ISCA",
url          = "https://www.microsoft.com/en-us/research/publication/contextualslu/",
edition = "Proceedings of The 17th Annual Meeting of the International
Speech Communication Association (INTERSPEECH 2016)"
}
@inproceedings{bapna-2017-sequential-dialogue,
title = "Sequential Dialogue Context Modeling for Spoken Language
Understanding",
author = "Bapna, Ankur and T{\"u}r, Gokhan and Hakkani-T{\"u}r, Dilek
and Heck, Larry",
booktitle = "Proceedings of the 18th Annual {SIG}dial Meeting on Discourse
and Dialogue",
month = aug,
year = 2017,
address = "Saarbr{\"u}cken, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-5514",
doi = "10.18653/v1/W17-5514",
pages = "103-114",
abstract = "Spoken Language Understanding (SLU) is a key component of
goal oriented dialogue systems that would parse user
utterances into semantic frame representations. Traditionally
SLU does not utilize the dialogue history beyond the previous
system turn and contextual ambiguities are resolved by the
downstream components. In this paper, we explore novel
approaches for modeling dialogue context in a recurrent
neural network (RNN) based language understanding system. We
propose the Sequential Dialogue Encoder Network, that allows
encoding context from the dialogue history in chronological
order. We compare the performance of our proposed
architecture with two context models, one that uses just the
previous turn context and another that encodes dialogue
context in a memory network, but loses the order of
utterances in the dialogue history. Experiments with a
multi-domain dialogue dataset demonstrate that the proposed
architecture results in reduced semantic frame error rates."
}
@article{chen-2016-k-san,
author = "Chen, Yun-Nung and Hakkani-Tur, Dilek and Tur, Gokhan and
Celikyilmaz, Asli and Gao, Jianfeng and Deng, Li",
title = "Knowledge As a Teacher: Knowledge-Guided Structural Attention
Networks",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1609.03286v1",
abstract = "Natural language understanding (NLU) is a core component of a
spoken dialogue system. Recently recurrent neural networks
(RNN) obtained strong results on NLU due to their superior
ability of preserving sequential information over time.
Traditionally, the NLU module tags semantic slots for
utterances considering their flat structures, as the
underlying RNN structure is a linear chain. However, natural
language exhibits linguistic properties that provide rich,
structured information for better understanding. This paper
introduces a novel model, knowledge-guided structural
attention networks (K-SAN), a generalization of RNN to
additionally incorporate non-flat network topologies guided
by prior knowledge. There are two characteristics: 1)
important substructures can be captured from small training
data, allowing the model to generalize to previously unseen
test data; 2) the model automatically figures out the salient
substructures that are essential to predict the semantic tags
of the given sentences, so that the understanding performance
can be improved. The experiments on the benchmark Air Travel
Information System (ATIS) data show that the proposed K-SAN
architecture can effectively extract salient knowledge from
substructures with an attention mechanism, and outperform the
performance of the state-of-the-art neural network based
frameworks.",
archivePrefix= "arXiv",
eprint = "1609.03286",
primaryClass = "cs.AI"
}
@article{li-2017-lu-importance,
author = "Li, Xiujun and Chen, Yun-Nung and Li, Lihong and Gao,
Jianfeng and Celikyilmaz, Asli",
title = "Investigation of Language Understanding Impact for
Reinforcement Learning Based Dialogue Systems",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1703.07055v1",
abstract = "Language understanding is a key component in a spoken
dialogue system. In this paper, we investigate how the
language understanding module influences the dialogue system
performance by conducting a series of systematic experiments
on a task-oriented neural dialogue system in a reinforcement
learning based setting. The empirical study shows that among
different types of language understanding errors, slot-level
errors can have more impact on the overall performance of a
dialogue system compared to intent-level errors. In addition,
our experiments demonstrate that the reinforcement learning
based dialogue system is able to learn when and what to
confirm in order to achieve better performance and greater
robustness.",
archivePrefix= "arXiv",
eprint = "1703.07055",
primaryClass = "cs.CL"
}
@inproceedings{henderson-2013-nn-for-dst,
title = "Deep Neural Network Approach for the Dialog State Tracking
Challenge",
author = "Henderson, Matthew and Thomson, Blaise and Young, Steve",
booktitle = "Proceedings of the {SIGDIAL} 2013 Conference",
month = aug,
year = 2013,
address = "Metz, France",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W13-4073",
pages = "467-471"
}
@inproceedings{henderson-2015-ml-for-dst,
title = "Machine Learning for Dialog State Tracking: A Review",
author = "Matthew Henderson",
year = 2015,
booktitle = "Proceedings of The First International Workshop on Machine
Learning in Spoken Language Processing"
}
@article{mrksic-2015-rnn-for-dst,
author = "Mrk{\v{s}}i{\'c}, Nikola and S{\'e}aghdha, Diarmuid {\'O} and
Thomson, Blaise and Ga{\v{s}}i{\'c}, Milica and Su, Pei-Hao
and Vandyke, David and Wen, Tsung-Hsien and Young, Steve",
title = "Multi-Domain Dialog State Tracking Using Recurrent Neural
Networks",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1506.07190v1",
abstract = "Dialog state tracking is a key component of many modern
dialog systems, most of which are designed with a single,
well-defined domain in mind. This paper shows that dialog
data drawn from different dialog domains can be used to train
a general belief tracking model which can operate across all
of these domains, exhibiting superior performance to each of
the domain-specific models. We propose a training procedure
which uses out-of-domain data to initialise belief tracking
models for entirely new domains. This procedure leads to
improvements in belief tracking performance regardless of the
amount of in-domain data available for training the model.",
archivePrefix= "arXiv",
eprint = "1506.07190",
primaryClass = "cs.CL"
}
@article{mrksic-2016-neural-belief-tracker,
author = "Mrk{\v{s}}i{\'c}, Nikola and S{\'e}aghdha, Diarmuid {\'O} and
Wen, Tsung-Hsien and Thomson, Blaise and Young, Steve",
title = "Neural Belief Tracker: Data-Driven Dialogue State Tracking",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1606.03777v2",
abstract = "One of the core components of modern spoken dialogue systems
is the belief tracker, which estimates the user's goal at
every step of the dialogue. However, most current approaches
have difficulty scaling to larger, more complex dialogue
domains. This is due to their dependency on either: a) Spoken
Language Understanding models that require large amounts of
annotated training data; or b) hand-crafted lexicons for
capturing some of the linguistic variation in users'
language. We propose a novel Neural Belief Tracking (NBT)
framework which overcomes these problems by building on
recent advances in representation learning. NBT models reason
over pre-trained word vectors, learning to compose them into
distributed representations of user utterances and dialogue
context. Our evaluation on two datasets shows that this
approach surpasses past limitations, matching the performance
of state-of-the-art models which rely on hand-crafted
semantic lexicons and outperforming them when such lexicons
are not provided.",
archivePrefix= "arXiv",
eprint = "1606.03777",
primaryClass = "cs.CL"
}
@article{shi-2017-cnn-for-dst,
author = "Shi, Hongjie and Ushio, Takashi and Endo, Mitsuru and
Yamagami, Katsuyoshi and Horii, Noriaki",
title = "A Multichannel Convolutional Neural Network for
Cross-Language Dialog State Tracking",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1701.06247v1",
abstract = "The fifth Dialog State Tracking Challenge (DSTC5) introduces
a new cross-language dialog state tracking scenario, where
the participants are asked to build their trackers based on
the English training corpus, while evaluating them with the
unlabeled Chinese corpus. Although the computer-generated
translations for both English and Chinese corpus are provided
in the dataset, these translations contain errors and
careless use of them can easily hurt the performance of the
built trackers. To address this problem, we propose a
multichannel Convolutional Neural Networks (CNN)
architecture, in which we treat English and Chinese language
as different input channels of one single CNN model. In the
evaluation of DSTC5, we found that such multichannel
architecture can effectively improve the robustness against
translation errors. Additionally, our method for DSTC5 is
purely machine learning based and requires no prior knowledge
about the target language. We consider this a desirable
property for building a tracker in the cross-language
context, as not every developer will be familiar with both
languages.",
archivePrefix= "arXiv",
eprint = "1701.06247",
primaryClass = "cs.CL"
}
@article{nguyen-2017-kbc-overview,
author = "Nguyen, Dat Quoc",
title = "An Overview of Embedding Models of Entities and Relationships
for Knowledge Base Completion",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1703.08098v7",
abstract = "Knowledge bases (KBs) of real-world facts about entities and
their relationships are useful resources for a variety of
natural language processing tasks. However, because knowledge
bases are typically incomplete, it is useful to be able to
perform knowledge base completion or link prediction, i.e.,
predict whether a relationship not in the knowledge base is
likely to be true. This paper serves as a comprehensive
overview of embedding models of entities and relationships
for knowledge base completion, summarizing up-to-date
experimental results on standard benchmark datasets.",
archivePrefix= "arXiv",
eprint = "1703.08098",
primaryClass = "cs.CL"
}
@article{li-2019-bertsel,
author = "Li, Dongfang and Yu, Yifei and Chen, Qingcai and Li, Xinyu",
title = "Bertsel: Answer Selection With Pre-Trained Models",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1905.07588v1",
abstract = "Recently, pre-trained models have been the dominant paradigm
in natural language processing. They achieved remarkable
state-of-the-art performance across a wide range of related
tasks, such as textual entailment, natural language
inference, question answering, etc. BERT, proposed by Devlin
et al., has achieved a marked result on the GLUE
leaderboard with a deep transformer architecture. Despite its
soaring popularity, however, BERT has not yet been applied to
answer selection. This task is different from others with a
few nuances: first, modeling the relevance and correctness of
candidates matters compared to semantic relatedness and
syntactic structure; second, the length of an answer may be
different from other candidates and questions. In this paper,
we are the first to explore the performance of fine-tuning
BERT for answer selection. We achieved state-of-the-art results across
five popular datasets, demonstrating the success of
pre-trained models in this task.",
archivePrefix= "arXiv",
eprint = "1905.07588",
primaryClass = "cs.CL"
}
@inproceedings{lai-2019-gsamn,
title = "A Gated Self-attention Memory Network for Answer Selection",
author = "Lai, Tuan and Tran, Quan Hung and Bui, Trung and Kihara,
Daisuke",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1610",
doi = "10.18653/v1/D19-1610",
pages = "5953-5959",
abstract = "Answer selection is an important research problem, with
applications in many areas. Previous deep learning based
approaches for the task mainly adopt the Compare-Aggregate
architecture that performs word-level comparison followed by
aggregation. In this work, we take a departure from the
popular Compare-Aggregate architecture, and instead, propose
a new gated self-attention memory network for the
task. Combined with a simple transfer learning technique from
a large-scale online corpus, our model outperforms previous
methods by a large margin, achieving new state-of-the-art
results on two standard answer selection datasets: TrecQA and
WikiQA."
}
@article{mozafari-2019-bas,
author = "Mozafari, Jamshid and Fatemi, Afsaneh and Nematbakhsh,
Mohammad Ali",
title = "Bas: an Answer Selection Method Using Bert Language Model",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1911.01528v3",
abstract = "In recent years, Question Answering systems have become more
popular and widely used by users. Despite the increasing
popularity of these systems, their performance is not yet
sufficient for textual data and requires further
research. These systems consist of several parts, one of
which is the Answer Selection component. This component
detects the most relevant answer from a list of candidate
answers. The methods presented in previous research have
attempted to provide an independent model to undertake the
answer-selection task. An independent model cannot comprehend
the syntactic and semantic features of questions and answers
with a small training dataset. To fill this gap, language
models can be employed in implementing the answer selection
part. This action enables the model to have a better
understanding of the language in order to understand
questions and answers better than previous works. In this
research, we will present ``BAS'' (BERT Answer
Selection), which uses the BERT language model to comprehend
language. The empirical results of applying the model on the
TrecQA Raw, TrecQA Clean, and WikiQA datasets demonstrate
that using a robust language model such as BERT can enhance
the performance. Using a more robust classifier also enhances
the effect of the language model on the answer selection
component. The results demonstrate that language
comprehension is an essential requirement in natural language
processing tasks such as answer-selection.",
archivePrefix= "arXiv",
eprint = "1911.01528",
primaryClass = "cs.CL"
}
@article{sun-2019-dream,
author = "Sun, Kai and Yu, Dian and Chen, Jianshu and Yu, Dong and
Choi, Yejin and Cardie, Claire",
title = "Dream: a Challenge Dataset and Models for Dialogue-Based
Reading Comprehension",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1902.00164v1",
abstract = "We present DREAM, the first dialogue-based multiple-choice
reading comprehension dataset. Collected from
English-as-a-foreign-language examinations designed by human
experts to evaluate the comprehension level of Chinese
learners of English, our dataset contains 10,197
multiple-choice questions for 6,444 dialogues. In contrast to
existing reading comprehension datasets, DREAM is the first
to focus on in-depth multi-turn multi-party dialogue
understanding. DREAM is likely to present significant
challenges for existing reading comprehension systems: 84 \%
of answers are non-extractive, 85 \% of questions require
reasoning beyond a single sentence, and 34 \% of questions
also involve commonsense knowledge. We apply several popular
neural reading comprehension models that primarily exploit
surface information within the text and find them to, at
best, just barely outperform a rule-based approach. We next
investigate the effects of incorporating dialogue structure
and different kinds of general world knowledge into both
rule-based and (neural and non-neural) machine learning-based
reading comprehension models. Experimental results on the
DREAM dataset show the effectiveness of dialogue structure
and general world knowledge. DREAM will be available at
https://dataset.org/dream/.",
archivePrefix= "arXiv",
eprint = "1902.00164",
primaryClass = "cs.CL"
}
@article{sun-2019-c3,
author = "Sun, Kai and Yu, Dian and Yu, Dong and Cardie, Claire",
title = "Investigating Prior Knowledge for Challenging Chinese Machine
Reading Comprehension",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1904.09679v3",
abstract = "Machine reading comprehension tasks require a machine reader
to answer questions relevant to the given document. In this
paper, we present the first free-form multiple-Choice Chinese
machine reading Comprehension dataset (C^3), containing
13,369 documents (dialogues or more formally written
mixed-genre texts) and their associated 19,577
multiple-choice free-form questions collected from
Chinese-as-a-second-language examinations. We present a
comprehensive analysis of the prior knowledge (i.e.,
linguistic, domain-specific, and general world knowledge)
needed for these real-world problems. We implement rule-based
and popular neural methods and find that there is still a
significant performance gap between the best performing model
(68.5 \%) and human readers (96.0 \%), especially on problems
that require prior knowledge. We further study the effects of
distractor plausibility and data augmentation based on
translated relevant datasets for English on model
performance. We expect C^3 to present great challenges to
existing systems as answering 86.8 \% of questions requires
both knowledge within and beyond the accompanying document,
and we hope that C^3 can serve as a platform to study how to
leverage various kinds of prior knowledge to better
understand a given written or orally oriented text. C^3 is
available at https://dataset.org/c3/.",
archivePrefix= "arXiv",
eprint = "1904.09679",
primaryClass = "cs.CL"
}
@article{yu-2020-dialogre,
author = "Yu, Dian and Sun, Kai and Cardie, Claire and Yu, Dong",
title = "Dialogue-Based Relation Extraction",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2004.08056v1",
abstract = "We present the first human-annotated dialogue-based relation
extraction (RE) dataset DialogRE, aiming to support the
prediction of relation(s) between two arguments that appear
in a dialogue. We further offer DialogRE as a platform for
studying cross-sentence RE as most facts span multiple
sentences. We argue that speaker-related information plays a
critical role in the proposed task, based on an analysis of
similarities and differences between dialogue-based and
traditional RE tasks. Considering the timeliness of
communication in a dialogue, we design a new metric to
evaluate the performance of RE methods in a conversational
setting and investigate the performance of several
representative RE methods on DialogRE. Experimental results
demonstrate that a speaker-aware extension on the
best-performing model leads to gains in both the standard and
conversational evaluation settings. DialogRE is available at
https://dataset.org/dialogre/.",
archivePrefix= "arXiv",
eprint = "2004.08056",
primaryClass = "cs.CL"
}
@inproceedings{ratner-2018-snorkl-metal,
title = "Snorkel metal: Weak supervision for multi-task learning",
author = "Ratner, Alex and Hancock, Braden and Dunnmon, Jared and
Goldman, Roger and R{\'e}, Christopher",
booktitle = "Proceedings of the Second Workshop on Data Management for
End-To-End Machine Learning",
pages = "1-4",
year = 2018
}
@article{ratner-2018-snorkl-metal-1,
author = "Ratner, Alexander and Hancock, Braden and Dunnmon, Jared and
Sala, Frederic and Pandey, Shreyash and R{\'e}, Christopher",
title = "Training Complex Models With Multi-Task Weak Supervision",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1810.02840v2",
abstract = "As machine learning models continue to increase in
complexity, collecting large hand-labeled training sets has
become one of the biggest roadblocks in practice. Instead,
weaker forms of supervision that provide noisier but cheaper
labels are often used. However, these weak supervision
sources have diverse and unknown accuracies, may output
correlated labels, and may label different tasks or apply at
different levels of granularity. We propose a framework for
integrating and modeling such weak supervision sources by
viewing them as labeling different related sub-tasks of a
problem, which we refer to as the multi-task weak supervision
setting. We show that by solving a matrix completion-style
problem, we can recover the accuracies of these multi-task
sources given their dependency structure, but without any
labeled data, leading to higher-quality supervision for
training an end model. Theoretically, we show that the
generalization error of models trained with this approach
improves with the number of unlabeled data points, and
characterize the scaling with respect to the task and
dependency structures. On three fine-grained classification
problems, we show that our approach leads to average gains of
20.2 points in accuracy over a traditional supervised
approach, 6.8 points over a majority vote baseline, and 4.1
points over a previously proposed weak supervision method
that models tasks separately.",
archivePrefix= "arXiv",
eprint = "1810.02840",
primaryClass = "stat.ML"
}
@article{gong-2017-ruminating-reader,
author = "Gong, Yichen and Bowman, Samuel R.",
title = "Ruminating Reader: Reasoning With Gated Multi-Hop Attention",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1704.07415v1",
abstract = "To answer the question in machine comprehension (MC) task,
the models need to establish the interaction between the
question and the context. To tackle the problem that the
single-pass model cannot reflect on and correct its answer,
we present Ruminating Reader. Ruminating Reader adds a second
pass of attention and a novel information fusion component to
the Bi-Directional Attention Flow model (BiDAF). We propose
novel layer structures that construct a query-aware context
vector representation and fuse encoding representation with
intermediate representation on top of BiDAF model. We show
that a multi-hop attention mechanism can be applied to a
bi-directional attention structure. In experiments on SQuAD,
we find that the Reader outperforms the BiDAF baseline by a
substantial margin, and matches or surpasses the performance
of all other published systems.",
archivePrefix= "arXiv",
eprint = "1704.07415",
primaryClass = "cs.CL"
}
@inproceedings{williams-2018-multinli,
title = "A Broad-Coverage Challenge Corpus for Sentence Understanding
through Inference",
author = "Williams, Adina and Nangia, Nikita and Bowman, Samuel",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long Papers)",
month = jun,
year = 2018,
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N18-1101",
doi = "10.18653/v1/N18-1101",
pages = "1112-1122",
abstract = "This paper introduces the Multi-Genre Natural Language
Inference (MultiNLI) corpus, a dataset designed for use in
the development and evaluation of machine learning models for
sentence understanding. At 433k examples, this resource is
one of the largest corpora available for natural language
inference (a.k.a. recognizing textual entailment), improving
upon available resources in both its coverage and
difficulty. MultiNLI accomplishes this by offering data from
ten distinct genres of written and spoken English, making it
possible to evaluate systems on nearly the full complexity of
the language, while supplying an explicit setting for
evaluating cross-genre domain adaptation. In addition, an
evaluation using existing machine learning models designed
for the Stanford NLI corpus shows that it represents a
substantially more difficult task than does that corpus,
despite the two showing similar levels of inter-annotator
agreement."
}
@inproceedings{tomar-2017-decatt,
title = "Neural Paraphrase Identification of Questions with Noisy
Pretraining",
author = "Tomar, Gaurav Singh and Duque, Thyago and T{\"a}ckstr{\"o}m,
Oscar and Uszkoreit, Jakob and Das, Dipanjan",
booktitle = "Proceedings of the First Workshop on Subword and Character
Level Models in {NLP}",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-4121",
doi = "10.18653/v1/W17-4121",
pages = "142-147",
abstract = "We present a solution to the problem of paraphrase
identification of questions. We focus on a recent dataset of
question pairs annotated with binary paraphrase labels and
show that a variant of the decomposable attention model
(replacing the word embeddings of the decomposable attention
model of Parikh et al. 2016 with character n-gram
representations) results in accurate performance on this
task, while being far simpler than many competing neural
architectures. Furthermore, when the model is pretrained on a
noisy dataset of automatically collected question
paraphrases, it obtains the best reported performance on the
dataset."
}
@ARTICLE{wang-2020-match2,
author = "{Wang}, Zizhen and {Fan}, Yixing and {Guo}, Jiafeng and
{Yang}, Liu and {Zhang}, Ruqing and {Lan}, Yanyan and
{Cheng}, Xueqi and {Jiang}, Hui and {Wang}, Xiaozhao",
title = "{Match$^2$: A Matching over Matching Model for Similar
Question Identification}",
journal = "arXiv e-prints",
keywords = "Computer Science - Information Retrieval, Computer Science -
Computation and Language",
year = 2020,
month = jun,
eid = "arXiv:2006.11719",
pages = "arXiv:2006.11719",
archivePrefix= "arXiv",
eprint = "2006.11719",
primaryClass = "cs.IR",
adsurl = "https://ui.adsabs.harvard.edu/abs/2020arXiv200611719W",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{gupta-2019-faq-attentive-matching,
author = "Gupta, Sparsh and Carvalho, Vitor R.",
title = "FAQ Retrieval Using Attentive Matching",
year = 2019,
isbn = 9781450361729,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/3331184.3331294",
doi = "10.1145/3331184.3331294",
booktitle = "Proceedings of the 42nd International ACM SIGIR Conference on
Research and Development in Information Retrieval",
pages = "929–932",
numpages = 4,
keywords = "neural networks, attention mechanism, learning to rank",
location = "Paris, France",
series = "SIGIR’19"
}
@inproceedings{ji-2012-qa-topic-model,
author = "Ji, Zongcheng and Xu, Fei and Wang, Bin and He, Ben",
title = "Question-Answer Topic Model for Question Retrieval in
Community Question Answering",
year = 2012,
isbn = 9781450311564,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2396761.2398669",
doi = "10.1145/2396761.2398669",
booktitle = "Proceedings of the 21st ACM International Conference on
Information and Knowledge Management",
pages = "2471–2474",
numpages = 4,
keywords = "community question answering, question-answer topic model,
question retrieval, topic model, translation model",
location = "Maui, Hawaii, USA",
series = "CIKM ’12"
}
@article{sakata-2019-faq-retrieval,
author = "Sakata, Wataru and Shibata, Tomohide and Tanaka, Ribeka and
Kurohashi, Sadao",
title = "Faq Retrieval Using Query-Question Similarity and Bert-Based
Query-Answer Relevance",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1905.02851v2",
abstract = "Frequently Asked Question (FAQ) retrieval is an important
task where the objective is to retrieve an appropriate
Question-Answer (QA) pair from a database based on a user's
query. We propose a FAQ retrieval system that considers the
similarity between a user's query and a question as well as
the relevance between the query and an answer. Although a
common approach to FAQ retrieval is to construct labeled data
for training, it takes annotation costs. Therefore, we use a
traditional unsupervised information retrieval system to
calculate the similarity between the query and question. On
the other hand, the relevance between the query and answer
can be learned by using QA pairs in a FAQ database. The
recently-proposed BERT model is used for the relevance
calculation. Since the number of QA pairs in FAQ page is not
enough to train a model, we cope with this issue by
leveraging FAQ sets that are similar to the one in
question. We evaluate our approach on two datasets. The first
one is localgovFAQ, a dataset we construct in a Japanese
administrative municipality domain. The second is
StackExchange dataset, which is the public dataset in
English. We demonstrate that our proposed method outperforms
baseline methods on these datasets.",
archivePrefix= "arXiv",
eprint = "1905.02851",
primaryClass = "cs.IR"
}
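
A schematic of the two-signal design in sakata-2019-faq-retrieval, under stand-in components: TF-IDF cosine replaces the unsupervised retrieval engine used for query-question similarity, and the BERT-based query-answer relevance model is stubbed to a constant. The names faq_questions, qa_relevance, and alpha are illustrative.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

faq_questions = ["How do I reset my password?", "Where can I pay my bill?"]
faq_answers = ["Use the reset link on the login page.", "Pay at city hall."]

vectorizer = TfidfVectorizer().fit(faq_questions)

def qa_relevance(query: str, answer: str) -> float:
    return 0.0  # placeholder for a trained query-answer relevance model

def rank(query: str, alpha: float = 0.5):
    q_vec = vectorizer.transform([query])
    q_sim = cosine_similarity(q_vec, vectorizer.transform(faq_questions))[0]
    scores = [alpha * s + (1 - alpha) * qa_relevance(query, a)
              for s, a in zip(q_sim, faq_answers)]
    return sorted(zip(scores, faq_questions), reverse=True)

print(rank("forgot my password"))
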
@InProceedings{damani-2020-optimized-transformer-faq,
author = "Damani, Sonam and Narahari, Kedhar Nath and Chatterjee,
Ankush and Gupta, Manish and Agrawal, Puneet",
editor = "Lauw, Hady W. and Wong, Raymond Chi-Wing and Ntoulas,
Alexandros and Lim, Ee-Peng and Ng, See-Kiong and Pan, Sinno
Jialin",
title = "Optimized Transformer Models for FAQ Answering",
booktitle = "Advances in Knowledge Discovery and Data Mining",
year = 2020,
publisher = "Springer International Publishing",
address = "Cham",
pages = "235-248",
abstract = "Informational chatbots provide a highly effective medium for
improving operational efficiency in answering customer
queries for any enterprise. Chatbots are also preferred by
users/customers since unlike other alternatives like calling
customer care or browsing over FAQ pages, chatbots provide
instant responses, are easy to use, are less invasive and are
always available. In this paper, we discuss the problem of
FAQ answering which is central to designing a retrieval-based
informational chatbot. Given a set of FAQ pages s for an
enterprise, and a user query, we need to find the best
matching question-answer pairs from s. Building such a
semantic ranking system that works well across domains for
large QA databases with low runtime and model size is
challenging. Previous work based on feature engineering or
recurrent neural models either provides low accuracy or
incurs high runtime costs. We experiment with multiple
transformer based deep learning models, and also propose a
novel MT-DNN (Multi-task Deep Neural Network)-based
architecture, which we call Masked MT-DNN (or
MMT-DNN). MMT-DNN significantly outperforms other
state-of-the-art transformer models for the FAQ answering
task. Further, we propose an improved knowledge distillation
component to achieve $\sim$2.4x reduction in model-size and
$\sim$7x reduction in runtime while maintaining similar
accuracy. On a small benchmark dataset from SemEval 2017 CQA
Task 3, we show that our approach provides an NDCG@1 of
83.1. On another large dataset of $\sim$281K instances
corresponding to $\sim$30K queries from diverse
domains, our distilled 174 MB model provides an NDCG@1 of
75.08 with a CPU runtime of mere 31 ms establishing a new
state-of-the-art for FAQ answering.",
isbn = "978-3-030-47426-3"
}
@incollection{ba-2014-do-deep,
title = "Do Deep Nets Really Need to be Deep?",
author = "Ba, Jimmy and Caruana, Rich",
booktitle = "Advances in Neural Information Processing Systems 27",
editor = "Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence
and K. Q. Weinberger",
pages = "2654-2662",
year = 2014,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/5484-do-deep-nets-really-need-to-be-deep.pdf"
}
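
The compression trick in ba-2014-do-deep is to train the shallow student to regress the teacher's pre-softmax logits with an L2 loss rather than fitting 0/1 labels; a few lines of numpy make the objective concrete.

import numpy as np

def logit_regression_loss(student_logits, teacher_logits):
    # L2 regression on unnormalized logits, averaged over the batch
    diff = student_logits - teacher_logits
    return 0.5 * np.mean(np.sum(diff ** 2, axis=1))

student = np.array([[1.0, -0.5], [0.2, 0.3]])
teacher = np.array([[2.0, -1.0], [0.0, 0.5]])
print(logit_regression_loss(student, teacher))
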
@article{mirzadeh-2019-teacher-assistant,
author = "Mirzadeh, Seyed-Iman and Farajtabar, Mehrdad and Li, Ang and
Levine, Nir and Matsukawa, Akihiro and Ghasemzadeh, Hassan",
title = "Improved Knowledge Distillation Via Teacher Assistant",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1902.03393v2",
abstract = "Despite the fact that deep neural networks are powerful
models and achieve appealing results on many tasks, they are
too large to be deployed on edge devices like smartphones or
embedded sensor nodes. There have been efforts to compress
these networks, and a popular method is knowledge
distillation, where a large (teacher) pre-trained network is
used to train a smaller (student) network. However, in this
paper, we show that the student network performance degrades
when the gap between student and teacher is large. Given a
fixed student network, one cannot employ an arbitrarily large
teacher, or in other words, a teacher can effectively
transfer its knowledge to students up to a certain size, not
smaller. To alleviate this shortcoming, we introduce
multi-step knowledge distillation, which employs an
intermediate-sized network (teacher assistant) to bridge the
gap between the student and the teacher. Moreover, we study
the effect of teacher assistant size and extend the framework
to multi-step distillation. Theoretical analysis and
extensive experiments on CIFAR-10,100 and ImageNet datasets
and on CNN and ResNet architectures substantiate the
effectiveness of our proposed approach.",
archivePrefix= "arXiv",
eprint = "1902.03393",
primaryClass = "cs.LG"
}
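
Each hop of the teacher-to-assistant-to-student chain in mirzadeh-2019 uses ordinary temperature-softened distillation; below is a numpy sketch of that per-hop loss. The T^2 scaling follows the usual Hinton et al. convention, not anything specific to this paper.

import numpy as np

def softmax(z, T=1.0):
    z = z / T
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def kd_loss(student_logits, teacher_logits, T=4.0):
    p_teacher = softmax(teacher_logits, T)
    log_p_student = np.log(softmax(student_logits, T))
    # cross-entropy against the softened teacher targets
    return -(T * T) * np.mean((p_teacher * log_p_student).sum(axis=1))

s = np.array([[2.0, 0.5, -1.0]])
t = np.array([[3.0, 0.0, -2.0]])
print(kd_loss(s, t))
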
@article{schulz-2002-fast-string-correction,
title = "Fast string correction with Levenshtein automata",
author = "Schulz, Klaus U and Mihov, Stoyan",
journal = "International Journal on Document Analysis and Recognition",
volume = 5,
number = 1,
pages = "67-85",
year = 2002,
publisher = "Springer"
}
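
Not the automaton construction of schulz-2002 itself, but the membership test it compiles away: a plain dynamic program deciding whether two strings are within edit distance k. The automaton's payoff is answering this question for every dictionary word without running a per-pair DP.

def within_edit_distance(a: str, b: str, k: int) -> bool:
    if abs(len(a) - len(b)) > k:
        return False
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,               # deletion
                           cur[j - 1] + 1,            # insertion
                           prev[j - 1] + (ca != cb))) # substitution
        prev = cur
    return prev[-1] <= k

print(within_edit_distance("levenshtein", "levenstein", 1))  # True
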
@article{mihov-2004-fast-approx-search,
title = "Fast Approximate Search in Large Dictionaries",
author = "Mihov, Stoyan and Schulz, Klaus U.",
journal = "Computational Linguistics",
volume = 30,
number = 4,
year = 2004,
url = "https://www.aclweb.org/anthology/J04-4003",
doi = "10.1162/0891201042544938",
pages = "451-477"
}
@inproceedings{lei-2018-sru,
title = "Simple Recurrent Units for Highly Parallelizable Recurrence",
author = "Lei, Tao and Zhang, Yu and Wang, Sida I. and Dai, Hui and
Artzi, Yoav",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1477",
doi = "10.18653/v1/D18-1477",
pages = "4470-4481",
abstract = "Common recurrent neural architectures scale poorly due to the
intrinsic difficulty in parallelizing their state
computations. In this work, we propose the Simple Recurrent
Unit (SRU), a light recurrent unit that balances model
capacity and scalability. SRU is designed to provide
expressive recurrence, enable highly parallelized
implementation, and comes with careful initialization to
facilitate training of deep models. We demonstrate the
effectiveness of SRU on multiple NLP tasks. SRU achieves
5{---}9x speed-up over cuDNN-optimized LSTM on classification
and question answering datasets, and delivers stronger
results than LSTM and convolutional models. We also obtain an
average of 0.7 BLEU improvement over the Transformer model
(Vaswani et al., 2017) on translation by incorporating SRU
into the architecture."
}
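
A simplified numpy rendering of the SRU recurrence in lei-2018 (the published cell also has per-dimension peephole terms on the previous state and a highway scaling correction, omitted here). It shows why SRU parallelizes: every matrix multiply is batched over time up front, and only cheap elementwise gating stays sequential.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sru(x, W, Wf, bf, Wr, br):
    """x: (T, d). Returns hidden states (T, d)."""
    xt = x @ W.T               # candidates for all steps at once
    f = sigmoid(x @ Wf.T + bf) # forget gates, also batched over time
    r = sigmoid(x @ Wr.T + br) # reset gates for the highway connection
    c = np.zeros(x.shape[1])
    hs = []
    for t in range(x.shape[0]):  # only this elementwise loop is sequential
        c = f[t] * c + (1.0 - f[t]) * xt[t]
        hs.append(r[t] * np.tanh(c) + (1.0 - r[t]) * x[t])
    return np.stack(hs)

T, d = 5, 4
rng = np.random.default_rng(0)
x = rng.normal(size=(T, d))
W, Wf, Wr = (rng.normal(size=(d, d)) for _ in range(3))
bf = br = np.zeros(d)
print(sru(x, W, Wf, bf, Wr, br).shape)  # (5, 4)
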
@article{xiong-2017-dcn,
author = "Xiong, Caiming and Zhong, Victor and Socher, Richard",
title = "Dcn+: Mixed Objective and Deep Residual Coattention for
Question Answering",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1711.00106v2",
abstract = "Traditional models for question answering optimize using
cross entropy loss, which encourages exact answers at the
cost of penalizing nearby or overlapping answers that are
sometimes equally accurate. We propose a mixed objective that
combines cross entropy loss with self-critical policy
learning. The objective uses rewards derived from word
overlap to solve the misalignment between evaluation metric
and optimization objective. In addition to the mixed
objective, we improve dynamic coattention networks (DCN) with
a deep residual coattention encoder that is inspired by
recent work in deep self-attention and residual networks. Our
proposals improve model performance across question types and
input lengths, especially for long questions that require
the ability to capture long-term dependencies. On the
Stanford Question Answering Dataset, our model achieves
state-of-the-art results with 75.1 \% exact match accuracy
and 83.1 \% F1, while the ensemble obtains 78.9 \% exact
match accuracy and 86.0 \% F1.",
archivePrefix= "arXiv",
eprint = "1711.00106",
primaryClass = "cs.CL"
}
@article{andreas-2015-neural-module-networks,
author = "Andreas, Jacob and Rohrbach, Marcus and Darrell, Trevor and
Klein, Dan",
title = "Neural Module Networks",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1511.02799v4",
abstract = "Visual question answering is fundamentally compositional in
nature---a question like ``where is the dog?`` shares
substructure with questions like ``what color is the dog?``
and ``where is the cat?`` This paper seeks to simultaneously
exploit the representational capacity of deep networks and
the compositional linguistic structure of questions. We
describe a procedure for constructing and learning *neural
module networks*, which compose collections of
jointly-trained neural ``modules`` into deep networks for
question answering. Our approach decomposes questions into
their linguistic substructures, and uses these structures to
dynamically instantiate modular networks (with reusable
components for recognizing dogs, classifying colors,
etc.). The resulting compound networks are jointly
trained. We evaluate our approach on two challenging datasets
for visual question answering, achieving state-of-the-art
results on both the VQA natural image dataset and a new
dataset of complex questions about abstract shapes.",
archivePrefix= "arXiv",
eprint = "1511.02799",
primaryClass = "cs.CV"
}
@article{bao-2020-unilmv2,
author = "Bao, Hangbo and Dong, Li and Wei, Furu and Wang, Wenhui and
Yang, Nan and Liu, Xiaodong and Wang, Yu and Piao, Songhao
and Gao, Jianfeng and Zhou, Ming and Hon, Hsiao-Wuen",
title = "Unilmv2: Pseudo-Masked Language Models for Unified Language
Model Pre-Training",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2002.12804v1",
abstract = "We propose to pre-train a unified language model for both
autoencoding and partially autoregressive language modeling
tasks using a novel training procedure, referred to as a
pseudo-masked language model (PMLM). Given an input text with
masked tokens, we rely on conventional masks to learn
inter-relations between corrupted tokens and context via
autoencoding, and pseudo masks to learn intra-relations
between masked spans via partially autoregressive
modeling. With well-designed position embeddings and
self-attention masks, the context encodings are reused to
avoid redundant computation. Moreover, conventional masks
used for autoencoding provide global masking information, so
that all the position embeddings are accessible in partially
autoregressive language modeling. In addition, the two tasks
pre-train a unified language model as a bidirectional encoder
and a sequence-to-sequence decoder, respectively. Our
experiments show that the unified language models pre-trained
using PMLM achieve new state-of-the-art results on a wide
range of natural language understanding and generation tasks
across several widely used benchmarks.",
archivePrefix= "arXiv",
eprint = "2002.12804",
primaryClass = "cs.CL"
}
@article{humeau-2019-poly-encoders,
author = "Humeau, Samuel and Shuster, Kurt and Lachaux, Marie-Anne and
Weston, Jason",
title = "Poly-Encoders: Transformer Architectures and Pre-Training
Strategies for Fast and Accurate Multi-Sentence Scoring",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1905.01969v4",
abstract = "The use of deep pre-trained bidirectional transformers has
led to remarkable progress in a number of applications
(Devlin et al., 2018). For tasks that make pairwise
comparisons between sequences, matching a given input with a
corresponding label, two approaches are common:
Cross-encoders performing full self-attention over the pair
and Bi-encoders encoding the pair separately. The former
often performs better, but is too slow for practical use. In
this work, we develop a new transformer architecture, the
Poly-encoder, that learns global rather than token level
self-attention features. We perform a detailed comparison of
all three approaches, including what pre-training and
fine-tuning strategies work best. We show our models achieve
state-of-the-art results on three existing tasks; that
Poly-encoders are faster than Cross-encoders and more
accurate than Bi-encoders; and that the best results are
obtained by pre-training on large datasets similar to the
downstream tasks.",
archivePrefix= "arXiv",
eprint = "1905.01969",
primaryClass = "cs.CL"
}
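
The poly-encoder scoring step of humeau-2019 in numpy: m learned codes attend over the context's token vectors to give m global features, the candidate vector attends over those features, and the score is a dot product. Candidate vectors never see the context, so they can be precomputed as in a bi-encoder while keeping some late interaction. Shapes and random inputs are illustrative.

import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def poly_score(ctx_tokens, codes, cand_vec):
    """ctx_tokens: (T, d), codes: (m, d), cand_vec: (d,)."""
    att = softmax(codes @ ctx_tokens.T)     # (m, T) attention per code
    global_feats = att @ ctx_tokens         # (m, d) global context features
    w = softmax(cand_vec @ global_feats.T)  # (m,) candidate attends to codes
    ctx_vec = w @ global_feats              # (d,) final context vector
    return float(ctx_vec @ cand_vec)

rng = np.random.default_rng(0)
print(poly_score(rng.normal(size=(7, 16)),
                 rng.normal(size=(4, 16)),
                 rng.normal(size=16)))
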
@article{wang-2020-multi-level,
author = "Wang, Shuohang and Lan, Yunshi and Tay, Yi and Jiang, Jing
and Liu, Jingjing",
title = "Multi-Level Head-Wise Match and Aggregation in Transformer
for Textual Sequence Matching",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2001.07234v1",
abstract = "Transformer has been successfully applied to many natural
language processing tasks. However, for textual sequence
matching, simple matching between the representation of a
pair of sequences might bring in unnecessary noise. In this
paper, we propose a new approach to sequence pair matching
with Transformer, by learning head-wise matching
representations on multiple levels. Experiments show that our
proposed approach can achieve new state-of-the-art
performance on multiple tasks that rely only on pre-computed
sequence-vector-representation, such as SNLI, MNLI-match,
MNLI-mismatch, QQP, and SQuAD-binary.",
archivePrefix= "arXiv",
eprint = "2001.07234",
primaryClass = "cs.CL"
}
@article{raffel-2015-feed-forwar,
author = "Raffel, Colin and Ellis, Daniel P. W.",
title = "Feed-Forward Networks With Attention Can Solve Some Long-Term
Memory Problems",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1512.08756v5",
abstract = "We propose a simplified model of attention which is
applicable to feed-forward neural networks and demonstrate
that the resulting model can solve the synthetic ``addition``
and ``multiplication`` long-term memory problems for sequence
lengths which are both longer and more widely varying than
the best published results for these tasks.",
archivePrefix= "arXiv",
eprint = "1512.08756",
primaryClass = "cs.LG"
}
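
The simplification proposed in raffel-2015 fits in a few lines: score each state independently, softmax over time, take the weighted mean. The tanh scorer below is one concrete choice for the learnable function a(.).

import numpy as np

def feed_forward_attention(h, w, b=0.0):
    """h: (T, d) states; w: (d,) scoring weights. Returns context (d,)."""
    e = np.tanh(h @ w + b)               # e_t = a(h_t), scored in isolation
    alpha = np.exp(e) / np.exp(e).sum()  # softmax over time
    return alpha @ h                     # c = sum_t alpha_t h_t

rng = np.random.default_rng(0)
h = rng.normal(size=(6, 8))
print(feed_forward_attention(h, rng.normal(size=8)).shape)  # (8,)
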
@article{singh-2016-black-box,
author = "Singh, Sameer and Ribeiro, Marco Tulio and Guestrin, Carlos",
title = "Programs As Black-Box Explanations",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1611.07579v1",
abstract = "Recent work in model-agnostic explanations of black-box
machine learning has demonstrated that interpretability of
complex models does not have to come at the cost of accuracy
or model flexibility. However, it is not clear what kind of
explanations, such as linear models, decision trees, and rule
lists, are the appropriate family to consider, and different
tasks and models may benefit from different kinds of
explanations. Instead of picking a single family of
representations, in this work we propose to use ``programs``
as model-agnostic explanations. We show that small programs
can be expressive yet intuitive as explanations, and
generalize over a number of existing interpretable families.
We propose a prototype program induction method based on
simulated annealing that approximates the local behavior of
black-box classifiers around a specific prediction using
random perturbations. Finally, we present preliminary
application on small datasets and show that the generated
explanations are intuitive and accurate for a number of
classifiers.",
archivePrefix= "arXiv",
eprint = "1611.07579",
primaryClass = "stat.ML"
}
@article{ribeiro-2016-nothing-else,
author = "Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos",
title = "Nothing Else Matters: Model-Agnostic Explanations By
Identifying Prediction Invariance",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1611.05817v1",
abstract = "At the core of interpretable machine learning is the question
of whether humans are able to make accurate predictions about
a model's behavior. Assumed in this question are three
properties of the interpretable output: coverage, precision,
and effort. Coverage refers to how often humans think they
can predict the model's behavior, precision to how accurate
humans are in those predictions, and effort is either the
up-front effort required in interpreting the model, or the
effort required to make predictions about a model's behavior.
In this work, we propose anchor-LIME (aLIME), a
model-agnostic technique that produces high-precision
rule-based explanations for which the coverage boundaries are
very clear. We compare aLIME to linear LIME with simulated
experiments, and demonstrate the flexibility of aLIME with
qualitative examples from a variety of domains and tasks.",
archivePrefix= "arXiv",
eprint = "1611.05817",
primaryClass = "stat.ML"
}
@article{ribeiro-2016-model-agnostic,
author = "Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos",
title = "Model-Agnostic Interpretability of Machine Learning",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1606.05386v1",
abstract = "Understanding why machine learning models behave the way they
do empowers both system designers and end-users in many ways:
in model selection, feature engineering, in order to trust
and act upon the predictions, and in more intuitive user
interfaces. Thus, interpretability has become a vital concern
in machine learning, and work in the area of interpretable
models has found renewed interest. In some applications, such
models are as accurate as non-interpretable ones, and thus
are preferred for their transparency. Even when they are not
accurate, they may still be preferred when interpretability
is of paramount importance. However, restricting machine
learning to interpretable models is often a severe
limitation. In this paper we argue for explaining machine
learning predictions using model-agnostic approaches. By
treating the machine learning models as black-box functions,
these approaches provide crucial flexibility in the choice of
models, explanations, and representations, improving
debugging, comparison, and interfaces for a variety of users
and models. We also outline the main challenges for such
methods, and review a recently-introduced model-agnostic
explanation approach (LIME) that addresses these challenges.",
archivePrefix= "arXiv",
eprint = "1606.05386",
primaryClass = "stat.ML"
}
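
A schematic of the LIME procedure reviewed in ribeiro-2016-model-agnostic, under toy assumptions (a synthetic black box, Gaussian perturbations, an RBF proximity kernel): sample around the instance, query the model, weight by proximity, and read the explanation off a weighted linear surrogate.

import numpy as np
from sklearn.linear_model import Ridge

def black_box(X):  # stand-in for the model being explained
    return 1.0 / (1.0 + np.exp(-(2 * X[:, 0] - 3 * X[:, 1])))

def explain(x, n_samples=500, width=0.75):
    rng = np.random.default_rng(0)
    X = x + rng.normal(scale=0.5, size=(n_samples, x.size))
    y = black_box(X)
    d2 = ((X - x) ** 2).sum(axis=1)
    weights = np.exp(-d2 / width ** 2)  # proximity kernel around x
    surrogate = Ridge(alpha=1.0).fit(X, y, sample_weight=weights)
    return surrogate.coef_              # local feature importances

print(explain(np.array([0.2, -0.1, 0.7])))
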
@article{alvarez-melis-2018-robustness,
author = "Alvarez-Melis, David and Jaakkola, Tommi S.",
title = "On the Robustness of Interpretability Methods",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1806.08049v1",
abstract = "We argue that robustness of explanations---i.e., that similar
inputs should give rise to similar explanations---is a key
desideratum for interpretability. We introduce metrics to
quantify robustness and demonstrate that current methods do
not perform well according to these metrics. Finally, we
propose ways that robustness can be enforced on existing
interpretability approaches.",
archivePrefix= "arXiv",
eprint = "1806.08049",
primaryClass = "cs.LG"
}
@inproceedings{ribeiro-2018-anchors,
title = "Anchors: High-Precision Model-Agnostic Explanations",
author = "Marco Tulio Ribeiro and Sameer Singh and Carlos Guestrin",
booktitle = "AAAI",
year = 2018
}
@article{wu-2017-beyond-sparsity,
author = "Wu, Mike and Hughes, Michael C. and Parbhoo, Sonali and
Zazzi, Maurizio and Roth, Volker and Doshi-Velez, Finale",
title = "Beyond Sparsity: Tree Regularization of Deep Models for
Interpretability",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1711.06178v1",
abstract = "The lack of interpretability remains a key barrier to the
adoption of deep models in many applications. In this work,
we explicitly regularize deep models so human users might
step through the process behind their predictions in little
time. Specifically, we train deep time-series models so their
class-probability predictions have high accuracy while being
closely modeled by decision trees with few nodes. Using
intuitive toy examples as well as medical tasks for treating
sepsis and HIV, we demonstrate that this new tree
regularization yields models that are easier for humans to
simulate than simpler L1 or L2 penalties without sacrificing
predictive power.",
archivePrefix= "arXiv",
eprint = "1711.06178",
primaryClass = "stat.ML"
}
@article{zhou-2015-cam,
author = "Zhou, Bolei and Khosla, Aditya and Lapedriza, Agata and
Oliva, Aude and Torralba, Antonio",
title = "Learning Deep Features for Discriminative Localization",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1512.04150v1",
abstract = "In this work, we revisit the global average pooling layer
proposed in [13], and shed light on how it explicitly enables
the convolutional neural network to have remarkable
localization ability despite being trained on image-level
labels. While this technique was previously proposed as a
means for regularizing training, we find that it actually
builds a generic localizable deep representation that can be
applied to a variety of tasks. Despite the apparent
simplicity of global average pooling, we are able to achieve
37.1 \% top-5 error for object localization on ILSVRC 2014,
which is remarkably close to the 34.2 \% top-5 error achieved
by a fully supervised CNN approach. We demonstrate that our
network is able to localize the discriminative image regions
on a variety of tasks despite not being trained for them",
archivePrefix= "arXiv",
eprint = "1512.04150",
primaryClass = "cs.CV"
}
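
The CAM computation of zhou-2015 reduces to one line once the insight is stated: with global average pooling the class score is linear in the pooled feature maps, so the same class weights re-weight the unpooled maps into a localization heatmap.

import numpy as np

def class_activation_map(feature_maps, class_weights):
    """feature_maps: (K, H, W); class_weights: (K,) for one class."""
    return np.tensordot(class_weights, feature_maps, axes=1)  # (H, W)

rng = np.random.default_rng(0)
fmaps = rng.random((32, 7, 7))  # last conv layer activations
w_c = rng.normal(size=32)       # classifier weights of the target class
print(class_activation_map(fmaps, w_c).shape)  # (7, 7)
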
@article{selvaraju-2016-grad-cam,
author = "Selvaraju, Ramprasaath R. and Cogswell, Michael and Das,
Abhishek and Vedantam, Ramakrishna and Parikh, Devi and
Batra, Dhruv",
title = "Grad-Cam: Visual Explanations From Deep Networks Via
Gradient-Based Localization",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1610.02391v4",
abstract = "We propose a technique for producing ``visual explanations``
for decisions from a large class of CNN-based models, making
them more transparent. Our approach - Gradient-weighted Class
Activation Mapping (Grad-CAM), uses the gradients of any
target concept, flowing into the final convolutional layer to
produce a coarse localization map highlighting important
regions in the image for predicting the concept. Grad-CAM is
applicable to a wide variety of CNN model-families: (1) CNNs
with fully-connected layers, (2) CNNs used for structured
outputs, (3) CNNs used in tasks with multimodal inputs or
reinforcement learning, without any architectural changes or
re-training. We combine Grad-CAM with fine-grained
visualizations to create a high-resolution
class-discriminative visualization and apply it to
off-the-shelf image classification, captioning, and visual
question answering (VQA) models, including ResNet-based
architectures. In the context of image classification models,
our visualizations (a) lend insights into their failure
modes, (b) are robust to adversarial images, (c) outperform
previous methods on localization, (d) are more faithful to
the underlying model and (e) help achieve generalization by
identifying dataset bias. For captioning and VQA, we show
that even non-attention based models can localize inputs. We
devise a way to identify important neurons through Grad-CAM
and combine it with neuron names to provide textual
explanations for model decisions. Finally, we design and
conduct human studies to measure if Grad-CAM helps users
establish appropriate trust in predictions from models and
show that Grad-CAM helps untrained users successfully discern
a `stronger` model from a `weaker` one even when both make
identical predictions. Our code is available at
https://github.com/ramprs/grad-cam/, along with a demo at
http://gradcam.cloudcv.org, and a video at
youtu.be/COjUB9Izk6E.",
archivePrefix= "arXiv",
eprint = "1610.02391",
primaryClass = "cs.CV"
}
@article{baehrens-2010-parzen,
author = {Baehrens, David and Schroeter, Timon and Harmeling, Stefan
and Kawanabe, Motoaki and Hansen, Katja and M\"{u}ller,
Klaus-Robert},
title = "How to Explain Individual Classification Decisions",
year = 2010,
issue_date = "3/1/2010",
publisher = "JMLR.org",
volume = 11,
issn = "1532-4435",
journal = "J. Mach. Learn. Res.",
month = aug,
pages = "1803–1831",
numpages = 29
}
@article{mcinnes-2017-hdbscan,
title = "hdbscan: Hierarchical density based clustering",
author = "McInnes, Leland and Healy, John and Astels, Steve",
journal = "Journal of Open Source Software",
volume = 2,
number = 11,
pages = 205,
year = 2017
}
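
Typical usage of the hdbscan package described in mcinnes-2017, on synthetic data; points the algorithm treats as noise get the label -1, and min_cluster_size is the main knob.

import numpy as np
import hdbscan

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.3, (50, 2)),
               rng.normal(5, 0.3, (50, 2)),
               rng.uniform(-2, 7, (10, 2))])  # two blobs plus scatter

clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
labels = clusterer.fit_predict(X)
print(sorted(set(labels)))  # e.g. [-1, 0, 1]
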
@article{osborne-2013-data-clearning,
title = "Is data cleaning and the testing of assumptions relevant in
the 21st century?",
author = "Osborne, Jason W",
journal = "Frontiers in Psychology",
volume = 4,
pages = 370,
year = 2013,
publisher = "Frontiers"
}
@inproceedings{fu-2019-graph-rel,
author = "Fu, Tsu-Jui and Li, Peng-Hsuan and Ma, Wei-Yun",
title = "{G}raph{R}el: Modeling Text as Relational Graphs for Joint
Entity and Relation Extraction",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
year = 2019,
pages = "1409-1418",
doi = "10.18653/v1/P19-1136",
url = "https://doi.org/10.18653/v1/P19-1136",
abstract = "In this paper, we present GraphRel, an end-to-end relation
extraction model which uses graph convolutional networks
(GCNs) to jointly learn named entities and relations. In
contrast to previous baselines, we consider the interaction
between named entities and relations via a 2nd-phase
relation-weighted GCN to better extract relations. Linear and
dependency structures are both used to extract both
sequential and regional features of the text, and a complete
word graph is further utilized to extract implicit features
among all word pairs of the text. With the graph-based
approach, the prediction for overlapping relations is
substantially improved over previous sequential
approaches. We evaluate GraphRel on two public datasets: NYT
and WebNLG. Results show that GraphRel maintains high
precision while increasing recall substantially. Also,
GraphRel outperforms previous work by 3.2{\%} and 5.8{\%} (F1
score), achieving a new state-of-the-art for relation
extraction.",
address = "Florence, Italy",
month = jul,
publisher = "Association for Computational Linguistics"
}
@article{xie-2019-uda,
author = "Xie, Qizhe and Dai, Zihang and Hovy, Eduard and Luong,
Minh-Thang and Le, Quoc V.",
title = "Unsupervised Data Augmentation for Consistency Training",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1904.12848v5",
abstract = "Semi-supervised learning lately has shown much promise in
improving deep learning models when labeled data is
scarce. Common among recent approaches is the use of
consistency training on a large amount of unlabeled data to
constrain model predictions to be invariant to input
noise. In this work, we present a new perspective on how to
effectively noise unlabeled examples and argue that the
quality of noising, specifically those produced by advanced
data augmentation methods, plays a crucial role in
semi-supervised learning. By substituting simple noising
operations with advanced data augmentation methods such as
RandAugment and back-translation, our method brings
substantial improvements across six language and three vision
tasks under the same consistency training framework. On the
IMDb text classification dataset, with only 20 labeled
examples, our method achieves an error rate of 4.20,
outperforming the state-of-the-art model trained on 25,000
labeled examples. On a standard semi-supervised learning
benchmark, CIFAR-10, our method outperforms all previous
approaches and achieves an error rate of 5.43 with only 250
examples. Our method also combines well with transfer
learning, e.g., when finetuning from BERT, and yields
improvements in high-data regime, such as ImageNet, whether
when there is only 10 \% labeled data or when a full labeled
set with 1.3M extra unlabeled examples is used. Code is
available at https://github.com/google-research/uda.",
archivePrefix= "arXiv",
eprint = "1904.12848",
primaryClass = "cs.LG"
}
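
The consistency objective of xie-2019, sketched in numpy on already-computed class probabilities: supervised cross-entropy on labeled data plus a KL term pulling the prediction on an augmented unlabeled example toward the (fixed, non-differentiated) prediction on the original. Training details such as confidence masking and scheduling are omitted.

import numpy as np

def cross_entropy(p_true, p_pred, eps=1e-12):
    return -np.mean((p_true * np.log(p_pred + eps)).sum(axis=1))

def kl(p, q, eps=1e-12):
    return np.mean((p * (np.log(p + eps) - np.log(q + eps))).sum(axis=1))

def uda_loss(p_labeled, y_onehot, p_unlabeled, p_augmented, lam=1.0):
    supervised = cross_entropy(y_onehot, p_labeled)
    consistency = kl(p_unlabeled, p_augmented)  # p_unlabeled is the target
    return supervised + lam * consistency

y = np.array([[1.0, 0.0]])
print(uda_loss(np.array([[0.8, 0.2]]), y,
               np.array([[0.6, 0.4]]), np.array([[0.5, 0.5]])))
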
@article{cubuk-2019-randaugment,
author = "Cubuk, Ekin D. and Zoph, Barret and Shlens, Jonathon and Le,
Quoc V.",
title = "Randaugment: Practical Automated Data Augmentation With a
Reduced Search Space",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1909.13719v2",
abstract = "Recent work has shown that data augmentation has the
potential to significantly improve the generalization of deep
learning models. Recently, automated augmentation strategies
have led to state-of-the-art results in image classification
and object detection. While these strategies were optimized
for improving validation accuracy, they also led to
state-of-the-art results in semi-supervised learning and
improved robustness to common corruptions of images. An
obstacle to a large-scale adoption of these methods is a
separate search phase which increases the training complexity
and may substantially increase the computational
cost. Additionally, due to the separate search phase, these
approaches are unable to adjust the regularization strength
based on model or dataset size. Automated augmentation
policies are often found by training small models on small
datasets and subsequently applied to train larger models. In
this work, we remove both of these obstacles. RandAugment has
a significantly reduced search space which allows it to be
trained on the target task with no need for a separate proxy
task. Furthermore, due to the parameterization, the
regularization strength may be tailored to different model
and dataset sizes. RandAugment can be used uniformly across
different tasks and datasets and works out of the box,
matching or surpassing all previous automated augmentation
approaches on CIFAR-10/100, SVHN, and ImageNet. On the
ImageNet dataset we achieve 85.0 \% accuracy, a 0.6 \%
increase over the previous state-of-the-art and 1.0 \%
increase over baseline augmentation. On object detection,
RandAugment leads to 1.0-1.3 \% improvement over baseline
augmentation, and is within 0.3 \% mAP of AutoAugment on
COCO. Finally, due to its interpretable hyperparameter,
RandAugment may be used to investigate the role of data
augmentation with varying model and dataset size. Code is
available online.",
archivePrefix= "arXiv",
eprint = "1909.13719",
primaryClass = "cs.CV"
}
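
RandAugment's entire search space is two integers, N and M: draw N transformations uniformly from a fixed list and apply each at global magnitude M. A sketch with placeholder ops standing in for the image transformations (rotations, shears, color jitter, ...).

import random

OPS = {
    "identity": lambda x, m: x,
    "add":      lambda x, m: x + m,             # placeholder transforms
    "scale":    lambda x, m: x * (1 + m / 10),
}

def rand_augment(x, n=2, m=3, rng=None):
    rng = rng or random.Random(0)
    for op_name in rng.choices(list(OPS), k=n):  # N ops drawn uniformly
        x = OPS[op_name](x, m)                   # each at magnitude M
    return x

print(rand_augment(10.0))
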
@article{pan-2020-adversarial-validation,
author = "Pan, Jing and Pham, Vincent and Dorairaj, Mohan and Chen,
Huigang and Lee, Jeong-Yoon",
title = "Adversarial Validation Approach To Concept Drift Problem in
User Targeting Automation Systems At Uber",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2004.03045v2",
abstract = "In user targeting automation systems, concept drift in input
data is one of the main challenges. It deteriorates model
performance on new data over time. Previous research on
concept drift mostly proposed model retraining after
observing performance decreases. However, this approach is
suboptimal because the system fixes the problem only after
suffering from poor performance on new data. Here, we
introduce an adversarial validation approach to concept drift
problems in user targeting automation systems. With our
approach, the system detects concept drift in new data before
making inference, trains a model, and produces predictions
adapted to the new data. We show that our approach addresses
concept drift effectively with the AutoML3 Lifelong Machine
Learning challenge data as well as in Uber's internal user
targeting automation system, MaLTA.",
archivePrefix= "arXiv",
eprint = "2004.03045",
primaryClass = "cs.LG"
}
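
The adversarial-validation test of pan-2020 in miniature with scikit-learn: label old data 0 and new data 1, train a discriminator, and read the cross-validated AUC. Near 0.5 means the two samples are indistinguishable; well above 0.5 flags drift before any inference is served. Data here is synthetic.

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
old = rng.normal(0.0, 1.0, (300, 5))
new = rng.normal(0.3, 1.0, (300, 5))  # slightly shifted distribution

X = np.vstack([old, new])
y = np.r_[np.zeros(len(old)), np.ones(len(new))]
auc = cross_val_score(GradientBoostingClassifier(), X, y,
                      cv=5, scoring="roc_auc").mean()
print(f"adversarial AUC = {auc:.2f}")  # well above 0.5 signals drift
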
@article{lin-2019-unknown-detection,
author = "Lin, Ting-En and Xu, Hua",
title = "Deep Unknown Intent Detection With Margin Loss",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1906.00434v1",
abstract = "Identifying the unknown (novel) user intents that have never
appeared in the training set is a challenging task in the
dialogue system. In this paper, we present a two-stage method
for detecting unknown intents. We use bidirectional long
short-term memory (BiLSTM) network with the margin loss as
the feature extractor. With margin loss, we can learn
discriminative deep features by forcing the network to
maximize inter-class variance and to minimize intra-class
variance. Then, we feed the feature vectors to the
density-based novelty detection algorithm, local outlier
factor (LOF), to detect unknown intents. Experiments on two
benchmark datasets show that our method can yield consistent
improvements compared with the baseline methods.",
archivePrefix= "arXiv",
eprint = "1906.00434",
primaryClass = "cs.CL"
}
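
The second, novelty-detection stage of lin-2019 with scikit-learn's LocalOutlierFactor in novelty mode; random vectors stand in for the margin-loss BiLSTM features the paper extracts.

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(0)
known_feats = rng.normal(0.0, 1.0, (200, 16))  # features of known intents

lof = LocalOutlierFactor(n_neighbors=20, novelty=True).fit(known_feats)

query = rng.normal(5.0, 1.0, (1, 16))  # far from the training density
print(lof.predict(query))              # [-1] -> flagged as unknown intent
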
@article{tompson-2014-spatial-dropout,
author = "Tompson, Jonathan and Goroshin, Ross and Jain, Arjun and
LeCun, Yann and Bregler, Christopher",
title = "Efficient Object Localization Using Convolutional Networks",
journal = "CoRR",
year = 2014,
url = "http://arxiv.org/abs/1411.4280v3",
abstract = "Recent state-of-the-art performance on human-body pose
estimation has been achieved with Deep Convolutional Networks
(ConvNets). Traditional ConvNet architectures include pooling
and sub-sampling layers which reduce computational
requirements, introduce invariance and prevent over-training.
These benefits of pooling come at the cost of reduced
localization accuracy. We introduce a novel architecture
which includes an efficient `position refinement' model that
is trained to estimate the joint offset location within a
small region of the image. This refinement model is jointly
trained in cascade with a state-of-the-art ConvNet model to
achieve improved accuracy in human joint location
estimation. We show that the variance of our detector
approaches the variance of human annotations on the FLIC
dataset and outperforms all existing approaches on the
MPII-human-pose dataset.",
archivePrefix= "arXiv",
eprint = "1411.4280",
primaryClass = "cs.CV"
}
@article{yang-2018-rethinking-structure,
author = "Yang, Yao-Yuan and Lin, Yi-An and Chu, Hong-Min and Lin,
Hsuan-Tien",
title = "Deep Learning With a Rethinking Structure for Multi-Label
Classification",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1802.01697v2",
abstract = "Multi-label classification (MLC) is an important class of
machine learning problems that come with a wide spectrum of
applications, each demanding a possibly different evaluation
criterion. When solving the MLC problems, we generally expect
the learning algorithm to take the hidden correlation of the
labels into account to improve the prediction
performance. Extracting the hidden correlation is generally a
challenging task. In this work, we propose a novel deep
learning framework to better extract the hidden correlation
with the help of the memory structure within recurrent neural
networks. The memory stores the temporary guesses on the
labels and effectively allows the framework to rethink about
the goodness and correlation of the guesses before making the
final prediction. Furthermore, the rethinking process makes
it easy to adapt to different evaluation criteria to match
real-world application needs. In particular, the framework
can be trained in an end-to-end style with respect to any
given MLC evaluation criteria. The end-to-end design can be
seamlessly combined with other deep learning techniques to
conquer challenging MLC problems like image
tagging. Experimental results across many real-world data
sets justify that the rethinking framework indeed improves
MLC performance across different evaluation criteria and
leads to superior performance over state-of-the-art MLC
algorithms.",
archivePrefix= "arXiv",
eprint = "1802.01697",
primaryClass = "cs.LG"
}
@inproceedings{yang-2019-seq2set,
author = "Yang, Pengcheng and Luo, Fuli and Ma, Shuming and Lin,
Junyang and Sun, Xu",
title = "A Deep Reinforced Sequence-to-Set Model for Multi-Label
Classification",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
year = 2019,
pages = "5252-5258",
doi = "10.18653/v1/P19-1518",
url = "https://doi.org/10.18653/v1/P19-1518",
abstract = "Multi-label classification (MLC) aims to predict a set of
labels for a given instance. Based on a pre-defined label
order, the sequence-to-sequence (Seq2Seq) model trained via
maximum likelihood estimation method has been successfully
applied to the MLC task and shows powerful ability to capture
high-order correlations between labels. However, the output
labels are essentially an unordered set rather than an
ordered sequence. This inconsistency tends to result in some
intractable problems, e.g., sensitivity to the label
order. To remedy this, we propose a simple but effective
sequence-to-set model. The proposed model is trained via
reinforcement learning, where reward feedback is designed to
be independent of the label order. In this way, we can reduce
the dependence of the model on the label order, as well as
capture high-order correlations between labels. Extensive
experiments show that our approach can substantially
outperform competitive baselines, as well as effectively
reduce the sensitivity to the label order.",
address = "Florence, Italy",
month = jul,
publisher = "Association for Computational Linguistics"
}
@ARTICLE{zhu-2018-label-correlation,
author = "Y. {Zhu} and J. T. {Kwok} and Z. {Zhou}",
journal = "IEEE Transactions on Knowledge and Data Engineering",
title = "Multi-Label Learning with Global and Local Label Correlation",
year = 2018,
volume = 30,
number = 6,
pages = "1081-1094"
}
@article{garg-2015-exploring-correlation,
author = "Garg, Amit and Noyola, Jonathan and Verma, Romil and Saxena,
Ashutosh and Jami, Aditya",
title = "Exploring Correlation Between Labels To Improve Multi-Label
Classification",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1511.07953v1",
abstract = "This paper attempts multi-label classification by extending
the idea of independent binary classification models for each
output label, and exploring how the inherent correlation
between output labels can be used to improve
predictions. Logistic Regression, Naive Bayes, Random Forest,
and SVM models were constructed, with SVM giving the best
results: an improvement of 12.9\% over binary models was
achieved for hold out cross validation by augmenting with
pairwise correlation probabilities of the labels.",
archivePrefix= "arXiv",
eprint = "1511.07953",
primaryClass = "cs.LG"
}
@inproceedings{huang-2012-multi-label,
author = "Huang, Sheng-Jun and Zhou, Zhi-Hua",
title = "Multi-Label Learning by Exploiting Label Correlations
Locally",
year = 2012,
publisher = "AAAI Press",
abstract = "It is well known that exploiting label correlations is
important for multi-label learning. Existing approaches
typically exploit label correlations globally, by assuming
that the label correlations are shared by all the
instances. In real-world tasks, however, different instances
may share different label correlations, and few correlations
are globally applicable. In this paper, we propose the ML-LOC
approach which allows label correlations to be exploited
locally. To encode the local influence of label correlations,
we derive a LOC code to enhance the feature representation of
each instance. The global discrimination fitting and local
correlation sensitivity are incorporated into a unified
framework, and an alternating solution is developed for the
optimization. Experimental results on a number of image, text
and gene data sets validate the effectiveness of our
approach.",
booktitle = "Proceedings of the Twenty-Sixth AAAI Conference on Artificial
Intelligence",
pages = "949–955",
numpages = 7,
location = "Toronto, Ontario, Canada",
series = "AAAI'12"
}
@inproceedings{li-2014-condensed-filter-tree,
author = "Chun-Liang Li and Hsuan-Tien Lin",
year = 2014,
month = 01,
pages = "663-673",
title = "Condensed filter tree for cost-sensitive multi-label
classification",
volume = 1,
journal = "31st International Conference on Machine Learning, ICML 2014"
}
@incollection{nam-2017-maximing-subset-accuracy,
title = "Maximizing Subset Accuracy with Recurrent Neural Networks in
Multi-label Classification",
author = {Nam, Jinseok and Loza Menc\'{\i}a, Eneldo and Kim, Hyunwoo J
and F\"{u}rnkranz, Johannes},
booktitle = "Advances in Neural Information Processing Systems 30",
editor = "I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett",
pages = "5413-5423",
year = 2017,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/7125-maximizing-subset-accuracy-with-recurrent-neural-networks-in-multi-label-classification.pdf"
}
@article{rennie-2016-self-critical,
author = "Rennie, Steven J. and Marcheret, Etienne and Mroueh, Youssef
and Ross, Jarret and Goel, Vaibhava",
title = "Self-Critical Sequence Training for Image Captioning",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1612.00563v2",
abstract = "Recently it has been shown that policy-gradient methods for
reinforcement learning can be utilized to train deep
end-to-end systems directly on non-differentiable metrics for
the task at hand. In this paper we consider the problem of
optimizing image captioning systems using reinforcement
learning, and show that by carefully optimizing our systems
using the test metrics of the MSCOCO task, significant gains
in performance can be realized. Our systems are built using a
new optimization approach that we call self-critical sequence
training (SCST). SCST is a form of the popular REINFORCE
algorithm that, rather than estimating a ``baseline`` to
normalize the rewards and reduce variance, utilizes the
output of its own test-time inference algorithm to normalize
the rewards it experiences. Using this approach, estimating
the reward signal (as actor-critic methods must do) and
estimating normalization (as REINFORCE algorithms typically
do) is avoided, while at the same time harmonizing the model
with respect to its test-time inference
procedure. Empirically we find that directly optimizing the
CIDEr metric with SCST and greedy decoding at test-time is
highly effective. Our results on the MSCOCO evaluation server
establish a new state-of-the-art on the task, improving the
best result in terms of CIDEr from 104.9 to 114.7.",
archivePrefix= "arXiv",
eprint = "1612.00563",
primaryClass = "cs.LG"
}
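
The SCST estimator of rennie-2016, reduced to one function: the reward of the greedy decode baselines the reward of a sampled decode, so no learned critic is needed. Rewards and log-probabilities below are toy inputs; in the paper the reward is a caption metric such as CIDEr.

import numpy as np

def scst_loss(logprobs_sampled, reward_sampled, reward_greedy):
    """logprobs_sampled: (T,) log-probs of the sampled tokens."""
    advantage = reward_sampled - reward_greedy  # greedy decode as baseline
    return -advantage * np.sum(logprobs_sampled)

lp = np.log(np.array([0.4, 0.7, 0.9]))
print(scst_loss(lp, reward_sampled=0.8, reward_greedy=0.6))
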
@techreport{settles-2009-active-learning,
title = "Active learning literature survey",
author = "Settles, Burr",
year = 2009,
institution = "University of Wisconsin-Madison Department of Computer
Sciences"
}
@incollection{aggarwal-2014-active-learning,
title = "Active learning: A survey",
author = "Aggarwal, Charu C and Kong, Xiangnan and Gu, Quanquan and
Han, Jiawei and Philip, S Yu",
booktitle = "Data Classification: Algorithms and Applications",
pages = "571-605",
year = 2014,
publisher = "CRC Press"
}
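
Uncertainty sampling, the simplest query strategy covered by both of the surveys above: rank the unlabeled pool by predictive entropy and send the top-k examples to the annotator.

import numpy as np

def entropy(p, eps=1e-12):
    return -(p * np.log(p + eps)).sum(axis=1)

def select_batch(probs, k):
    """probs: (n, n_classes) model predictions on the unlabeled pool."""
    return np.argsort(-entropy(probs))[:k]

probs = np.array([[0.98, 0.02], [0.55, 0.45], [0.70, 0.30]])
print(select_batch(probs, k=1))  # [1] -> the most uncertain example
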
@article{tang-2019-distilling-bert,
author = "Tang, Raphael and Lu, Yao and Liu, Linqing and Mou, Lili and
Vechtomova, Olga and Lin, Jimmy",
title = "Distilling Task-Specific Knowledge From Bert Into Simple
Neural Networks",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1903.12136v1",
abstract = "In the natural language processing literature, neural
networks are becoming increasingly deeper and complex. The
recent poster child of this trend is the deep language
representation model, which includes BERT, ELMo, and
GPT. These developments have led to the conviction that
previous-generation, shallower neural networks for language
understanding are obsolete. In this paper, however, we
demonstrate that rudimentary, lightweight neural networks can
still be made competitive without architecture changes,
external training data, or additional input features. We
propose to distill knowledge from BERT, a state-of-the-art
language representation model, into a single-layer BiLSTM, as
well as its siamese counterpart for sentence-pair
tasks. Across multiple datasets in paraphrasing, natural
language inference, and sentiment classification, we achieve
comparable results with ELMo, while using roughly 100 times
fewer parameters and 15 times less inference time.",
archivePrefix= "arXiv",
eprint = "1903.12136",
primaryClass = "cs.CL"
}
@article{tay-2020-efficient-transformers,
author = "Tay, Yi and Dehghani, Mostafa and Bahri, Dara and Metzler,
Donald",
title = "Efficient Transformers: a Survey",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2009.06732v1",
abstract = "Transformer model architectures have garnered immense
interest lately due to their effectiveness across a range of
domains like language, vision and reinforcement learning. In
the field of natural language processing for example,
Transformers have become an indispensable staple in the
modern deep learning stack. Recently, a dizzying number of
``X-former`` models have been proposed - Reformer, Linformer,
Performer, Longformer, to name a few - which improve upon the
original Transformer architecture, many of which make
improvements around computational and memory efficiency. With
the aim of helping the avid researcher navigate this flurry,
this paper characterizes a large and thoughtful selection of
recent efficiency-flavored ``X-former`` models, providing an
organized and comprehensive overview of existing work and
models across multiple domains.",
archivePrefix= "arXiv",
eprint = "2009.06732",
primaryClass = "cs.LG"
}
@article{wei-2019-casrel,
author = "Wei, Zhepei and Su, Jianlin and Wang, Yue and Tian, Yuan and
Chang, Yi",
title = "A Novel Cascade Binary Tagging Framework for Relational
Triple Extraction",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1909.03227v4",
abstract = "Extracting relational triples from unstructured text is
crucial for large-scale knowledge graph
construction. However, few existing works excel in solving
the overlapping triple problem where multiple relational
triples in the same sentence share the same entities. In this
work, we introduce a fresh perspective to revisit the
relational triple extraction task and propose a novel cascade
binary tagging framework (CasRel) derived from a principled
problem formulation. Instead of treating relations as
discrete labels as in previous works, our new framework
models relations as functions that map subjects to objects in
a sentence, which naturally handles the overlapping
problem. Experiments show that the CasRel framework already
outperforms state-of-the-art methods even when its encoder
module uses a randomly initialized BERT encoder, showing the
power of the new tagging framework. It enjoys further
performance boost when employing a pre-trained BERT encoder,
outperforming the strongest baseline by 17.5 and 30.2
absolute gain in F1-score on two public datasets NYT and
WebNLG, respectively. In-depth analysis on different
scenarios of overlapping triples shows that the method
delivers consistent performance gain across all these
scenarios. The source code and data are released online.",
archivePrefix= "arXiv",
eprint = "1909.03227",
primaryClass = "cs.CL"
}
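
A toy rendering of the span decoding in wei-2019: binary start/end taggers mark subjects, pairing each predicted start with the nearest following end; per-relation object taggers (omitted here) are then run conditioned on each subject, which is how overlapping triples that share a subject fall out. The threshold and score arrays are illustrative.

import numpy as np

def spans_from_binary(starts, ends, thresh=0.5):
    """Pair each start above threshold with the nearest following end."""
    s_idx = np.where(starts >= thresh)[0]
    e_idx = np.where(ends >= thresh)[0]
    spans = []
    for s in s_idx:
        after = e_idx[e_idx >= s]
        if after.size:
            spans.append((int(s), int(after[0])))
    return spans

# toy tagger outputs over an 8-token sentence
subj_start = np.array([0.9, 0, 0, 0, 0.8, 0, 0, 0])
subj_end   = np.array([0, 0.9, 0, 0, 0, 0.7, 0, 0])
print(spans_from_binary(subj_start, subj_end))  # [(0, 1), (4, 5)]
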
@inproceedings{ma-2020-simple-lexicon,
title = "Simplify the Usage of Lexicon in {C}hinese {NER}",
author = "Ma, Ruotian and Peng, Minlong and Zhang, Qi and Wei, Zhongyu
and Huang, Xuanjing",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2020,
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/2020.acl-main.528",
doi = "10.18653/v1/2020.acl-main.528",
pages = "5951-5960",
abstract = "Recently, many works have tried to augment the performance of
Chinese named entity recognition (NER) using word
lexicons. As a representative, Lattice-LSTM has achieved new
benchmark results on several public Chinese NER
datasets. However, Lattice-LSTM has a complex model
architecture. This limits its application in many industrial
areas where real-time NER responses are needed."
}