references.bib
@inproceedings{turian-2010-word-repres,
title = "Word representations: a simple and general method for
semi-supervised learning",
author = "Turian, Joseph and Ratinov, Lev and Bengio, Yoshua",
booktitle = "Proceedings of the 48th annual meeting of the association for
computational linguistics",
abstract = "If we take an existing supervised NLP sys- tem, a simple and
general way to improve accuracy is to use unsupervised word
representations as extra word features. We evaluate Brown
clusters, Collobert and Weston (2008) embeddings, and HLBL
(Mnih \& Hinton, 2009) embeddings of words on both NER and
chunking. We use near state-of-the-art supervised baselines,
and find that each of the three word representations improves
the accu- racy of these baselines. We find further
improvements by combining diðerent word representations. You
can download our word features, for oð-the-shelf use in
existing NLP systems, as well as our code, here:
\url{http://metaoptimize.com/projects/wordreprs}",
pages = "384-394",
year = 2010,
organization = "Association for Computational Linguistics"
}
@inproceedings{levy-2014-depend-based,
title = "Dependency-Based Word Embeddings.",
author = "Levy, Omer and Goldberg, Yoav",
booktitle = "ACL (2)",
pages = "302-308",
year = 2014
}
@article{bengio-2008-neural-net,
title = "Neural net language models",
author = "Bengio, Yoshua",
journal = "Scholarpedia",
volume = 3,
number = 1,
pages = 3881,
year = 2008
}
@phdthesis{sahlgren-2006-the-word-space-model,
title = {The Word-Space Model: Using distributional analysis to
represent syntagmatic and paradigmatic relations between words
in high-dimensional vector spaces},
author = {Sahlgren, Magnus},
year = {2006},
school = {Stockholm University}
}
@inproceedings{pereira-1993-dist-cluster,
title = "Distributional clustering of English words",
author = "Pereira, Fernando and Tishby, Naftali and Lee, Lillian",
booktitle = "Proceedings of the 31st annual meeting on Association for
Computational Linguistics",
pages = "183-190",
year = 1993,
organization = "Association for Computational Linguistics"
}
@article{freund-1999-short,
title = "A short introduction to boosting",
author = "Freund, Yoav and Schapire, Robert and Abe, N",
journal = "Journal-Japanese Society For Artificial Intelligence",
volume = 14,
number = "771-780",
pages = 1612,
year = 1999,
publisher = "JAPANESE SOC ARTIFICIAL INTELL"
}
@inproceedings{caruana-2006-empirical-compari,
title = "An empirical comparison of supervised learning algorithms",
author = "Caruana, Rich and Niculescu-Mizil, Alexandru",
booktitle = "Proceedings of the 23rd international conference on Machine
learning",
pages = "161-168",
year = 2006,
organization = "ACM"
}
@article{natekin-2013-gradient-boosting,
title = "Gradient boosting machines, a tutorial",
author = "Natekin, Alexey and Knoll, Alois",
journal = "Frontiers in neurorobotics",
volume = 7,
year = 2013,
publisher = "Frontiers Media SA",
url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3885826/"
}
@article{loh-2011-classification-regression,
title = "Classification and regression trees",
author = "Loh, Wei-Yin",
journal = "Wiley Interdisciplinary Reviews: Data Mining and Knowledge
Discovery",
volume = 1,
number = 1,
pages = "14-23",
year = 2011,
publisher = "Wiley Online Library"
}
@inproceedings{chen-2015-higgs-boson,
title = "Higgs boson discovery with boosted trees",
author = "Chen, Tianqi and He, Tong",
booktitle = "Cowan et al., editor, JMLR: Workshop and Conference
Proceedings",
number = 42,
pages = "69-80",
year = 2015
}
@inproceedings{gutmann-2010-noise-contra-estima,
title = "Noise-contrastive estimation: A new estimation principle for
unnormalized statistical models",
author = "Gutmann, Michael and Hyv{\"a}rinen, Aapo",
booktitle = "AISTATS",
volume = 1,
number = 2,
pages = 6,
year = 2010
}
@phdthesis{sutskever-2013-training-recurrent,
title = "Training recurrent neural networks",
author = "Sutskever, Ilya",
year = 2013,
school = "University of Toronto"
}
@inproceedings{szegedy-2015-going-deeper,
title = "Going deeper with convolutions",
author = "Szegedy, Christian and Liu, Wei and Jia, Yangqing and
Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and
Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew",
booktitle = "Proceedings of the IEEE Conference on Computer Vision and
Pattern Recognition",
pages = "1-9",
year = 2015
}
@inproceedings{krizhevsky-2012-imagenet-classification,
title = "Imagenet classification with deep convolutional neural
networks",
author = "Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E",
booktitle = "Advances in neural information processing systems",
pages = "1097-1105",
year = 2012
}
@article{he-2009-learning-from-imbalanced,
title = "Learning from imbalanced data",
author = "He, Haibo and Garcia, Edwardo A",
journal = "IEEE Transactions on knowledge and data engineering",
volume = 21,
number = 9,
pages = "1263-1284",
year = 2009,
publisher = "IEEE"
}
@inproceedings{zaremba-2015-empir-explor,
author = {Jozefowicz, Rafal and Zaremba, Wojciech and Sutskever, Ilya},
title = {An Empirical Exploration of Recurrent Network Architectures},
booktitle = {Proceedings of the 32nd International Conference on Machine Learning},
year = {2015}
}
@article{friedman-2001-greedy-func-approx,
title = "Greedy function approximation: a gradient boosting machine",
author = "Friedman, Jerome H",
journal = "Annals of statistics",
pages = "1189-1232",
year = 2001,
publisher = "JSTOR"
}
@article{friedman-2002-stochastic-gradient-boost,
title = "Stochastic gradient boosting",
author = "Friedman, Jerome H",
journal = "Computational Statistics \& Data Analysis",
volume = 38,
number = 4,
pages = "367-378",
year = 2002,
publisher = "Elsevier"
}
@article{friedman-2000-additive-logistic-regression,
title = "Additive logistic regression: a statistical view of boosting
(with discussion and a rejoinder by the authors)",
author = "Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert
and others",
journal = "The annals of statistics",
volume = 28,
number = 2,
pages = "337-407",
year = 2000,
publisher = "Institute of Mathematical Statistics"
}
@inproceedings{greenwald-2001-space-efficient-online,
title = "Space-efficient online computation of quantile summaries",
author = "Greenwald, Michael and Khanna, Sanjeev",
booktitle = "ACM SIGMOD Record",
volume = 30,
number = 2,
pages = "58-66",
year = 2001,
organization = "ACM"
}
@inproceedings{zhang-2007-fast-algorithm,
title = "A fast algorithm for approximate quantiles in high speed data
streams",
author = "Zhang, Qi and Wang, Wei",
booktitle = "Scientific and Statistical Database Management,
2007. SSDBM'07. 19th International Conference on",
pages = "29-29",
year = 2007,
organization = "IEEE"
}
@incollection{greenwald-2016-quant-equid,
author = "Greenwald, Michael B and Khanna, Sanjeev",
title = "Quantiles and Equidepth Histograms Over Streams",
booktitle = "Data Stream Management: Processing High-Speed Data Streams",
publisher = "Springer",
year = 2016
}
@ARTICLE{goldberg-2014-explain,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1402.3722G",
archivePrefix= "arXiv",
author = "{Goldberg}, Y. and {Levy}, O.",
eprint = "1402.3722",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.CL",
title = "{word2vec Explained: Deriving Mikolov Et Al.'s
Negative-Sampling Word-Embedding method}",
year = 2014
}
@ARTICLE{turney-2010-from-frequen-to-meanin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2010arXiv1003.1141T",
archivePrefix= "arXiv",
author = "{Turney}, P.~D. and {Pantel}, P.",
eprint = "1003.1141",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Information Retrieval, Computer Science - Learning, H.3.1, I.2.6, I.2.7",
month = mar,
primaryClass = "cs.CL",
title = "{From Frequency To Meaning: Vector Space Models of
Semantics}",
year = 2010
}
@ARTICLE{zaremba-2014-recur-neural-networ-regul,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1409.2329Z",
archivePrefix= "arXiv",
author = "{Zaremba}, W. and {Sutskever}, I. and {Vinyals}, O.",
eprint = "1409.2329",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing",
month = sep,
title = "{Recurrent Neural Network Regularization}",
year = 2014
}
@ARTICLE{cho-2014-encoder-decoder,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1406.1078C",
archivePrefix= "arXiv",
author = "{Cho}, K. and {van Merrienboer}, B. and {Gulcehre}, C. and
{Bahdanau}, D. and {Bougares}, F. and {Schwenk}, H. and
{Bengio}, Y.",
eprint = "1406.1078",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing, Statistics
- Machine Learning",
month = jun,
primaryClass = "cs.CL",
title = "{Learning Phrase Representations Using Rnn Encoder-Decoder
for Statistical Machine Translation}",
year = 2014
}
@ARTICLE{sutskever-2014-seq2seq,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1409.3215S",
archivePrefix= "arXiv",
author = "{Sutskever}, I. and {Vinyals}, O. and {Le}, Q.~V.",
eprint = "1409.3215",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = sep,
primaryClass = "cs.CL",
title = "{Sequence To Sequence Learning With Neural Networks}",
year = 2014
}
@ARTICLE{bengio-2015-schedule-sampling,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150603099B",
archivePrefix= "arXiv",
author = "{Bengio}, S. and {Vinyals}, O. and {Jaitly}, N. and
{Shazeer}, N.",
eprint = "1506.03099",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Computer Science - Computer Vision and Pattern
Recognition",
month = jun,
primaryClass = "cs.LG",
title = "{Scheduled Sampling for Sequence Prediction With Recurrent
Neural Networks}",
year = 2015
}
@ARTICLE{jean-2014-using-very,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.2007J",
archivePrefix= "arXiv",
author = "{Jean}, S. and {Cho}, K. and {Memisevic}, R. and {Bengio},
Y.",
eprint = "1412.2007",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = dec,
primaryClass = "cs.CL",
title = "{On Using Very Large Target Vocabulary for Neural Machine
Translation}",
year = 2014
}
@ARTICLE{chen-2016-xgboos,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160302754C",
archivePrefix= "arXiv",
author = "{Chen}, T. and {Guestrin}, C.",
eprint = "1603.02754",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = mar,
primaryClass = "cs.LG",
title = "{XGBoost: A Scalable Tree Boosting System}",
year = 2016
}
@ARTICLE{kawaguchi-2016-deep-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160507110K",
archivePrefix= "arXiv",
author = "{Kawaguchi}, K.",
eprint = "1605.07110",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning,
Mathematics - Optimization and Control",
month = may,
primaryClass = "stat.ML",
title = "{Deep Learning Without Poor Local Minima}",
year = 2016
}
@ARTICLE{ruder-2016-overv-gradien,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160904747R",
archivePrefix= "arXiv",
author = "{Ruder}, S.",
eprint = "1609.04747",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = sep,
primaryClass = "cs.LG",
title = "{An Overview of Gradient Descent Optimization algorithms}",
year = 2016
}
@ARTICLE{zeiler-2012-adadel,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1212.5701Z",
archivePrefix= "arXiv",
author = "{Zeiler}, M.~D.",
eprint = "1212.5701",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = dec,
primaryClass = "cs.LG",
title = "{ADADELTA: An Adaptive Learning Rate Method}",
year = 2012
}
@ARTICLE{bengio-2012-advan-optim-recur-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1212.0901B",
archivePrefix= "arXiv",
author = "{Bengio}, Y. and {Boulanger-Lewandowski}, N. and {Pascanu},
R.",
eprint = "1212.0901",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = dec,
primaryClass = "cs.LG",
title = "{Advances in Optimizing Recurrent Networks}",
year = 2012
}
@ARTICLE{he-2015-deep-resid,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151203385H",
archivePrefix= "arXiv",
author = "{He}, K. and {Zhang}, X. and {Ren}, S. and {Sun}, J.",
eprint = "1512.03385",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = dec,
primaryClass = "cs.CV",
title = "{Deep Residual Learning for Image Recognition}",
year = 2015
}
@ARTICLE{simonyan-2014-very-deep,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1409.1556S",
archivePrefix= "arXiv",
author = "{Simonyan}, K. and {Zisserman}, A.",
eprint = "1409.1556",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = sep,
primaryClass = "cs.CV",
title = "{Very Deep Convolutional Networks for Large-Scale Image
Recognition}",
year = 2014
}
@ARTICLE{lin-2013-networ-in-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.4400L",
archivePrefix= "arXiv",
author = "{Lin}, M. and {Chen}, Q. and {Yan}, S.",
eprint = "1312.4400",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning",
month = dec,
title = "{Network In Network}",
year = 2013
}
@ARTICLE{montufar-2014-number-linear,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1402.1869M",
archivePrefix= "arXiv",
author = "{Mont{\'u}far}, G. and {Pascanu}, R. and {Cho}, K. and
{Bengio}, Y.",
eprint = "1402.1869",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning,
Computer Science - Neural and Evolutionary Computing",
month = feb,
primaryClass = "stat.ML",
title = "{On the Number of Linear Regions of Deep Neural Networks}",
year = 2014
}
@ARTICLE{luxburg-2010-clust-stabil,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2010arXiv1007.1075V",
archivePrefix= "arXiv",
author = "{von Luxburg}, U.",
eprint = "1007.1075",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning",
month = jul,
primaryClass = "stat.ML",
title = "{Clustering Stability: An Overview}",
year = 2010
}
@ARTICLE{shah-2014-bayes-regres-bitcoin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.1231S",
archivePrefix= "arXiv",
author = "{Shah}, D. and {Zhang}, K.",
eprint = "1410.1231",
journal = "ArXiv e-prints",
keywords = "Computer Science - Artificial Intelligence, Mathematics -
Statistics Theory",
month = oct,
primaryClass = "cs.AI",
title = "{Bayesian Regression and Bitcoin}",
year = 2014
}
@article{domingos-2012-few-useful-things,
title = "A few useful things to know about machine learning",
author = "Domingos, Pedro",
journal = "Communications of the ACM",
volume = 55,
number = 10,
pages = "78-87",
year = 2012,
publisher = "ACM"
}
@ARTICLE{thakur-2015-autoc,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150702188T",
archivePrefix= "arXiv",
author = "{Thakur}, A. and {Krohn-Grimberghe}, A.",
eprint = "1507.02188",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = jul,
primaryClass = "stat.ML",
title = "{AutoCompete: A Framework for Machine Learning Competition}",
year = 2015
}
@article{huang-1998-k-modes,
title = "Extensions to the k-means algorithm for clustering large data
sets with categorical values",
author = "Huang, Zhexue",
journal = "Data mining and knowledge discovery",
volume = 2,
number = 3,
pages = "283-304",
year = 1998,
publisher = "Springer"
}
@inproceedings{he-2006-approximation-algorithms,
title = "Approximation algorithms for k-modes clustering",
author = "He, Zengyou and Deng, Shengchun and Xu, Xiaofei",
booktitle = "International Conference on Intelligent Computing",
pages = "296-302",
year = 2006,
organization = "Springer"
}
@inproceedings{plant-2011-inconco-interp-cluster,
title = "Inconco: interpretable clustering of numerical and
categorical objects",
author = "Plant, Claudia and B{\"o}hm, Christian",
booktitle = "Proceedings of the 17th ACM SIGKDD international conference
on Knowledge discovery and data mining",
pages = "1127-1135",
year = 2011,
organization = "ACM"
}
@article{kim-2004-fuzzy-cluster,
title = "Fuzzy clustering of categorical data using fuzzy centroids",
author = "Kim, Dae-Won and Lee, Kwang H and Lee, Doheon",
journal = "Pattern Recognition Letters",
volume = 25,
number = 11,
pages = "1263-1271",
year = 2004,
publisher = "Elsevier"
}
@article{guha-2000-rock-robust-cluster,
title = "ROCK: A robust clustering algorithm for categorical
attributes",
author = "Guha, Sudipto and Rastogi, Rajeev and Shim, Kyuseok",
journal = "Information systems",
volume = 25,
number = 5,
pages = "345-366",
year = 2000,
publisher = "Elsevier"
}
@inproceedings{louppe-2013-understanding-variable-import,
title = "Understanding variable importances in forests of randomized
trees",
author = "Louppe, Gilles and Wehenkel, Louis and Sutera, Antonio and
Geurts, Pierre",
booktitle = "Advances in neural information processing systems",
pages = "431-439",
year = 2013
}
@article{gelman-2008-scaling-regress-inputs,
title = "Scaling regression inputs by dividing by two standard
deviations",
author = "Gelman, Andrew",
journal = "Statistics in medicine",
volume = 27,
number = 15,
pages = "2865-2873",
year = 2008,
publisher = "Wiley Online Library"
}
@article{reshef-2011-detecting-novel-assoc,
title = "Detecting novel associations in large data sets",
author = "Reshef, David N and Reshef, Yakir A and Finucane, Hilary K
and Grossman, Sharon R and McVean, Gilean and Turnbaugh,
Peter J and Lander, Eric S and Mitzenmacher, Michael and
Sabeti, Pardis C",
journal = "science",
volume = 334,
number = 6062,
pages = "1518-1524",
year = 2011,
publisher = "American Association for the Advancement of Science"
}
@article{cawley-2010-over-fitting,
title = "On over-fitting in model selection and subsequent selection
bias in performance evaluation",
author = "Cawley, Gavin C and Talbot, Nicola LC",
journal = "Journal of Machine Learning Research",
volume = 11,
number = "Jul",
pages = "2079-2107",
year = 2010
}
@article{varma-2006-bias-error-estim,
title = "Bias in error estimation when using cross-validation for
model selection",
author = "Varma, Sudhir and Simon, Richard",
journal = "BMC bioinformatics",
volume = 7,
number = 1,
pages = 91,
year = 2006,
publisher = "BioMed Central"
}
@ARTICLE{heaton-2016-deep-learn-finan,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160206561H",
archivePrefix= "arXiv",
author = "{Heaton}, J.~B. and {Polson}, N.~G. and {Witte}, J.~H.",
eprint = "1602.06561",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = feb,
primaryClass = "cs.LG",
title = "{Deep Learning in Finance}",
year = 2016
}
@ARTICLE{sirignano-2016-deep-learn-mortg-risk,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160702470S",
archivePrefix= "arXiv",
author = "{Sirignano}, J. and {Sadhwani}, A. and {Giesecke}, K.",
eprint = "1607.02470",
journal = "ArXiv e-prints",
keywords = "Quantitative Finance - Statistical Finance",
month = jul,
primaryClass = "q-fin.ST",
title = "{Deep Learning for Mortgage Risk}",
year = 2016
}
@article{heaton-2016-deep-learning-finance,
title = "Deep learning for finance: deep portfolios",
author = "Heaton, JB and Polson, NG and Witte, Jan Hendrik",
journal = "Applied Stochastic Models in Business and Industry",
year = 2016,
publisher = "Wiley Online Library"
}
@ARTICLE{dixon-2016-class-based,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160308604D",
archivePrefix= "arXiv",
author = "{Dixon}, M. and {Klabjan}, D. and {Bang}, J.~H.",
eprint = "1603.08604",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computational
Engineering, Finance, and Science",
month = mar,
primaryClass = "cs.LG",
title = "{Classiffication-Based Financial Markets Prediction Using
Deep Neural Networks}",
year = 2016
}
@article{langkvist-2014-review-unsuper-feature,
title = "A review of unsupervised feature learning and deep learning
for time-series modeling",
author = "L{\"a}ngkvist, Martin and Karlsson, Lars and Loutfi, Amy",
journal = "Pattern Recognition Letters",
volume = 42,
pages = "11-24",
year = 2014,
publisher = "Elsevier"
}
@article{qiu-2016-predicting-direction,
title = "Predicting the Direction of Stock Market Index Movement Using
an Optimized Artificial Neural Network Model",
author = "Qiu, Mingyue and Song, Yu",
journal = "PLoS One",
volume = 11,
number = 5,
year = 2016,
publisher = "Public Library of Science"
}
@inproceedings{yang-2016-ensemble-model-stock,
title = "Ensemble Model for Stock Price Movement Trend Prediction on
Different Investing Periods",
author = "Yang, Jian and Rao, Ruonan and Hong, Pei and Ding, Peng",
booktitle = "2016 12th International Conference on Computational
Intelligence and Security (CIS)",
pages = "358-361",
year = 2016,
organization = "IEEE"
}
@article{lecun-2015-deep-learning,
title = "Deep learning",
author = "LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey",
journal = "Nature",
volume = 521,
number = 7553,
pages = "436-444",
year = 2015,
publisher = "Nature Research"
}
@ARTICLE{bollen-2010-twitt-mood,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2010arXiv1010.3003B",
archivePrefix= "arXiv",
author = "{Bollen}, J. and {Mao}, H. and {Zeng}, X.-J.",
eprint = "1010.3003",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computational Engineering, Finance, and
Science, Computer Science - Computation and Language,
Computer Science - Social and Information Networks, Physics -
Physics and Society",
month = oct,
primaryClass = "cs.CE",
title = "{Twitter Mood Predicts the Stock market}",
year = 2010
}
@ARTICLE{goerg-2012-forec-compon-analy-forec,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1205.4591G",
archivePrefix= "arXiv",
author = "{Goerg}, G.~M.",
eprint = "1205.4591",
journal = "ArXiv e-prints",
keywords = "Statistics - Methodology, Statistics - Machine Learning",
month = may,
primaryClass = "stat.ME",
title = "{Forecastable Component Analysis (ForeCA)}",
year = 2012
}
@ARTICLE{fehrer-2015-improv-decis,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150801993F",
archivePrefix= "arXiv",
author = "{Fehrer}, R. and {Feuerriegel}, S.",
eprint = "1508.01993",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Computation
and Language, Computer Science - Learning",
month = aug,
primaryClass = "stat.ML",
title = "{Improving Decision Analytics With Deep Learning: The Case of
Financial Disclosures}",
year = 2015
}
@article{kaastra-1996-designing-neural-net,
title = "Designing a neural network for forecasting financial and
economic time series",
author = "Kaastra, Iebeling and Boyd, Milton",
journal = "Neurocomputing",
volume = 10,
number = 3,
pages = "215-236",
year = 1996,
publisher = "Elsevier"
}
@article{ahmed-2010-empirical-comparison,
title = "An empirical comparison of machine learning models for time
series forecasting",
author = "Ahmed, Nesreen K and Atiya, Amir F and Gayar, Neamat El and
El-Shishiny, Hisham",
journal = "Econometric Reviews",
volume = 29,
number = "5-6",
pages = "594-621",
year = 2010,
publisher = "Taylor \& Francis"
}
@article{dubovikov-2004-dimension-minimal-cover,
title = "Dimension of the minimal cover and fractal analysis of time
series",
author = "Dubovikov, MM and Starchenko, NV and Dubovikov, MS",
journal = "Physica A: Statistical Mechanics and its Applications",
volume = 339,
number = 3,
pages = "591-608",
year = 2004,
publisher = "Elsevier"
}
@inproceedings{dalto-2015-deep-neural-net,
title = "Deep neural networks for ultra-short-term wind forecasting",
author = "Dalto, Mladen and Matu{\v{s}}ko, Jadranko and Va{\v{s}}ak,
Mario",
booktitle = "Industrial Technology (ICIT), 2015 IEEE International
Conference on",
pages = "1657-1663",
year = 2015,
organization = "IEEE"
}
@inproceedings{goodfellow-2014-gan,
title = "Generative adversarial nets",
author = "Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and
Xu, Bing and Warde-Farley, David and Ozair, Sherjil and
Courville, Aaron and Bengio, Yoshua",
booktitle = "Advances in neural information processing systems",
pages = "2672-2680",
year = 2014
}
@ARTICLE{goodfellow-2014-explain-harnes-adver-examp,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.6572G",
archivePrefix= "arXiv",
author = "{Goodfellow}, I.~J. and {Shlens}, J. and {Szegedy}, C.",
eprint = "1412.6572",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = dec,
primaryClass = "stat.ML",
title = "{Explaining and Harnessing Adversarial Examples}",
year = 2014
}
@inproceedings{denton-2015-deep-generative-image,
title = "Deep Generative Image Models using a Laplacian Pyramid of
Adversarial Networks",
author = "Denton, Emily L and Chintala, Soumith and Fergus, Rob and
others",
booktitle = "Advances in neural information processing systems",
pages = "1486-1494",
year = 2015
}
@ARTICLE{radford-2015-dcgan,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106434R",
archivePrefix= "arXiv",
author = "{Radford}, A. and {Metz}, L. and {Chintala}, S.",
eprint = "1511.06434",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computer
Vision and Pattern Recognition",
month = nov,
primaryClass = "cs.LG",
title = "{Unsupervised Representation Learning With Deep Convolutional
Generative Adversarial Networks}",
year = 2015
}
@inproceedings{dosovitskiy-2015-learning-to-generate,
title = "Learning to generate chairs with convolutional neural
networks",
author = "Dosovitskiy, Alexey and Tobias Springenberg, Jost and Brox,
Thomas",
booktitle = "Proceedings of the IEEE Conference on Computer Vision and
Pattern Recognition",
pages = "1538-1546",
year = 2015
}
@ARTICLE{burda-2015-impor-weigh-autoen,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150900519B",
archivePrefix= "arXiv",
author = "{Burda}, Y. and {Grosse}, R. and {Salakhutdinov}, R.",
eprint = "1509.00519",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Statistics - Machine Learning",
month = sep,
primaryClass = "cs.LG",
title = "{Importance Weighted Autoencoders}",
year = 2015
}
@ARTICLE{ganin-2014-unsup-domain,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1409.7495G",
archivePrefix= "arXiv",
author = "{Ganin}, Y. and {Lempitsky}, V.",
eprint = "1409.7495",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning,
Computer Science - Neural and Evolutionary Computing",
month = sep,
primaryClass = "stat.ML",
title = "{Unsupervised Domain Adaptation By Backpropagation}",
year = 2014
}
@ARTICLE{makhzani-2015-adver-autoen,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151105644M",
archivePrefix= "arXiv",
author = "{Makhzani}, A. and {Shlens}, J. and {Jaitly}, N. and
{Goodfellow}, I. and {Frey}, B.",
eprint = "1511.05644",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = nov,
primaryClass = "cs.LG",
title = "{Adversarial Autoencoders}",
year = 2015
}
@ARTICLE{szegedy-2013-intrig-proper-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.6199S",
archivePrefix= "arXiv",
author = "{Szegedy}, C. and {Zaremba}, W. and {Sutskever}, I. and
{Bruna}, J. and {Erhan}, D. and {Goodfellow}, I. and
{Fergus}, R.",
eprint = "1312.6199",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = dec,
primaryClass = "cs.CV",
title = "{Intriguing Properties of Neural networks}",
year = 2013
}
@ARTICLE{kurakin-2016-adver-examp-physic,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160702533K",
archivePrefix= "arXiv",
author = "{Kurakin}, A. and {Goodfellow}, I. and {Bengio}, S.",
eprint = "1607.02533",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Cryptography and Security, Computer
Science - Learning, Statistics - Machine Learning",
month = jul,
primaryClass = "cs.CV",
title = "{Adversarial Examples in the Physical world}",
year = 2016
}
@ARTICLE{mirza-2014-condit-gener-adver-nets,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1411.1784M",
archivePrefix= "arXiv",
author = "{Mirza}, M. and {Osindero}, S.",
eprint = "1411.1784",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Artificial
Intelligence, Computer Science - Computer Vision and Pattern
Recognition, Statistics - Machine Learning",
month = nov,
primaryClass = "cs.LG",
title = "{Conditional Generative Adversarial Nets}",
year = 2014
}
@ARTICLE{goodfellow-2017-nips-tutor,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170100160G",
archivePrefix= "arXiv",
author = "{Goodfellow}, I.",
eprint = "1701.00160",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = dec,
primaryClass = "cs.LG",
title = "{NIPS 2016 Tutorial: Generative Adversarial Networks}",
year = 2017
}
@ARTICLE{arjovsky-2017-wasser-gan,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170107875A",
archivePrefix= "arXiv",
author = "{Arjovsky}, M. and {Chintala}, S. and {Bottou}, L.",
eprint = "1701.07875",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = jan,
primaryClass = "stat.ML",
title = "{Wasserstein GAN}",
year = 2017
}
@inproceedings{ng-2000-algorithms-inverse,
title = "Algorithms for Inverse Reinforcement Learning",
author = "Ng, Andrew Y and Russell, Stuart",
booktitle = "in Proc. 17th International Conf. on Machine Learning",
year = 2000
}
@ARTICLE{mnih-2013-playin-atari,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.5602M",
archivePrefix= "arXiv",
author = "{Mnih}, V. and {Kavukcuoglu}, K. and {Silver}, D. and
{Graves}, A. and {Antonoglou}, I. and {Wierstra}, D. and
{Riedmiller}, M.",
eprint = "1312.5602",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = dec,
primaryClass = "cs.LG",
title = "{Playing Atari With Deep Reinforcement Learning}",
year = 2013
}
@ARTICLE{heaton-2016-deep-portf-theor,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160507230H",
archivePrefix= "arXiv",
author = "{Heaton}, J.~B. and {Polson}, N.~G. and {Witte}, J.~H.",
eprint = "1605.07230",
journal = "ArXiv e-prints",
keywords = "Quantitative Finance - Portfolio Management, Computer Science
- Learning",
month = may,
primaryClass = "q-fin.PM",
title = "{Deep Portfolio Theory}",
year = 2016
}
@ARTICLE{karpathy-2015-visual-under-recur-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150602078K",
archivePrefix= "arXiv",
author = "{Karpathy}, A. and {Johnson}, J. and {Fei-Fei}, L.",
eprint = "1506.02078",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Computer Science - Neural and Evolutionary
Computing",
month = jun,
primaryClass = "cs.LG",
title = "{Visualizing and Understanding Recurrent Networks}",
year = 2015
}
@ARTICLE{graves-2014-neural-turin-machin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.5401G",
archivePrefix= "arXiv",
author = "{Graves}, A. and {Wayne}, G. and {Danihelka}, I.",
eprint = "1410.5401",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing",
month = oct,
title = "{Neural Turing Machines}",
year = 2014
}
@ARTICLE{bahdanau-2014-bahdanau-attention,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1409.0473B",
archivePrefix= "arXiv",
author = "{Bahdanau}, D. and {Cho}, K. and {Bengio}, Y.",
eprint = "1409.0473",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing, Statistics
- Machine Learning",
month = sep,
primaryClass = "cs.CL",
title = "{Neural Machine Translation By Jointly Learning To Align and
Translate}",
year = 2014
}
@ARTICLE{vinyals-2014-show-tell,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1411.4555V",
archivePrefix= "arXiv",
author = "{Vinyals}, O. and {Toshev}, A. and {Bengio}, S. and {Erhan},
D.",
eprint = "1411.4555",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = nov,
primaryClass = "cs.CV",
title = "{Show and Tell: A Neural Image Caption Generator}",
year = 2014
}
@inproceedings{glorot-2010-under-diff,
title = "Understanding the difficulty of training deep feedforward
neural networks",
author = "Glorot, Xavier and Bengio, Yoshua",
booktitle = "Aistats",
volume = 9,
pages = "249-256",
year = 2010
}
@article{lecun-1998-gradient-based,
title = "Gradient-based learning applied to document recognition",
author = "LeCun, Yann and Bottou, L{\'e}on and Bengio, Yoshua and
Haffner, Patrick",
journal = "Proceedings of the IEEE",
volume = 86,
number = 11,
pages = "2278-2324",
year = 1998,
publisher = "IEEE"
}
@article{gosavi-2009-reinforcement-learning,
title = "Reinforcement learning: A tutorial survey and recent
advances",
author = "Gosavi, Abhijit",
journal = "INFORMS Journal on Computing",
volume = 21,
number = 2,
pages = "178-192",
year = 2009,
publisher = "INFORMS"
}
@ARTICLE{zeiler-2013-visual-under-convol-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1311.2901Z",
archivePrefix= "arXiv",
author = "{Zeiler}, M.~D and {Fergus}, R.",
eprint = "1311.2901",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = nov,
primaryClass = "cs.CV",
title = "{Visualizing and Understanding Convolutional Networks}",
year = 2013
}
@ARTICLE{krizhevsky-2014-one-weird,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1404.5997K",
archivePrefix= "arXiv",
author = "{Krizhevsky}, A.",
eprint = "1404.5997",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Distributed, Parallel, and Cluster
Computing, Computer Science - Learning",
month = apr,
title = "{One Weird Trick for Parallelizing Convolutional Neural
networks}",
year = 2014
}
@inproceedings{zeiler-2011-adaptive-deconv,
title = "Adaptive deconvolutional networks for mid and high level
feature learning",
author = "Zeiler, Matthew D and Taylor, Graham W and Fergus, Rob",
booktitle = "Computer Vision (ICCV), 2011 IEEE International Conference
on",
pages = "2018-2025",
year = 2011,
organization = "IEEE"
}
@ARTICLE{dumoulin-2016-guide-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160307285D",
archivePrefix= "arXiv",
author = "{Dumoulin}, V. and {Visin}, F.",
eprint = "1603.07285",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning,
Computer Science - Neural and Evolutionary Computing",
month = mar,
primaryClass = "stat.ML",
title = "{A Guide To Convolution Arithmetic for Deep learning}",
year = 2016
}
@article{beck-2009-fast-iter,
title = "A fast iterative shrinkage-thresholding algorithm for linear
inverse problems",
author = "Beck, Amir and Teboulle, Marc",
journal = "SIAM journal on imaging sciences",
volume = 2,
number = 1,
pages = "183-202",
year = 2009,
publisher = "SIAM"
}
@ARTICLE{redmon-2015-you-only-look-once,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150602640R",
archivePrefix= "arXiv",
author = "{Redmon}, J. and {Divvala}, S. and {Girshick}, R. and
{Farhadi}, A.",
eprint = "1506.02640",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = jun,
primaryClass = "cs.CV",
title = "{You Only Look Once: Unified, Real-Time Object Detection}",
year = 2015
}
@ARTICLE{rastegari-2016-xnor-net,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160305279R",
archivePrefix= "arXiv",
author = "{Rastegari}, M. and {Ordonez}, V. and {Redmon}, J. and
{Farhadi}, A.",
eprint = "1603.05279",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = mar,
primaryClass = "cs.CV",
title = "{XNOR-Net: Imagenet Classification Using Binary Convolutional
Neural Networks}",
year = 2016
}
@inproceedings{zeiler-2010-deconvolutional-net,
title = "Deconvolutional networks",
author = "Zeiler, Matthew D and Krishnan, Dilip and Taylor, Graham W
and Fergus, Rob",
booktitle = "Computer Vision and Pattern Recognition (CVPR), 2010 IEEE
Conference on",
pages = "2528-2535",
year = 2010,
organization = "IEEE"
}
@ARTICLE{mikolov-2013-effic-estim,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1301.3781M",
archivePrefix= "arXiv",
author = "{Mikolov}, T. and {Chen}, K. and {Corrado}, G. and {Dean},
J.",
eprint = "1301.3781",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jan,
primaryClass = "cs.CL",
title = "{Efficient Estimation of Word Representations in Vector
Space}",
year = 2013
}
@ARTICLE{greff-2015-lstm-search,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150304069G",
archivePrefix= "arXiv",
author = "{Greff}, K. and {Srivastava}, R.~K. and {Koutn{\'{\i}}k},
J. and {Steunebrink}, B.~R. and {Schmidhuber}, J.",
eprint = "1503.04069",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Learning, 68T10, I.2.6, I.2.7, I.5.1,
H.5.5",
month = mar,
title = "{LSTM: A Search Space Odyssey}",
year = 2015
}
@ARTICLE{mnih-2016-async-method,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160201783M",
archivePrefix= "arXiv",
author = "{Mnih}, V. and {Puigdom{\`e}nech Badia}, A. and {Mirza},
M. and {Graves}, A. and {Lillicrap}, T.~P. and {Harley},
T. and {Silver}, D. and {Kavukcuoglu}, K.",
eprint = "1602.01783",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = feb,
primaryClass = "cs.LG",
title = "{Asynchronous Methods for Deep Reinforcement Learning}",
year = 2016
}
@article{bengio-2003-neural-prob,
title = "A neural probabilistic language model",
author = "Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal
and Jauvin, Christian",
journal = "Journal of machine learning research",
volume = 3,
number = "Feb",
pages = "1137-1155",
year = 2003
}
@mastersthesis{mikolov-2007-language-model,
title = "Language Modeling for Speech Recognition in Czech",
author = "Mikolov, Tom{\'a}{\v{s}}",
year = 2007,
school = "Brno University of Technology"
}
@inproceedings{mikolov-2013-distributed-repre,
title = "Distributed representations of words and phrases and their
compositionality",
author = "Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado,
Greg S and Dean, Jeff",
booktitle = "Advances in neural information processing systems",
pages = "3111-3119",
year = 2013
}
@ARTICLE{luong-2015-luong-attention,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150804025L",
archivePrefix= "arXiv",
author = "{Luong}, M.-T. and {Pham}, H. and {Manning}, C.~D.",
eprint = "1508.04025",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL",
title = "{Effective Approaches To Attention-Based Neural Machine
Translation}",
year = 2015
}
@ARTICLE{chorowski-2015-atten-based,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150607503C",
archivePrefix= "arXiv",
author = "{Chorowski}, J. and {Bahdanau}, D. and {Serdyuk}, D. and
{Cho}, K. and {Bengio}, Y.",
eprint = "1506.07503",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing, Statistics
- Machine Learning",
month = jun,
primaryClass = "cs.CL",
title = "{Attention-Based Models for Speech Recognition}",
year = 2015
}
@ARTICLE{mnih-2014-recur-model-visual-atten,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1406.6247M",
archivePrefix= "arXiv",
author = "{Mnih}, V. and {Heess}, N. and {Graves}, A. and
{Kavukcuoglu}, K.",
eprint = "1406.6247",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computer
Vision and Pattern Recognition, Statistics - Machine
Learning",
month = jun,
primaryClass = "cs.LG",
title = "{Recurrent Models of Visual Attention}",
year = 2014
}
@ARTICLE{ba-2014-multip-objec,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.7755B",
archivePrefix= "arXiv",
author = "{Ba}, J. and {Mnih}, V. and {Kavukcuoglu}, K.",
eprint = "1412.7755",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computer
Vision and Pattern Recognition, Computer Science - Neural and
Evolutionary Computing",
month = dec,
primaryClass = "cs.LG",
title = "{Multiple Object Recognition With Visual Attention}",
year = 2014
}
@ARTICLE{gregor-2015-draw-recur,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150204623G",
archivePrefix= "arXiv",
author = "{Gregor}, K. and {Danihelka}, I. and {Graves}, A. and
{Jimenez Rezende}, D. and {Wierstra}, D.",
eprint = "1502.04623",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = feb,
primaryClass = "cs.CV",
title = "{DRAW: A Recurrent Neural Network For Image Generation}",
year = 2015
}
@ARTICLE{xu-2015-show-atten-tell,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150203044X",
archivePrefix= "arXiv",
author = "{Xu}, K. and {Ba}, J. and {Kiros}, R. and {Cho}, K. and
{Courville}, A. and {Salakhutdinov}, R. and {Zemel}, R. and
{Bengio}, Y.",
eprint = "1502.03044",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computer
Vision and Pattern Recognition",
month = feb,
primaryClass = "cs.LG",
title = "{Show, Attend and Tell: Neural Image Caption Generation With
Visual Attention}",
year = 2015
}
@ARTICLE{zaremba-2014-learn-to-execute,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.4615Z",
archivePrefix= "arXiv",
author = "{Zaremba}, W. and {Sutskever}, I.",
eprint = "1410.4615",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Artificial Intelligence, Computer Science
- Learning",
month = oct,
title = "{Learning To Execute}",
year = 2014
}
@ARTICLE{vinyals-2014-gramm-as-foreig-languag,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.7449V",
archivePrefix= "arXiv",
author = "{Vinyals}, O. and {Kaiser}, L. and {Koo}, T. and {Petrov},
S. and {Sutskever}, I. and {Hinton}, G.",
eprint = "1412.7449",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Statistics - Machine Learning",
month = dec,
primaryClass = "cs.CL",
title = "{Grammar As a Foreign Language}",
year = 2014
}
@ARTICLE{hermann-2015-teach-machin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150603340H",
archivePrefix= "arXiv",
author = "{Hermann}, K.~M. and {Ko{\v c}isk{\'y}}, T. and
{Grefenstette}, E. and {Espeholt}, L. and {Kay}, W. and
{Suleyman}, M. and {Blunsom}, P.",
eprint = "1506.03340",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Neural and Evolutionary
Computing",
month = jun,
primaryClass = "cs.CL",
title = "{Teaching Machines To Read and Comprehend}",
year = 2015
}
@ARTICLE{sukhbaatar-2015-end-to-end,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150308895S",
archivePrefix= "arXiv",
author = "{Sukhbaatar}, S. and {Szlam}, A. and {Weston}, J. and
{Fergus}, R.",
eprint = "1503.08895",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Computation and Language",
month = mar,
title = "{End-To-End Memory Networks}",
year = 2015
}
@ARTICLE{zaremba-2015-reinf-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150500521Z",
archivePrefix= "arXiv",
author = "{Zaremba}, W. and {Sutskever}, I.",
eprint = "1505.00521",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = may,
primaryClass = "cs.LG",
title = "{Reinforcement Learning Neural Turing Machines - Revised}",
year = 2015
}
@ARTICLE{joulin-2016-bag-trick,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160701759J",
archivePrefix= "arXiv",
author = "{Joulin}, A. and {Grave}, E. and {Bojanowski}, P. and
{Mikolov}, T.",
eprint = "1607.01759",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jul,
primaryClass = "cs.CL",
title = "{Bag of Tricks for Efficient Text Classification}",
year = 2016
}
@ARTICLE{kim-2014-convol-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1408.5882K",
archivePrefix= "arXiv",
author = "{Kim}, Y.",
eprint = "1408.5882",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = aug,
primaryClass = "cs.CL",
title = "{Convolutional Neural Networks for Sentence Classification}",
year = 2014
}
@ARTICLE{graves-2013-gener-sequen,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1308.0850G",
archivePrefix= "arXiv",
author = "{Graves}, A.",
eprint = "1308.0850",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Computation and Language",
month = aug,
title = "{Generating Sequences With Recurrent Neural Networks}",
year = 2013
}
@inproceedings{yang-2016-hierarchical-attent,
title = "Hierarchical attention networks for document classification",
author = "Yang, Zichao and Yang, Diyi and Dyer, Chris and He, Xiaodong
and Smola, Alex and Hovy, Eduard",
booktitle = "Proceedings of NAACL-HLT",
pages = "1480-1489",
year = 2016
}
@inproceedings{lai-2015-recurrent-conv,
title = "Recurrent Convolutional Neural Networks for Text
Classification",
author = "Lai, Siwei and Xu, Liheng and Liu, Kang and Zhao, Jun",
booktitle = "AAAI",
volume = 333,
pages = "2267-2273",
year = 2015
}
@inproceedings{ding-2015-deep-learn,
title = "Deep Learning for Event-Driven Stock Prediction.",
author = "Ding, Xiao and Zhang, Yue and Liu, Ting and Duan, Junwen",
booktitle = "IJCAI",
pages = "2327-2333",
year = 2015
}
@inproceedings{socher-2013-reasoning-neural,
title = "Reasoning with neural tensor networks for knowledge base
completion",
author = "Socher, Richard and Chen, Danqi and Manning, Christopher D
and Ng, Andrew",
booktitle = "Advances in neural information processing systems",
pages = "926-934",
year = 2013
}
@inproceedings{angeli-2015-lever-ling-struct,
title = "Leveraging linguistic structure for open domain information
extraction",
author = "Angeli, Gabor and Premkumar, Melvin Johnson and Manning,
Christopher D",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics (ACL 2015)",
year = 2015
}
@inproceedings{benko-2007-open-info-extra,
author = "Banko, Michele and Cafarella, Michael J. and Soderland,
Stephen and Broadhead, Matt and Etzioni, Oren",
title = "Open Information Extraction from the Web",
booktitle = "Proceedings of the 20th International Joint Conference on
Artifical Intelligence",
series = "IJCAI'07",
year = 2007,
location = "Hyderabad, India",
pages = "2670-2676",
numpages = 7,
url = "http://dl.acm.org/citation.cfm?id=1625275.1625705",
acmid = 1625705,
publisher = "Morgan Kaufmann Publishers Inc.",
address = "San Francisco, CA, USA"
}
@inproceedings{si-2014-exploit-social,
title = "Exploiting Social Relations and Sentiment for Stock
Prediction",
author = "Si, Jianfeng and Mukherjee, Arjun and Liu, Bing and Pan,
Sinno Jialin and Li, Qing and Li, Huayi",
booktitle = "EMNLP",
volume = 14,
pages = "1139-1145",
year = 2014
}
@inproceedings{ding-2014-usings-struct-event,
title = "Using Structured Events to Predict Stock Price Movement: An
Empirical Investigation",
author = "Xiao Ding and Yue Zhang and Ting Liu and Junwen Duan",
booktitle = "EMNLP",
year = 2014
}
@inproceedings{pennington-2014-glove-global-vec,
author = "Jeffrey Pennington and Richard Socher and Christopher
D. Manning",
booktitle = "Empirical Methods in Natural Language Processing (EMNLP)",
title = "GloVe: Global Vectors for Word Representation",
year = 2014,
pages = "1532-1543",
url = "http://www.aclweb.org/anthology/D14-1162"
}
@inproceedings{baroni-2014-dont-count-pred,
title = "Don't count, predict! A systematic comparison of
context-counting vs. context-predicting semantic vectors",
author = "Marco Baroni and Georgiana Dinu and Germ{\'a}n Kruszewski",
booktitle = "ACL",
year = 2014
}
@inproceedings{morin-2005-hiera-prob,
title = "Hierarchical Probabilistic Neural Network Language Model.",
author = "Morin, Frederic and Bengio, Yoshua",
booktitle = "Aistats",
volume = 5,
pages = "246-252",
year = 2005,
organization = "Citeseer"
}
@ARTICLE{rong-2014-param-learn-explain,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1411.2738R",
archivePrefix= "arXiv",
author = "{Rong}, X.",
eprint = "1411.2738",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = nov,
primaryClass = "cs.CL",
title = "{word2vec Parameter Learning Explained}",
year = 2014
}
@inproceedings{mnih-2009-scalable-hiera,
title = "A scalable hierarchical distributed language model",
author = "Mnih, Andriy and Hinton, Geoffrey E",
booktitle = "Advances in neural information processing systems",
pages = "1081-1088",
year = 2009
}
@inproceedings{davis-2006-rela-pre-rec,
title = "The relationship between Precision-Recall and ROC curves",
author = "Davis, Jesse and Goadrich, Mark",
booktitle = "Proceedings of the 23rd international conference on Machine
learning",
pages = "233-240",
year = 2006,
organization = "ACM"
}
@ARTICLE{wojna-2017-atten-based,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170403549W",
archivePrefix= "arXiv",
author = "{Wojna}, Z. and {Gorban}, A. and {Lee}, D.-S. and {Murphy},
K. and {Yu}, Q. and {Li}, Y. and {Ibarz}, J.",
eprint = "1704.03549",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = apr,
primaryClass = "cs.CV",
title = "{Attention-Based Extraction of Structured Information From
Street View Imagery}",
year = 2017
}
@inproceedings{lee-2013-pesu-label,
title = {Pseudo-Label: The Simple and Efficient Semi-Supervised
Learning Method for Deep Neural Networks},
author = {Dong-Hyun Lee},
booktitle = {ICML 2013 Workshop on Challenges in Representation Learning},
year = 2013
}
@ARTICLE{gehring-2017-convol-sequen,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170503122G",
archivePrefix= "arXiv",
author = "{Gehring}, J. and {Auli}, M. and {Grangier}, D. and {Yarats},
D. and {Dauphin}, Y.~N.",
eprint = "1705.03122",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = may,
primaryClass = "cs.CL",
title = "{Convolutional Sequence To Sequence Learning}",
year = 2017
}
@inproceedings{dean-2012-large-scale,
title = "Large scale distributed deep networks",
author = "Dean, Jeffrey and Corrado, Greg and Monga, Rajat and Chen,
Kai and Devin, Matthieu and Mao, Mark and Senior, Andrew and
Tucker, Paul and Yang, Ke and Le, Quoc V and others",
booktitle = "Advances in neural information processing systems",
pages = "1223-1231",
year = 2012
}
@inproceedings{larochelle-2010-learn-combine,
title = "Learning to combine foveal glimpses with a third-order
Boltzmann machine",
author = "Larochelle, Hugo and Hinton, Geoffrey E",
booktitle = "Advances in neural information processing systems",
pages = "1243-1251",
year = 2010
}
@ARTICLE{denil-2011-learn-where,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2011arXiv1109.3737D",
archivePrefix= "arXiv",
author = "{Denil}, M. and {Bazzani}, L. and {Larochelle}, H. and {de
Freitas}, N.",
eprint = "1109.3737",
journal = "ArXiv e-prints",
keywords = "Computer Science - Artificial Intelligence",
month = sep,
primaryClass = "cs.AI",
title = "{Learning Where To Attend With Deep Architectures for Image
Tracking}",
year = 2011
}
@ARTICLE{yin-2015-abcnn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151205193Y",
archivePrefix= "arXiv",
author = "{Yin}, W. and {Sch{\"u}tze}, H. and {Xiang}, B. and {Zhou},
B.",
eprint = "1512.05193",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = dec,
primaryClass = "cs.CL",
title = "{ABCNN: Attention-Based Convolutional Neural Network for
Modeling Sentence Pairs}",
year = 2015
}
@ARTICLE{vinyals-2015-point-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150603134V",
archivePrefix= "arXiv",
author = "{Vinyals}, O. and {Fortunato}, M. and {Jaitly}, N.",
eprint = "1506.03134",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science -
Computational Geometry, Computer Science - Learning, Computer
Science - Neural and Evolutionary Computing",
month = jun,
primaryClass = "stat.ML",
title = "{Pointer Networks}",
year = 2015
}
@ARTICLE{johnson-2016-percep-losses,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160308155J",
archivePrefix= "arXiv",
author = "{Johnson}, J. and {Alahi}, A. and {Fei-Fei}, L.",
eprint = "1603.08155",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning",
month = mar,
primaryClass = "cs.CV",
title = "{Perceptual Losses for Real-Time Style Transfer and
Super-Resolution}",
year = 2016
}
@inproceedings{lu-2012-combin-sketch,
title = "Combining sketch and tone for pencil drawing production",
author = "Lu, Cewu and Xu, Li and Jia, Jiaya",
booktitle = "Proceedings of the Symposium on Non-Photorealistic Animation
and Rendering",
pages = "65-73",
year = 2012,
organization = "Eurographics Association"
}
@ARTICLE{chan-2015-pcanet,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015ITIP...24.5017C",
archivePrefix= "arXiv",
author = "{Chan}, T.-H. and {Jia}, K. and {Gao}, S. and {Lu}, J. and
{Zeng}, Z. and {Ma}, Y.",
doi = "10.1109/TIP.2015.2475625",
eprint = "1404.3606",
journal = "IEEE Transactions on Image Processing",
month = dec,
pages = "5017-5032",
primaryClass = "cs.CV",
title = "{PCANet: A Simple Deep Learning Baseline for Image
Classification?}",
url = "https://doi.org/10.1109/TIP.2015.2475625",
volume = 24,
year = 2015
}
@ARTICLE{kiros-2015-skip-thought,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150606726K",
archivePrefix= "arXiv",
author = "{Kiros}, R. and {Zhu}, Y. and {Salakhutdinov}, R. and
{Zemel}, R.~S. and {Torralba}, A. and {Urtasun}, R. and
{Fidler}, S.",
eprint = "1506.06726",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jun,
primaryClass = "cs.CL",
title = "{Skip-Thought Vectors}",
year = 2015
}
@inproceedings{manning-2014-stanford-core,
  title = "The Stanford CoreNLP Natural Language Processing Toolkit",
author = "Manning, Christopher D and Surdeanu, Mihai and Bauer, John
and Finkel, Jenny Rose and Bethard, Steven and McClosky,
David",
booktitle = "ACL (System Demonstrations)",
pages = "55-60",
year = 2014
}
@article{graves-2005-frame-phone,
title = "Framewise phoneme classification with bidirectional LSTM and
other neural network architectures",
author = "Graves, Alex and Schmidhuber, J{\"u}rgen",
journal = "Neural Networks",
volume = 18,
number = 5,
pages = "602-610",
year = 2005,
publisher = "Elsevier"
}
@inproceedings{graves-2006-conn-temp,
title = "Connectionist temporal classification: labelling unsegmented
sequence data with recurrent neural networks",
author = "Graves, Alex and Fern{\'a}ndez, Santiago and Gomez, Faustino
and Schmidhuber, J{\"u}rgen",
booktitle = "Proceedings of the 23rd international conference on Machine
learning",
pages = "369-376",
year = 2006,
organization = "ACM"
}
@ARTICLE{szegedy-2015-rethin-incep,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151200567S",
archivePrefix= "arXiv",
author = "{Szegedy}, C. and {Vanhoucke}, V. and {Ioffe}, S. and
{Shlens}, J. and {Wojna}, Z.",
eprint = "1512.00567",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = dec,
primaryClass = "cs.CV",
title = "{Rethinking the Inception Architecture for Computer Vision}",
year = 2015
}
@article{davis-1959-leonhard-euler,
  title = "Leonhard Euler's integral: A historical profile of the gamma
           function: In memoriam: Milton Abramowitz",
author = "Davis, Philip J",
journal = "The American Mathematical Monthly",
volume = 66,
number = 10,
pages = "849-869",
year = 1959,
publisher = "JSTOR"
}
@article{heinrich-2008-param-esti,
title = "Parameter estimation for text analysis",
author = "Heinrich, Gregor",
year = 2008
}
@article{takacs-2007-major-comp,
title = "Major components of the gravity recommendation system",
author = "Tak{\'a}cs, G{\'a}bor and Pil{\'a}szy, Istv{\'a}n and
N{\'e}meth, Botty{\'a}n and Tikk, Domonkos",
journal = "ACM SIGKDD Explorations Newsletter",
volume = 9,
number = 2,
pages = "80-83",
year = 2007,
publisher = "ACM"
}
@article{takacs-2007-gravity-recomm,
title = "On the Gravity Recommendation System",
author = "Takacs, Gabor and Pilaszy, Istvan and Nemeth, Bottyan and
Tikk, Domonkos",
year = 2007,
  publisher = "Citeseer"
}
@inproceedings{yang-2015-net-repr,
title = "Network representation learning with rich text information",
author = "Yang, Cheng and Liu, Zhiyuan and Zhao, Deli and Sun, Maosong
and Chang, Edward",
booktitle = "Twenty-Fourth International Joint Conference on Artificial
Intelligence",
year = 2015
}
@ARTICLE{perozzi-2014-deepw,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1403.6652P",
archivePrefix= "arXiv",
author = "{Perozzi}, B. and {Al-Rfou}, R. and {Skiena}, S.",
eprint = "1403.6652",
journal = "ArXiv e-prints",
keywords = "Computer Science - Social and Information Networks, Computer
Science - Learning, H.2.8, I.2.6, I.5.1",
month = mar,
title = "{DeepWalk: Online Learning of Social Representations}",
year = 2014
}
@article{tibshirani-1996-regres-shrink,
title = "Regression shrinkage and selection via the lasso",
author = "Tibshirani, Robert",
journal = "Journal of the Royal Statistical Society. Series B
(Methodological)",
pages = "267-288",
year = 1996,
publisher = "JSTOR"
}
@ARTICLE{collobert-2011-natur-languag,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2011arXiv1103.0398C",
archivePrefix= "arXiv",
author = "{Collobert}, R. and {Weston}, J. and {Bottou}, L. and
{Karlen}, M. and {Kavukcuoglu}, K. and {Kuksa}, P.",
eprint = "1103.0398",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = mar,
primaryClass = "cs.LG",
title = "{Natural Language Processing (almost) From Scratch}",
year = 2011
}
@ARTICLE{jin-2017-how-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170300887J",
archivePrefix= "arXiv",
author = "{Jin}, C. and {Ge}, R. and {Netrapalli}, P. and {Kakade},
S.~M. and {Jordan}, M.~I.",
eprint = "1703.00887",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Mathematics - Optimization and
Control, Statistics - Machine Learning",
month = mar,
primaryClass = "cs.LG",
title = "{How To Escape Saddle Points Efficiently}",
year = 2017
}
@inproceedings{mihalcea-2004-textrank-bring,
title = "TextRank: Bringing order into texts",
author = "Mihalcea, Rada and Tarau, Paul",
year = 2004,
organization = "Association for Computational Linguistics",
  booktitle = "Proceedings of the 2004 Conference on Empirical Methods
               in Natural Language Processing"
}
@article{wang-2009-more-suit,
  title = "Which is More Suitable for Chinese Word Segmentation, the
           Generative Model or the Discriminative One?",
  author = "Wang, Kun and Zong, Chengqing",
  year = 2009
}
@inproceedings{wang-2011-online-varia,
title = "Online variational inference for the hierarchical Dirichlet
process",
author = "Wang, Chong and Paisley, John and Blei, David",
booktitle = "Proceedings of the Fourteenth International Conference on
Artificial Intelligence and Statistics",
pages = "752-760",
year = 2011
}
@inproceedings{hoffman-2010-online-learn,
  title = "Online learning for latent Dirichlet allocation",
  author = "Hoffman, Matthew and Bach, Francis R and Blei, David M",
  booktitle = "Advances in neural information processing systems",
pages = "856-864",
year = 2010
}
@article{yamamoto-2001-using-suffix,
title = "Using suffix arrays to compute term frequency and document
frequency for all substrings in a corpus",
author = "Yamamoto, Mikio and Church, Kenneth W",
journal = "Computational Linguistics",
volume = 27,
number = 1,
pages = "1-30",
year = 2001,
publisher = "MIT press"
}
@article{etzioni-2005-unsupervised-named-entity,
title = "Unsupervised named-entity extraction from the web: An
experimental study",
author = "Etzioni, Oren and Cafarella, Michael and Downey, Doug and
Popescu, Ana-Maria and Shaked, Tal and Soderland, Stephen and
Weld, Daniel S and Yates, Alexander",
journal = "Artificial intelligence",
volume = 165,
number = 1,
pages = "91-134",
year = 2005,
publisher = "Elsevier"
}
@inproceedings{singh-2010-minimally-super,
title = "Minimally-supervised extraction of entities from text
advertisements",
author = "Singh, Sameer and Hillard, Dustin and Leggetter, Chris",
booktitle = "Human Language Technologies: The 2010 Annual Conference of
the North American Chapter of the Association for
Computational Linguistics",
pages = "73-81",
year = 2010,
organization = "Association for Computational Linguistics"
}
@inproceedings{liu-2011-recogn-named,
title = "Recognizing named entities in tweets",
author = "Liu, Xiaohua and Zhang, Shaodian and Wei, Furu and Zhou,
Ming",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies-Volume
1",
pages = "359-367",
year = 2011,
organization = "Association for Computational Linguistics"
}
@ARTICLE{lample-2016-neural-archit,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160301360L",
archivePrefix= "arXiv",
author = "{Lample}, G. and {Ballesteros}, M. and {Subramanian}, S. and
{Kawakami}, K. and {Dyer}, C.",
eprint = "1603.01360",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = mar,
primaryClass = "cs.CL",
title = "{Neural Architectures for Named Entity Recognition}",
year = 2016
}
@ARTICLE{rei-2016-atten-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161104361R",
archivePrefix= "arXiv",
author = "{Rei}, M. and {Crichton}, G.~K.~O. and {Pyysalo}, S.",
eprint = "1611.04361",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing, I.5.1,
I.2.6, I.2.7",
month = nov,
primaryClass = "cs.CL",
title = "{Attending To Characters in Neural Sequence Labeling Models}",
year = 2016
}
@inproceedings{bharadwaj-2016-phono-aware,
title = "Phonologically Aware Neural Model for Named Entity
Recognition in Low Resource Transfer Settings",
author = "Akash Bharadwaj and David R. Mortensen and Chris Dyer and
Jaime G. Carbonell",
booktitle = "EMNLP",
year = 2016
}
@ARTICLE{yang-2017-trans-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170306345Y",
archivePrefix= "arXiv",
author = "{Yang}, Z. and {Salakhutdinov}, R. and {Cohen}, W.~W.",
eprint = "1703.06345",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = mar,
primaryClass = "cs.CL",
title = "{Transfer Learning for Sequence Tagging With Hierarchical
Recurrent Networks}",
year = 2017
}
@ARTICLE{peters-2017-semi-super,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170500108P",
archivePrefix= "arXiv",
author = "{Peters}, M.~E. and {Ammar}, W. and {Bhagavatula}, C. and
{Power}, R.",
eprint = "1705.00108",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = apr,
primaryClass = "cs.CL",
  title = "{Semi-Supervised Sequence Tagging With Bidirectional Language
           Models}",
year = 2017
}
@article{rose-2010-auto-key,
title = "Automatic keyword extraction from individual documents",
author = "Rose, Stuart and Engel, Dave and Cramer, Nick and Cowley,
Wendy",
journal = "Text Mining",
pages = "1-20",
year = 2010
}
@inproceedings{hasan-2014-auto-key,
title = "Automatic Keyphrase Extraction: A Survey of the State of the
Art.",
author = "Hasan, Kazi Saidul and Ng, Vincent",
booktitle = "ACL (1)",
pages = "1262-1273",
year = 2014
}
@inproceedings{yih-2006-find-advert,
title = "Finding advertising keywords on web pages",
author = "Yih, Wen-tau and Goodman, Joshua and Carvalho, Vitor R",
booktitle = "Proceedings of the 15th international conference on World
Wide Web",
pages = "213-222",
year = 2006,
organization = "ACM"
}
@inproceedings{jiang-2009-rank-approach,
title = "A ranking approach to keyphrase extraction",
author = "Jiang, Xin and Hu, Yunhua and Li, Hang",
booktitle = "Proceedings of the 32nd international ACM SIGIR conference on
Research and development in information retrieval",
pages = "756-757",
year = 2009,
organization = "ACM"
}
@inproceedings{liu-2009-unsupervised-approach,
title = "Unsupervised approaches for automatic keyword extraction
using meeting transcripts",
author = "Liu, Feifan and Pennell, Deana and Liu, Fei and Liu, Yang",
booktitle = "Proceedings of human language technologies: The 2009 annual
conference of the North American chapter of the association
for computational linguistics",
pages = "620-628",
year = 2009,
organization = "Association for Computational Linguistics"
}
@inproceedings{witten-1999-kea,
author = "Witten, Ian H. and Paynter, Gordon W. and Frank, Eibe and
Gutwin, Carl and Nevill-Manning, Craig G.",
title = "KEA: Practical Automatic Keyphrase Extraction",
booktitle = "Proceedings of the Fourth ACM Conference on Digital
Libraries",
series = "DL '99",
year = 1999,
isbn = "1-58113-145-3",
location = "Berkeley, California, USA",
pages = "254-255",
numpages = 2,
url = "http://doi.acm.org/10.1145/313238.313437",
doi = "10.1145/313238.313437",
acmid = 313437,
publisher = "ACM",
address = "New York, NY, USA"
}
@inproceedings{liu-2010-auto-key,
title = "Automatic keyphrase extraction via topic decomposition",
author = "Liu, Zhiyuan and Huang, Wenyi and Zheng, Yabin and Sun,
Maosong",
booktitle = "Proceedings of the 2010 conference on empirical methods in
natural language processing",
pages = "366-376",
year = 2010,
organization = "Association for Computational Linguistics"
}
@article{chuang-2012-without-cluster,
title = "“Without the Clutter of Unimportant Words”: Descriptive
keyphrases for text visualization",
author = "Chuang, Jason and Manning, Christopher D and Heer, Jeffrey",
journal = "ACM Transactions on Computer-Human Interaction (TOCHI)",
volume = 19,
number = 3,
pages = 19,
year = 2012,
publisher = "ACM"
}
@inproceedings{mei-2010-divrank,
  title = "DivRank: the interplay of prestige and diversity in
           information networks",
author = "Mei, Qiaozhu and Guo, Jian and Radev, Dragomir",
booktitle = "Proceedings of the 16th ACM SIGKDD international conference
on Knowledge discovery and data mining",
pages = "1009-1018",
year = 2010,
  organization = "ACM"
}
@inproceedings{hasan-2010-conundrums-unspervised,
title = "Conundrums in unsupervised keyphrase extraction: making sense
of the state-of-the-art",
author = "Hasan, Kazi Saidul and Ng, Vincent",
booktitle = "Proceedings of the 23rd International Conference on
Computational Linguistics: Posters",
pages = "365-373",
year = 2010,
organization = "Association for Computational Linguistics"
}
@inproceedings{wan-2008-single-doc,
title = "Single document keyphrase extraction using neighborhood
knowledge",
author = "Wan, Xiaojun and Xiao, Jianguo",
booktitle = "Proceedings of the 23rd national conference on Artificial
intelligence-Volume 2",
pages = "855-860",
year = 2008,
organization = "AAAI Press"
}
@inproceedings{wan-2008-collabrank,
title = "CollabRank: towards a collaborative approach to
single-document keyphrase extraction",
author = "Wan, Xiaojun and Xiao, Jianguo",
booktitle = "Proceedings of the 22nd International Conference on
Computational Linguistics-Volume 1",
pages = "969-976",
year = 2008,
organization = "Association for Computational Linguistics"
}
@techreport{page-1999-page-rank,
title = "The PageRank citation ranking: Bringing order to the web.",
author = "Page, Lawrence and Brin, Sergey and Motwani, Rajeev and
Winograd, Terry",
year = 1999,
institution = "Stanford InfoLab"
}
@ARTICLE{barrios-2016-variat-simil,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160203606B",
archivePrefix= "arXiv",
author = "{Barrios}, F. and {L{\'o}pez}, F. and {Argerich}, L. and
{Wachenchauzer}, R.",
eprint = "1602.03606",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Information Retrieval, I.2.7",
month = feb,
primaryClass = "cs.CL",
  title = "{Variations of the Similarity Function of TextRank for
           Automated Summarization}",
year = 2016
}
@article{gimpel-2006-model-topics,
title = "Modeling Topics",
author = "Gimpel, Kevin",
  year = 2006
}
@ARTICLE{salimans-2016-weigh-normal,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160207868S",
archivePrefix= "arXiv",
author = "{Salimans}, T. and {Kingma}, D.~P.",
eprint = "1602.07868",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Artificial
Intelligence, Computer Science - Neural and Evolutionary
Computing",
month = feb,
primaryClass = "cs.LG",
title = "{Weight Normalization: A Simple Reparameterization To
Accelerate Training of Deep Neural Networks}",
year = 2016
}
@ARTICLE{lei-2016-layer-normal,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160706450L",
archivePrefix= "arXiv",
  author = "{Ba}, J.~L. and {Kiros}, J.~R. and {Hinton}, G.~E.",
eprint = "1607.06450",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = jul,
primaryClass = "stat.ML",
title = "{Layer Normalization}",
year = 2016
}
@ARTICLE{ioffe-2015-batch-normal,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150203167I",
archivePrefix= "arXiv",
author = "{Ioffe}, S. and {Szegedy}, C.",
eprint = "1502.03167",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = feb,
primaryClass = "cs.LG",
title = "{Batch Normalization: Accelerating Deep Network Training By
Reducing Internal Covariate Shift}",
year = 2015
}
@article{shimodaira-2000-improv-predict,
title = "Improving predictive inference under covariate shift by
weighting the log-likelihood function",
author = "Shimodaira, Hidetoshi",
journal = "Journal of statistical planning and inference",
volume = 90,
number = 2,
pages = "227-244",
year = 2000,
publisher = "Elsevier"
}
@ARTICLE{mikolov-2013-exploit-simil,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1309.4168M",
archivePrefix= "arXiv",
author = "{Mikolov}, T. and {Le}, Q.~V. and {Sutskever}, I.",
eprint = "1309.4168",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = sep,
primaryClass = "cs.CL",
title = "{Exploiting Similarities Among Languages for Machine
Translation}",
year = 2013
}
@ARTICLE{deng-2016-image-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160904938D",
archivePrefix= "arXiv",
author = "{Deng}, Y. and {Kanervisto}, A. and {Ling}, J. and {Rush},
A.~M.",
eprint = "1609.04938",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary
Computing",
month = sep,
primaryClass = "cs.CV",
title = "{Image-To-Markup Generation With Coarse-To-Fine Attention}",
year = 2016
}
@inproceedings{minka-2001-automatic-choice,
title = "Automatic choice of dimensionality for PCA",
author = "Minka, Thomas P",
booktitle = "Advances in neural information processing systems",
pages = "598-604",
year = 2001
}
@ARTICLE{le-2014-distr-repres-senten-docum,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1405.4053L",
archivePrefix= "arXiv",
author = "{Le}, Q.~V. and {Mikolov}, T.",
eprint = "1405.4053",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning",
month = may,
primaryClass = "cs.CL",
title = "{Distributed Representations of Sentences and Documents}",
year = 2014
}
@article{arora-2016-simple-tough,
title = {A simple but tough-to-beat baseline for sentence embeddings},
author = {Arora, Sanjeev and Liang, Yingyu and Ma, Tengyu},
  year = 2016
}
@ARTICLE{bojanowski-2016-fasttext,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160704606B",
archivePrefix= "arXiv",
author = "{Bojanowski}, P. and {Grave}, E. and {Joulin}, A. and
{Mikolov}, T.",
eprint = "1607.04606",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jul,
primaryClass = "cs.CL",
title = "{Enriching Word Vectors With Subword Information}",
year = 2016
}
@ARTICLE{srivastava-2015-highw-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150500387S",
archivePrefix= "arXiv",
author = "{Srivastava}, R.~K. and {Greff}, K. and {Schmidhuber}, J.",
eprint = "1505.00387",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing, 68T01, I.2.6, G.1.6",
month = may,
primaryClass = "cs.LG",
title = "{Highway Networks}",
year = 2015
}
@ARTICLE{kalchbrenner-2014-convol-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1404.2188K",
archivePrefix= "arXiv",
author = "{Kalchbrenner}, N. and {Grefenstette}, E. and {Blunsom}, P.",
eprint = "1404.2188",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = apr,
primaryClass = "cs.CL",
title = "{A Convolutional Neural Network for Modelling Sentences}",
year = 2014
}
@InProceedings{matt-2015-word-embed,
title = "From Word Embeddings To Document Distances",
author = "Matt Kusner and Yu Sun and Nicholas Kolkin and Kilian
Weinberger",
booktitle = "Proceedings of the 32nd International Conference on Machine
Learning",
pages = "957-966",
year = 2015,
editor = "Francis Bach and David Blei",
volume = 37,
series = "Proceedings of Machine Learning Research",
address = "Lille, France",
month = "07--09 Jul",
publisher = "PMLR",
pdf = "http://proceedings.mlr.press/v37/kusnerb15.pdf",
url = "http://proceedings.mlr.press/v37/kusnerb15.html",
abstract = "We present the Word Mover’s Distance (WMD), a novel distance
function between text documents. Our work is based on recent
results in word embeddings that learn semantically meaningful
representations for words from local co-occurrences in
sentences. The WMD distance measures the dissimilarity
between two text documents as the minimum amount of distance
that the embedded words of one document need to ``travel'' to
reach the embedded words of another document. We show that
this distance metric can be cast as an instance of the Earth
Mover’s Distance, a well studied transportation problem for
which several highly efficient solvers have been
developed. Our metric has no hyperparameters and is
straight-forward to implement. Further, we demonstrate on
eight real world document classification data sets, in
comparison with seven state-of-the-art baselines, that the
WMD metric leads to unprecedented low k-nearest neighbor
document classification error rates."
}
@ARTICLE{brokos-2016-using-centr,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160803905B",
archivePrefix= "arXiv",
author = "{Brokos}, G.-I. and {Malakasiotis}, P. and {Androutsopoulos},
I.",
eprint = "1608.03905",
journal = "ArXiv e-prints",
keywords = "Computer Science - Information Retrieval",
month = aug,
primaryClass = "cs.IR",
title = "{Using Centroids of Word Embeddings and Word Mover's Distance
for Biomedical Document Retrieval in Question Answering}",
year = 2016
}
@ARTICLE{dai-2015-docum-embed,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150707998D",
archivePrefix= "arXiv",
author = "{Dai}, A.~M. and {Olah}, C. and {Le}, Q.~V.",
eprint = "1507.07998",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning",
month = jul,
primaryClass = "cs.CL",
title = "{Document Embedding With Paragraph Vectors}",
year = 2015
}
@ARTICLE{lau-2016-empir-evaluat,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160705368L",
archivePrefix= "arXiv",
author = "{Lau}, J.~H. and {Baldwin}, T.",
eprint = "1607.05368",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jul,
primaryClass = "cs.CL",
title = "{An Empirical Evaluation of Doc2vec With Practical Insights
Into Document Embedding Generation}",
year = 2016
}
@inproceedings{polajnar-2015-exploration-discourse,
title = "An exploration of discourse-based sentence spaces for
compositional distributional semantics",
author = "Polajnar, Tamara and Rimell, Laura and Clark, Stephen",
booktitle = "Workshop on Linking Models of Lexical, Sentential and
Discourse-level Semantics (LSDSem)",
pages = 1,
year = 2015
}
@inproceedings{socher-2011-semi-supervised,
title = "Semi-supervised recursive autoencoders for predicting
sentiment distributions",
author = "Socher, Richard and Pennington, Jeffrey and Huang, Eric H and
Ng, Andrew Y and Manning, Christopher D",
booktitle = "Proceedings of the conference on empirical methods in natural
language processing",
pages = "151-161",
year = 2011,
organization = "Association for Computational Linguistics"
}
@article{hodosh-2013-framing-image,
title = "Framing image description as a ranking task: Data, models and
evaluation metrics",
author = "Hodosh, Micah and Young, Peter and Hockenmaier, Julia",
journal = "Journal of Artificial Intelligence Research",
volume = 47,
pages = "853-899",
year = 2013
}
@inproceedings{shen-2014-latent-semantic,
title = "A latent semantic model with convolutional-pooling structure
for information retrieval",
author = "Shen, Yelong and He, Xiaodong and Gao, Jianfeng and Deng, Li
and Mesnil, Gr{\'e}goire",
booktitle = "Proceedings of the 23rd ACM International Conference on
Conference on Information and Knowledge Management",
pages = "101-110",
year = 2014,
organization = "ACM"
}
@ARTICLE{xiong-2016-dynam-memor,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160301417X",
archivePrefix= "arXiv",
author = "{Xiong}, C. and {Merity}, S. and {Socher}, R.",
eprint = "1603.01417",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Computation and Language, Computer Science
- Computer Vision and Pattern Recognition",
month = mar,
title = "{Dynamic Memory Networks for Visual and Textual Question
Answering}",
year = 2016
}
@ARTICLE{zeng-2016-effic-summar,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161103382Z",
archivePrefix= "arXiv",
author = "{Zeng}, W. and {Luo}, W. and {Fidler}, S. and {Urtasun}, R.",
eprint = "1611.03382",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = nov,
primaryClass = "cs.CL",
title = "{Efficient Summarization With Read-Again and Copy Mechanism}",
year = 2016
}
@ARTICLE{lai-2015-how-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150705523L",
archivePrefix= "arXiv",
author = "{Lai}, S. and {Liu}, K. and {Xu}, L. and {Zhao}, J.",
eprint = "1507.05523",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jul,
primaryClass = "cs.CL",
title = "{How To Generate a Good Word Embedding?}",
year = 2015
}
@inproceedings{chen-2015-revisit-word,
title = "Revisiting Word Embedding for Contrasting Meaning",
author = "Zhigang Chen and Wei Lin and Qian Chen and Xiaoping Chen and
Si Wei and Hui Jiang and Xiao-Dan Zhu",
booktitle = "ACL",
year = 2015
}
@inproceedings{lazaridou-2015-hubness-pollution,
title = "Hubness and Pollution: Delving into Cross-Space Mapping for
Zero-Shot Learning",
author = "Angeliki Lazaridou and Georgiana Dinu and Marco Baroni",
booktitle = "ACL",
year = 2015
}
@ARTICLE{yin-2017-compar-study,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170201923Y",
archivePrefix= "arXiv",
author = "{Yin}, W. and {Kann}, K. and {Yu}, M. and {Sch{\"u}tze}, H.",
eprint = "1702.01923",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = feb,
primaryClass = "cs.CL",
  title = "{Comparative Study of CNN and RNN for Natural Language
           Processing}",
year = 2017
}
@ARTICLE{zhang-2015-sensit-analy,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151003820Z",
archivePrefix= "arXiv",
author = "{Zhang}, Y. and {Wallace}, B.",
eprint = "1510.03820",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing",
month = oct,
primaryClass = "cs.CL",
title = "{A Sensitivity Analysis of (and Practitioners' Guide to)
Convolutional Neural Networks for Sentence Classification}",
year = 2015
}
@ARTICLE{johnson-2014-effec-use,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.1058J",
archivePrefix= "arXiv",
author = "{Johnson}, R. and {Zhang}, T.",
eprint = "1412.1058",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Statistics - Machine Learning",
month = dec,
primaryClass = "cs.CL",
title = "{Effective Use of Word Order for Text Categorization With
Convolutional Neural Networks}",
year = 2014
}
@ARTICLE{johnson-2015-semi-super,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150401255J",
archivePrefix= "arXiv",
author = "{Johnson}, R. and {Zhang}, T.",
eprint = "1504.01255",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Computation
and Language, Computer Science - Learning",
month = apr,
primaryClass = "stat.ML",
title = "{Semi-Supervised Convolutional Neural Networks for Text
Categorization Via Region Embedding}",
year = 2015
}
@ARTICLE{zhang-2015-charac-level,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150901626Z",
archivePrefix= "arXiv",
author = "{Zhang}, X. and {Zhao}, J. and {LeCun}, Y.",
eprint = "1509.01626",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = sep,
primaryClass = "cs.LG",
title = "{Character-Level Convolutional Networks for Text
Classification}",
year = 2015
}
@ARTICLE{zhang-2015-text-under-from-scrat,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150201710Z",
archivePrefix= "arXiv",
author = "{Zhang}, X. and {LeCun}, Y.",
eprint = "1502.01710",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = feb,
primaryClass = "cs.LG",
title = "{Text Understanding From Scratch}",
year = 2015
}
@article{schuster-1997-bidirectional-recurrent,
title = "Bidirectional recurrent neural networks",
author = "Schuster, Mike and Paliwal, Kuldip K",
journal = "IEEE Transactions on Signal Processing",
volume = 45,
number = 11,
pages = "2673-2681",
year = 1997,
publisher = "IEEE"
}
@article{chen-2015-event-extract,
title = "Event Extraction via Dynamic Multi-Pooling Convolutional
Neural Networks",
author = "Chen, Yubo and Xu, Liheng and Liu, Kang and Zeng, Daojian and
Zhao, Jun and others",
  year = 2015
}
@ARTICLE{bengio-2012-repres-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1206.5538B",
archivePrefix= "arXiv",
author = "{Bengio}, Y. and {Courville}, A. and {Vincent}, P.",
eprint = "1206.5538",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = jun,
primaryClass = "cs.LG",
title = "{Representation Learning: A Review and New Perspectives}",
year = 2012
}
@inproceedings{le-2011-ica-recons,
title = "ICA with reconstruction cost for efficient overcomplete
feature learning",
author = "Le, Quoc V and Karpenko, Alexandre and Ngiam, Jiquan and Ng,
Andrew Y",
booktitle = "Advances in Neural Information Processing Systems",
pages = "1017-1025",
year = 2011
}
@ARTICLE{goodfellow-2013-maxout-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1302.4389G",
archivePrefix= "arXiv",
author = "{Goodfellow}, I.~J. and {Warde-Farley}, D. and {Mirza},
M. and {Courville}, A. and {Bengio}, Y.",
eprint = "1302.4389",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = feb,
primaryClass = "stat.ML",
title = "{Maxout Networks}",
year = 2013
}
@ARTICLE{he-2015-delvin-deep-into-rectif,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150201852H",
archivePrefix= "arXiv",
author = "{He}, K. and {Zhang}, X. and {Ren}, S. and {Sun}, J.",
eprint = "1502.01852",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Artificial Intelligence, Computer Science
- Learning",
month = feb,
primaryClass = "cs.CV",
  title = "{Delving Deep Into Rectifiers: Surpassing Human-Level
           Performance on ImageNet Classification}",
year = 2015
}
@ARTICLE{schmidhuber-2014-deep-learn-neural-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1404.7828S",
archivePrefix= "arXiv",
author = "{Schmidhuber}, J.",
eprint = "1404.7828",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Learning",
month = apr,
title = "{Deep Learning in Neural Networks: An Overview}",
year = 2014
}
@article{dahl-2012-context-depend,
title = "Context-dependent pre-trained deep neural networks for
large-vocabulary speech recognition",
author = "Dahl, George E and Yu, Dong and Deng, Li and Acero, Alex",
journal = "IEEE Transactions on audio, speech, and language processing",
volume = 20,
number = 1,
pages = "30-42",
year = 2012,
publisher = "IEEE"
}
@ARTICLE{romero-2014-fitnet,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.6550R",
archivePrefix= "arXiv",
author = "{Romero}, A. and {Ballas}, N. and {Ebrahimi Kahou}, S. and
{Chassang}, A. and {Gatta}, C. and {Bengio}, Y.",
eprint = "1412.6550",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = dec,
primaryClass = "cs.LG",
title = "{FitNets: Hints for Thin Deep Nets}",
year = 2014
}
@ARTICLE{srivastava-2015-train-very-deep-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150706228S",
archivePrefix= "arXiv",
author = "{Srivastava}, R.~K. and {Greff}, K. and {Schmidhuber}, J.",
eprint = "1507.06228",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing, 68T01, I.2.6, G.1.6",
month = jul,
primaryClass = "cs.LG",
title = "{Training Very Deep Networks}",
year = 2015
}
@ARTICLE{huang-2016-densel-connec-convol-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160806993H",
archivePrefix= "arXiv",
author = "{Huang}, G. and {Liu}, Z. and {Weinberger}, K.~Q. and {van
der Maaten}, L.",
eprint = "1608.06993",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning",
month = aug,
primaryClass = "cs.CV",
title = "{Densely Connected Convolutional Networks}",
year = 2016
}
@ARTICLE{he-2016-ident-mappin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160305027H",
archivePrefix= "arXiv",
author = "{He}, K. and {Zhang}, X. and {Ren}, S. and {Sun}, J.",
eprint = "1603.05027",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning",
month = mar,
primaryClass = "cs.CV",
title = "{Identity Mappings in Deep Residual Networks}",
year = 2016
}
@ARTICLE{veit-2016-resid-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160506431V",
archivePrefix= "arXiv",
author = "{Veit}, A. and {Wilber}, M. and {Belongie}, S.",
eprint = "1605.06431",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Artificial Intelligence, Computer Science
- Learning, Computer Science - Neural and Evolutionary
Computing",
month = may,
primaryClass = "cs.CV",
title = "{Residual Networks Behave Like Ensembles of Relatively
Shallow Networks}",
year = 2016
}
@ARTICLE{zagoruyko-2016-wide-resid-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160507146Z",
archivePrefix= "arXiv",
author = "{Zagoruyko}, S. and {Komodakis}, N.",
eprint = "1605.07146",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = may,
primaryClass = "cs.CV",
title = "{Wide Residual Networks}",
year = 2016
}
@ARTICLE{telgarsky-2016-benef-depth-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160204485T",
archivePrefix= "arXiv",
author = "{Telgarsky}, M.",
eprint = "1602.04485",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.LG",
  title = "{Benefits of Depth in Neural Networks}",
year = 2016
}
@ARTICLE{huang-2016-deep-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160309382H",
archivePrefix= "arXiv",
author = "{Huang}, G. and {Sun}, Y. and {Liu}, Z. and {Sedra}, D. and
{Weinberger}, K.",
eprint = "1603.09382",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computer
Vision and Pattern Recognition, Computer Science - Neural and
Evolutionary Computing",
month = mar,
primaryClass = "cs.LG",
title = "{Deep Networks With Stochastic Depth}",
year = 2016
}
@ARTICLE{eldan-2015-power-depth,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151203965E",
archivePrefix= "arXiv",
author = "{Eldan}, R. and {Shamir}, O.",
eprint = "1512.03965",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing, Statistics - Machine Learning",
month = dec,
primaryClass = "cs.LG",
title = "{The Power of Depth for Feedforward Neural Networks}",
year = 2015
}
@ARTICLE{liao-2016-bridg-gaps,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160403640L",
archivePrefix= "arXiv",
author = "{Liao}, Q. and {Poggio}, T.",
eprint = "1604.03640",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = apr,
primaryClass = "cs.LG",
title = "{Bridging the Gaps Between Residual Learning, Recurrent
Neural Networks and Visual Cortex}",
year = 2016
}
@ARTICLE{greff-2016-highw-resid,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161207771G",
archivePrefix= "arXiv",
author = "{Greff}, K. and {Srivastava}, R.~K. and {Schmidhuber}, J.",
eprint = "1612.07771",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Artificial Intelligence, Computer Science
- Learning, I.2.6, I.5.1",
month = dec,
title = "{Highway and Residual Networks Learn Unrolled Iterative
Estimation}",
year = 2016
}
@ARTICLE{xie-2016-aggreg-resid,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161105431X",
archivePrefix= "arXiv",
author = "{Xie}, S. and {Girshick}, R. and {Doll{\'a}r}, P. and {Tu},
Z. and {He}, K.",
eprint = "1611.05431",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = nov,
primaryClass = "cs.CV",
title = "{Aggregated Residual Transformations for Deep Neural
Networks}",
year = 2016
}
@ARTICLE{alain-2016-under-inter,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161001644A",
archivePrefix= "arXiv",
author = "{Alain}, G. and {Bengio}, Y.",
eprint = "1610.01644",
journal = "ArXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = oct,
primaryClass = "stat.ML",
  title = "{Understanding Intermediate Layers Using Linear Classifier
           Probes}",
year = 2016
}
@ARTICLE{yosinski-2014-how-trans,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1411.1792Y",
archivePrefix= "arXiv",
author = "{Yosinski}, J. and {Clune}, J. and {Bengio}, Y. and {Lipson},
H.",
eprint = "1411.1792",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = nov,
primaryClass = "cs.LG",
  title = "{How Transferable Are Features in Deep Neural Networks?}",
year = 2014
}
@inproceedings{levy-2014-neural-word,
title = "Neural Word Embedding as Implicit Matrix Factorization",
author = "Levy, Omer and Goldberg, Yoav",
booktitle = "Advances in Neural Information Processing Systems 27",
editor = "Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence
and K. Q. Weinberger",
pages = "2177-2185",
year = 2014,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/5477-neural-word-embedding-as-implicit-matrix-factorization.pdf"
}
@ARTICLE{dyer-2014-notes-noise,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.8251D",
archivePrefix= "arXiv",
author = "{Dyer}, C.",
eprint = "1410.8251",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = oct,
primaryClass = "cs.LG",
title = "{Notes on Noise Contrastive Estimation and Negative
Sampling}",
year = 2014
}
@inproceedings{levy-2014-ling-regul,
title = "Linguistic Regularities in Sparse and Explicit Word
Representations",
  author = "Levy, Omer and Goldberg, Yoav",
booktitle = "CoNLL",
pages = "171-180",
year = 2014
}
@ARTICLE{arora-2015-rand-walk,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150203520A",
archivePrefix= "arXiv",
author = "{Arora}, S. and {Li}, Y. and {Liang}, Y. and {Ma}, T. and
{Risteski}, A.",
eprint = "1502.03520",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.LG",
title = "{RAND-WALK: A Latent Variable Model Approach To Word
Embeddings}",
year = 2015
}
@ARTICLE{saxe-2013-exact-solut,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.6120S",
archivePrefix= "arXiv",
author = "{Saxe}, A.~M. and {McClelland}, J.~L. and {Ganguli}, S.",
eprint = "1312.6120",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Condensed Matter - Disordered Systems and Neural Networks,
Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Quantitative Biology - Neurons
and Cognition, Statistics - Machine Learning",
month = dec,
  title = "{Exact Solutions To the Nonlinear Dynamics of Learning in
           Deep Linear Neural Networks}",
year = 2013
}
@ARTICLE{mishkin-2015-all-you,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106422M",
archivePrefix= "arXiv",
author = "{Mishkin}, D. and {Matas}, J.",
eprint = "1511.06422",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = nov,
primaryClass = "cs.LG",
  title = "{All You Need Is a Good Init}",
year = 2015
}
@ARTICLE{kraehenbuehl-2015-data-depen,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106856K",
archivePrefix= "arXiv",
author = "{Kr{\"a}henb{\"u}hl}, P. and {Doersch}, C. and {Donahue},
J. and {Darrell}, T.",
eprint = "1511.06856",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning",
month = nov,
primaryClass = "cs.CV",
title = "{Data-Dependent Initializations of Convolutional Neural
Networks}",
year = 2015
}
@ARTICLE{britz-2017-massiv-explor,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170303906B",
archivePrefix= "arXiv",
author = "{Britz}, D. and {Goldie}, A. and {Luong}, M.-T. and {Le}, Q.",
eprint = "1703.03906",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = mar,
primaryClass = "cs.CL",
title = "{Massive Exploration of Neural Machine Translation
Architectures}",
year = 2017
}
@ARTICLE{neubig-2017-neural-machin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170301619N",
archivePrefix= "arXiv",
author = "{Neubig}, G.",
eprint = "1703.01619",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Statistics - Machine Learning",
month = mar,
primaryClass = "cs.CL",
title = "{Neural Machine Translation and Sequence-To-Sequence Models:
A Tutorial}",
year = 2017
}
@ARTICLE{wu-2016-googl-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160908144W",
archivePrefix= "arXiv",
author = "{Wu}, Y. and {Schuster}, M. and {Chen}, Z. and {Le},
Q.~V. and {Norouzi}, M. and {Macherey}, W. and {Krikun},
M. and {Cao}, Y. and {Gao}, Q. and {Macherey}, K. and
{Klingner}, J. and {Shah}, A. and {Johnson}, M. and {Liu},
X. and {Kaiser}, {\L}. and {Gouws}, S. and {Kato}, Y. and
{Kudo}, T. and {Kazawa}, H. and {Stevens}, K. and {Kurian},
G. and {Patil}, N. and {Wang}, W. and {Young}, C. and
{Smith}, J. and {Riesa}, J. and {Rudnick}, A. and {Vinyals},
O. and {Corrado}, G. and {Hughes}, M. and {Dean}, J.",
eprint = "1609.08144",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning",
month = sep,
primaryClass = "cs.CL",
title = "{Google's Neural Machine Translation System: Bridging the Gap
Between Human and Machine Translation}",
year = 2016
}
@ARTICLE{fang-2014-from-caption,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1411.4952F",
archivePrefix= "arXiv",
author = "{Fang}, H. and {Gupta}, S. and {Iandola}, F. and
{Srivastava}, R. and {Deng}, L. and {Doll{\'a}r}, P. and
{Gao}, J. and {He}, X. and {Mitchell}, M. and {Platt},
J.~C. and {Zitnick}, C.~L. and {Zweig}, G.",
eprint = "1411.4952",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Computation and Language",
month = nov,
primaryClass = "cs.CV",
title = "{From Captions To Visual Concepts and Back}",
year = 2014
}
@ARTICLE{ranzato-2015-mixer,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106732R",
archivePrefix= "arXiv",
author = "{Ranzato}, M. and {Chopra}, S. and {Auli}, M. and {Zaremba},
W.",
eprint = "1511.06732",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = nov,
primaryClass = "cs.LG",
title = "{Sequence Level Training With Recurrent Neural Networks}",
year = 2015
}
@ARTICLE{graves-2012-sequen-trans,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1211.3711G",
archivePrefix= "arXiv",
author = "{Graves}, A.",
eprint = "1211.3711",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Learning, Statistics - Machine Learning",
month = nov,
title = "{Sequence Transduction With Recurrent Neural Networks}",
year = 2012
}
@ARTICLE{zhang-2017-towar-end,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170102720Z",
archivePrefix= "arXiv",
  author = "{Zhang}, Y. and {Pezeshki}, M. and {Brakel}, P. and {Zhang},
            S. and {Laurent}, C. and {Bengio}, Y. and {Courville}, A.",
eprint = "1701.02720",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Statistics - Machine Learning",
month = jan,
primaryClass = "cs.CL",
title = "{Towards End-To-End Speech Recognition With Deep
Convolutional Neural Networks}",
year = 2017
}
@ARTICLE{bengio-2012-pract-recom,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1206.5533B",
archivePrefix= "arXiv",
author = "{Bengio}, Y.",
eprint = "1206.5533",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = jun,
primaryClass = "cs.LG",
  title = "{Practical Recommendations for Gradient-Based Training of
           Deep Architectures}",
year = 2012
}
@ARTICLE{pascanu-2012-diffic-train,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1211.5063P",
archivePrefix= "arXiv",
author = "{Pascanu}, R. and {Mikolov}, T. and {Bengio}, Y.",
eprint = "1211.5063",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning",
month = nov,
primaryClass = "cs.LG",
title = "{On the Difficulty of Training Recurrent Neural Networks}",
year = 2012
}
@ARTICLE{yosinski-2015-under-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150606579Y",
archivePrefix= "arXiv",
author = "{Yosinski}, J. and {Clune}, J. and {Nguyen}, A. and {Fuchs},
T. and {Lipson}, H.",
eprint = "1506.06579",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = jun,
primaryClass = "cs.CV",
title = "{Understanding Neural Networks Through Deep Visualization}",
year = 2015
}
@ARTICLE{vaswani-2017-transformer,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170603762V",
archivePrefix= "arXiv",
author = "{Vaswani}, A. and {Shazeer}, N. and {Parmar}, N. and
{Uszkoreit}, J. and {Jones}, L. and {Gomez}, A.~N. and
{Kaiser}, L. and {Polosukhin}, I.",
eprint = "1706.03762",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jun,
primaryClass = "cs.CL",
title = "{Attention Is All You Need}",
year = 2017
}
@ARTICLE{semeniuta-2016-recurrent-dropout,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160305118S",
archivePrefix= "arXiv",
author = "{Semeniuta}, S. and {Severyn}, A. and {Barth}, E.",
eprint = "1603.05118",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = mar,
primaryClass = "cs.CL",
title = "{Recurrent Dropout Without Memory Loss}",
year = 2016
}
@ARTICLE{pascanu-2013-how-to,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.6026P",
archivePrefix= "arXiv",
author = "{Pascanu}, R. and {Gulcehre}, C. and {Cho}, K. and {Bengio},
Y.",
eprint = "1312.6026",
journal = "ArXiv e-prints",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Learning, Statistics - Machine Learning",
month = dec,
title = "{How To Construct Deep Recurrent Neural Networks}",
year = 2013
}
@ARTICLE{luong-2014-addres-rare,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.8206L",
archivePrefix= "arXiv",
author = "{Luong}, M.-T. and {Sutskever}, I. and {Le}, Q.~V. and
{Vinyals}, O. and {Zaremba}, W.",
eprint = "1410.8206",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing",
month = oct,
primaryClass = "cs.CL",
title = "{Addressing the Rare Word Problem in Neural Machine
Translation}",
year = 2014
}
@ARTICLE{luo-2017-cosin-normal,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170205870L",
archivePrefix= "arXiv",
author = "{Luo}, C. and {Zhan}, J. and {Wang}, L. and {Yang}, Q.",
eprint = "1702.05870",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Artificial
Intelligence, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.LG",
title = "{Cosine Normalization: Using Cosine Similarity Instead of Dot
Product in Neural Networks}",
year = 2017
}
@ARTICLE{kaiser-2017-one-model,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170605137K",
archivePrefix= "arXiv",
author = "{Kaiser}, L. and {Gomez}, A.~N. and {Shazeer}, N. and
{Vaswani}, A. and {Parmar}, N. and {Jones}, L. and
{Uszkoreit}, J.",
eprint = "1706.05137",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Statistics - Machine Learning",
month = jun,
primaryClass = "cs.LG",
title = "{One Model To Learn Them All}",
year = 2017
}
@ARTICLE{nguyen-2014-deep-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.1897N",
archivePrefix= "arXiv",
author = "{Nguyen}, A. and {Yosinski}, J. and {Clune}, J.",
eprint = "1412.1897",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Artificial Intelligence, Computer Science
- Neural and Evolutionary Computing",
month = dec,
primaryClass = "cs.CV",
title = "{Deep Neural Networks Are Easily Fooled: High Confidence
Predictions for Unrecognizable Images}",
year = 2014
}
@ARTICLE{press-2016-using-output,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160805859P",
archivePrefix= "arXiv",
author = "{Press}, O. and {Wolf}, L.",
eprint = "1608.05859",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL",
title = "{Using the Output Embedding To Improve Language Models}",
year = 2016
}
@misc{hochreiter-2001-gradient-flow,
title = "Gradient flow in recurrent nets: the difficulty of learning
long-term dependencies",
author = "Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo and
Schmidhuber, J{\"u}rgen and others",
year = 2001,
publisher = "A field guide to dynamical recurrent neural networks. IEEE
Press"
}
@ARTICLE{szegedy-2016-incep-v4,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160207261S",
archivePrefix= "arXiv",
author = "{Szegedy}, C. and {Ioffe}, S. and {Vanhoucke}, V. and
{Alemi}, A.",
eprint = "1602.07261",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = feb,
primaryClass = "cs.CV",
title = "{Inception-V4, Inception-Resnet and the Impact of Residual
Connections on Learning}",
year = 2016
}
@ARTICLE{lin-2017-struc-self,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170303130L",
archivePrefix= "arXiv",
author = "{Lin}, Z. and {Feng}, M. and {Nogueira dos Santos}, C. and
{Yu}, M. and {Xiang}, B. and {Zhou}, B. and {Bengio}, Y.",
eprint = "1703.03130",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing",
month = mar,
primaryClass = "cs.CL",
title = "{A Structured Self-Attentive Sentence Embedding}",
year = 2017
}
@ARTICLE{memisevic-2011-learn-to-relat-images,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2011arXiv1110.0107M",
archivePrefix= "arXiv",
author = "{Memisevic}, R.",
eprint = "1110.0107",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Artificial Intelligence, Nonlinear
Sciences - Adaptation and Self-Organizing Systems, Statistics
- Machine Learning",
month = oct,
primaryClass = "cs.CV",
title = "{Learning To Relate Images: Mapping Units, Complex Cells and
Simultaneous eigenspaces}",
year = 2011
}
@ARTICLE{cheng-2016-long-short,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160106733C",
archivePrefix= "arXiv",
author = "{Cheng}, J. and {Dong}, L. and {Lapata}, M.",
eprint = "1601.06733",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = jan,
primaryClass = "cs.CL",
title = "{Long Short-Term Memory-Networks for Machine Reading}",
year = 2016
}
@ARTICLE{paulus-2017-deep-reinf,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170504304P",
archivePrefix= "arXiv",
author = "{Paulus}, R. and {Xiong}, C. and {Socher}, R.",
eprint = "1705.04304",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = may,
primaryClass = "cs.CL",
title = "{A Deep Reinforced Model for Abstractive Summarization}",
year = 2017
}
@ARTICLE{shen-2016-reason,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160905284S",
archivePrefix= "arXiv",
author = "{Shen}, Y. and {Huang}, P.-S. and {Gao}, J. and {Chen}, W.",
eprint = "1609.05284",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = sep,
primaryClass = "cs.LG",
title = "{ReasoNet: Learning To Stop Reading in Machine
Comprehension}",
year = 2016
}
@ARTICLE{golub-2017-two-stage,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170609789G",
archivePrefix= "arXiv",
author = "{Golub}, D. and {Huang}, P.-S. and {He}, X. and {Deng}, L.",
eprint = "1706.09789",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL",
title = "{Two-Stage Synthesis Networks for Transfer Learning in
Machine Comprehension}",
year = 2017
}
@ARTICLE{miller-2016-key-value,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160603126M",
archivePrefix= "arXiv",
author = "{Miller}, A. and {Fisch}, A. and {Dodge}, J. and {Karimi},
A.-H. and {Bordes}, A. and {Weston}, J.",
eprint = "1606.03126",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL",
title = "{Key-Value Memory Networks for Directly Reading Documents}",
year = 2016
}
@ARTICLE{zhang-2016-quest-answer,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160600979Z",
archivePrefix= "arXiv",
author = "{Zhang}, Y. and {Liu}, K. and {He}, S. and {Ji}, G. and
{Liu}, Z. and {Wu}, H. and {Zhao}, J.",
eprint = "1606.00979",
journal = "ArXiv e-prints",
keywords = "Computer Science - Information Retrieval, Computer Science -
Artificial Intelligence, Computer Science - Computation and
Language, Computer Science - Neural and Evolutionary
Computing",
month = jun,
primaryClass = "cs.IR",
title = "{Question Answering Over Knowledge Base With Neural Attention
Combining Global Knowledge Information}",
year = 2016
}
@ARTICLE{nguyen-2016-ms-marco,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161109268N",
archivePrefix= "arXiv",
author = "{Nguyen}, T. and {Rosenberg}, M. and {Song}, X. and {Gao},
J. and {Tiwary}, S. and {Majumder}, R. and {Deng}, L.",
eprint = "1611.09268",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Information Retrieval",
month = nov,
primaryClass = "cs.CL",
title = "{MS Marco: A Human Generated Machine Reading Comprehension
Dataset}",
year = 2016
}
@ARTICLE{zhang-2017-inter-convol-neural-networ,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171000935Z",
archivePrefix= "arXiv",
author = "{Zhang}, Q. and {Nian Wu}, Y. and {Zhu}, S.-C.",
eprint = "1710.00935",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = oct,
primaryClass = "cs.CV",
title = "{Interpretable Convolutional Neural Networks}",
year = 2017
}
@ARTICLE{mnih-2012-fast-simpl,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1206.6426M",
archivePrefix= "arXiv",
author = "{Mnih}, A. and {Whye Teh}, Y.",
eprint = "1206.6426",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jun,
primaryClass = "cs.CL",
title = "{A Fast and Simple Algorithm for Training Neural
Probabilistic Language Models}",
year = 2012
}
@ARTICLE{pagliardini-2017-unsup-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170302507P",
archivePrefix= "arXiv",
author = "{Pagliardini}, M. and {Gupta}, P. and {Jaggi}, M.",
eprint = "1703.02507",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Information Retrieval, I.2.7",
month = mar,
primaryClass = "cs.CL",
title = "{Unsupervised Learning of Sentence Embeddings Using
Compositional N-Gram Features}",
year = 2017
}
@ARTICLE{palangi-2015-deep-senten,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150206922P",
archivePrefix= "arXiv",
author = "{Palangi}, H. and {Deng}, L. and {Shen}, Y. and {Gao}, J. and
{He}, X. and {Chen}, J. and {Song}, X. and {Ward}, R.",
eprint = "1502.06922",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Information Retrieval, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing",
month = feb,
primaryClass = "cs.CL",
title = "{Deep Sentence Embedding Using Long Short-Term Memory
Networks: Analysis and Application To Information Retrieval}",
year = 2015
}
@ARTICLE{maillard-2017-joint-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170509189M",
archivePrefix= "arXiv",
author = "{Maillard}, J. and {Clark}, S. and {Yogatama}, D.",
eprint = "1705.09189",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = may,
primaryClass = "cs.CL",
title = "{Jointly Learning Sentence Embeddings and Syntax With
Unsupervised Tree-LSTMs}",
year = 2017
}
@ARTICLE{dai-2015-semi-super-sequen-learn,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151101432D",
archivePrefix= "arXiv",
author = "{Dai}, A.~M. and {Le}, Q.~V.",
eprint = "1511.01432",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = nov,
primaryClass = "cs.LG",
title = "{Semi-Supervised Sequence Learning}",
year = 2015
}
@ARTICLE{luong-2015-multi-task-seq2seq,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106114L",
archivePrefix= "arXiv",
author = "{Luong}, M.-T. and {Le}, Q.~V. and {Sutskever}, I. and
{Vinyals}, O. and {Kaiser}, L.",
eprint = "1511.06114",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Statistics - Machine Learning",
month = nov,
primaryClass = "cs.LG",
title = "{Multi-Task Sequence To Sequence Learning}",
year = 2015
}
@ARTICLE{li-2015-hierar-neural,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150601057L",
archivePrefix= "arXiv",
author = "{Li}, J. and {Luong}, M.-T. and {Jurafsky}, D.",
eprint = "1506.01057",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL",
title = "{A Hierarchical Neural Autoencoder for Paragraphs and
Documents}",
year = 2015
}
@ARTICLE{hill-2016-learn-distr,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160203483H",
archivePrefix= "arXiv",
author = "{Hill}, F. and {Cho}, K. and {Korhonen}, A.",
eprint = "1602.03483",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = feb,
primaryClass = "cs.CL",
title = "{Learning Distributed Representations of Sentences From
Unlabelled Data}",
year = 2016
}
@ARTICLE{wieting-2015-towar-univer,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151108198W",
archivePrefix= "arXiv",
author = "{Wieting}, J. and {Bansal}, M. and {Gimpel}, K. and
{Livescu}, K.",
eprint = "1511.08198",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = nov,
primaryClass = "cs.CL",
title = "{Towards Universal Paraphrastic Sentence Embeddings}",
year = 2015
}
@ARTICLE{agrawal-2015-vqa,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150500468A",
archivePrefix= "arXiv",
author = "{Agrawal}, A. and {Lu}, J. and {Antol}, S. and {Mitchell},
M. and {Zitnick}, C.~L. and {Batra}, D. and {Parikh}, D.",
eprint = "1505.00468",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Computer Vision and Pattern Recognition",
month = may,
primaryClass = "cs.CL",
title = "{VQA: Visual Question Answering}",
year = 2015
}
@ARTICLE{zhang-2015-yin-yang,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151105099Z",
archivePrefix= "arXiv",
author = "{Zhang}, P. and {Goyal}, Y. and {Summers-Stay}, D. and
{Batra}, D. and {Parikh}, D.",
eprint = "1511.05099",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Computer Vision and Pattern Recognition, Computer Science - Learning",
month = nov,
primaryClass = "cs.CL",
title = "{Yin and Yang: Balancing and Answering Binary Visual
Questions}",
year = 2015
}
@ARTICLE{goyal-2016-makin-v-vqa-matter,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161200837G",
archivePrefix= "arXiv",
author = "{Goyal}, Y. and {Khot}, T. and {Summers-Stay}, D. and
{Batra}, D. and {Parikh}, D.",
eprint = "1612.00837",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Artificial Intelligence, Computer Science
- Computation and Language, Computer Science - Learning",
month = dec,
primaryClass = "cs.CV",
title = "{Making the V in Vqa Matter: Elevating the Role of Image
Understanding in Visual Question Answering}",
year = 2016
}
@ARTICLE{bowman-2015-gener-senten,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106349B",
archivePrefix= "arXiv",
author = "{Bowman}, S.~R. and {Vilnis}, L. and {Vinyals}, O. and {Dai},
A.~M. and {Jozefowicz}, R. and {Bengio}, S.",
eprint = "1511.06349",
journal = "ArXiv e-prints",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = nov,
primaryClass = "cs.LG",
title = "{Generating Sentences From a Continuous Space}",
year = 2015
}
@inproceedings{maas-2011-learning-word,
title = "Learning word vectors for sentiment analysis",
author = "Maas, Andrew L and Daly, Raymond E and Pham, Peter T and
Huang, Dan and Ng, Andrew Y and Potts, Christopher",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies-Volume
1",
pages = "142-150",
year = 2011,
organization = "Association for Computational Linguistics"
}
@inproceedings{ganitkevitch-2013-ppdb,
title = "{PPDB}: The Paraphrase Database",
author = "Ganitkevitch, Juri and {Van Durme}, Benjamin and
Callison-Burch, Chris",
booktitle = "Proceedings of NAACL-HLT",
pages = "758-764",
month = "June",
year = 2013,
address = "Atlanta, Georgia",
publisher = "Association for Computational Linguistics",
url = "http://cs.jhu.edu/~ccb/publications/ppdb.pdf"
}
@ARTICLE{mrk-2016-count-fittin,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160300892M",
archivePrefix= "arXiv",
author = "{Mrk{\v s}i{\'c}}, N. and {S{\'e}aghdha}, D.~{\'O} and
{Thomson}, B. and {Ga{\v s}i{\'c}}, M. and {Rojas-Barahona},
L. and {Su}, P.-H. and {Vandyke}, D. and {Wen}, T.-H. and
{Young}, S.",
eprint = "1603.00892",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = mar,
primaryClass = "cs.CL",
title = "{Counter-Fitting Word Vectors To Linguistic Constraints}",
year = 2016
}
@ARTICLE{hill-2014-simlex,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1408.3456H",
archivePrefix= "arXiv",
author = "{Hill}, F. and {Reichart}, R. and {Korhonen}, A.",
eprint = "1408.3456",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL",
title = "{SimLex-999: Evaluating Semantic Models With (Genuine)
Similarity Estimation}",
year = 2014
}
@inproceedings{agirre-2009-study-similarity,
title = "A study on similarity and relatedness using distributional
and wordnet-based approaches",
author = "Agirre, Eneko and Alfonseca, Enrique and Hall, Keith and
Kravalova, Jana and Pa{\c{s}}ca, Marius and Soroa, Aitor",
booktitle = "Proceedings of Human Language Technologies: The 2009 Annual
Conference of the North American Chapter of the Association
for Computational Linguistics",
pages = "19-27",
year = 2009,
organization = "Association for Computational Linguistics"
}
@article{marelli-2014-sick-cure,
title = {A SICK cure for the evaluation of compositional distributional
semantic models},
author = {Marelli, M and Menini, S and Baroni, M and Bentivogli, L and
Bernardi, R and Zamparelli, R},
year = 2014,
publisher    = {Citeseer}
}
@inproceedings{severyn-2015-learning-rank,
title = "Learning to rank short text pairs with convolutional deep
neural networks",
author = "Severyn, Aliaksei and Moschitti, Alessandro",
booktitle = "Proceedings of the 38th International ACM SIGIR Conference on
Research and Development in Information Retrieval",
pages = "373-382",
year = 2015,
organization = "ACM"
}
@inproceedings{huang-2016-supervised-word,
Author = "Huang, Gao and Guo, Chuan and Kusner, Matt J and Sun, Yu and
Sha, Fei and Weinberger, Kilian Q",
Booktitle = "Advances in Neural Information Processing Systems 29",
Editor = "D. D. Lee and M. Sugiyama and U. V. Luxburg and I. Guyon and
R. Garnett",
Pages = "4862-4870",
Publisher = "Curran Associates, Inc.",
Title = "Supervised Word Mover\textquotesingle s Distance",
Url =
"http://papers.nips.cc/paper/6139-supervised-word-movers-distance.pdf",
Year = 2016,
Bdsk-Url-1 =
"http://papers.nips.cc/paper/6139-supervised-word-movers-distance.pdf"
}
@ARTICLE{sennrich-2015-neural-machin,
author = "{Sennrich}, R. and {Haddow}, B. and {Birch}, A.",
title = "{Neural Machine Translation of Rare Words With Subword
Units}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150807909S",
archivePrefix= "arXiv",
eprint = "1508.07909",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL"
}
@ARTICLE{ling-2015-findin-funct-form,
author = "{Ling}, W. and {Lu{\'{\i}}s}, T. and {Marujo}, L. and
{Fernandez Astudillo}, R. and {Amir}, S. and {Dyer}, C. and
{Black}, A.~W. and {Trancoso}, I.",
title = "{Finding Function in Form: Compositional Character Models for
Open Vocabulary Word Representation}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150802096L",
archivePrefix= "arXiv",
eprint = "1508.02096",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL"
}
@inproceedings{kim-2016-char-aware,
title = "Character-Aware Neural Language Models.",
author = "Kim, Yoon and Jernite, Yacine and Sontag, David and Rush,
Alexander M",
booktitle = "AAAI",
pages = "2741-2749",
year = 2016
}
@article{achananuparp-2008-evaluation-sentence,
title = "The evaluation of sentence similarity measures",
author = "Achananuparp, Palakorn and Hu, Xiaohua and Shen, Xiajiong",
journal = "Data warehousing and knowledge discovery",
pages = "305-316",
year = 2008,
publisher = "Springer"
}
@ARTICLE{bradbury-2016-quasi-recur-neural-networ,
author = "{Bradbury}, J. and {Merity}, S. and {Xiong}, C. and {Socher},
R.",
title = "{Quasi-Recurrent Neural Networks}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161101576B",
archivePrefix= "arXiv",
eprint = "1611.01576",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Artificial Intelligence, Computer Science
- Computation and Language, Computer Science - Learning",
month = nov
}
@ARTICLE{ballesteros-2015-improv-trans,
author = "{Ballesteros}, M. and {Dyer}, C. and {Smith}, N.~A.",
title = "{Improved Transition-Based Parsing By Modeling Characters
Instead of Words With LSTMs}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150800657B",
archivePrefix= "arXiv",
eprint = "1508.00657",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL"
}
@ARTICLE{wiseman-2016-sequen-to,
author = "{Wiseman}, S. and {Rush}, A.~M.",
title = "{Sequence-To-Sequence Learning As Beam-Search Optimization}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160602960W",
archivePrefix= "arXiv",
eprint = "1606.02960",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing, Statistics
- Machine Learning",
month = jun,
primaryClass = "cs.CL"
}
@ARTICLE{snoek-2012-pract-bayes,
author = "{Snoek}, J. and {Larochelle}, H. and {Adams}, R.~P.",
title = "{Practical Bayesian Optimization of Machine Learning
Algorithms}",
journal = "ArXiv e-prints",
year = 2012,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2012arXiv1206.2944S",
archivePrefix= "arXiv",
eprint = "1206.2944",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = jun,
primaryClass = "stat.ML"
}
@article{hashimoto-2016-word-embed,
title = "Word embeddings as metric recovery in semantic spaces",
author = "Hashimoto, Tatsunori B and Alvarez-Melis, David and Jaakkola,
Tommi S",
journal = "Transactions of the Association for Computational
Linguistics",
volume = 4,
pages = "273-286",
year = 2016
}
@inproceedings{mnih-2007-three-graph,
title = "Three new graphical models for statistical language
modelling",
author = "Mnih, Andriy and Hinton, Geoffrey",
booktitle = "Proceedings of the 24th international conference on Machine
learning",
pages = "641-648",
year = 2007,
organization = "ACM"
}
@ARTICLE{chawla-2011-smote,
author = "{Chawla}, N.~V. and {Bowyer}, K.~W. and {Hall}, L.~O. and
{Kegelmeyer}, W.~P.",
title = "{SMOTE: Synthetic Minority Over-Sampling Technique}",
journal = "ArXiv e-prints",
year = 2011,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2011arXiv1106.1813C",
archivePrefix= "arXiv",
eprint = "1106.1813",
keywords = "Computer Science - Artificial Intelligence",
month = jun,
primaryClass = "cs.AI"
}
@inproceedings{klein-2001-parsing-treebank,
title = "Parsing with treebank grammars: Empirical bounds, theoretical
models, and the structure of the Penn treebank",
author = "Klein, Dan and Manning, Christopher D",
booktitle = "Proceedings of the 39th Annual Meeting on Association for
Computational Linguistics",
pages = "338-345",
year = 2001,
organization = "Association for Computational Linguistics"
}
@article{collins-2003-head-driven,
title = "Head-driven statistical models for natural language parsing",
author = "Collins, Michael",
journal = "Computational linguistics",
volume = 29,
number = 4,
pages = "589-637",
year = 2003,
publisher = "MIT Press"
}
@inproceedings{collins-1997-three-generative,
title = "Three generative, lexicalised models for statistical parsing",
author = "Collins, Michael",
booktitle = "Proceedings of the eighth conference on European chapter of
the Association for Computational Linguistics",
pages = "16-23",
year = 1997,
organization = "Association for Computational Linguistics"
}
@inproceedings{bikel-2004-distributional-analysis,
title = "A Distributional Analysis of a Lexicalized Statistical
Parsing Mode.",
author = "Bikel, Daniel M",
booktitle = "EMNLP",
pages = "182-189",
year = 2004
}
@inproceedings{chen-2014-fast-acc,
title = "A fast and accurate dependency parser using neural networks",
author = "Chen, Danqi and Manning, Christopher",
booktitle = "Proceedings of the 2014 conference on empirical methods in
natural language processing (EMNLP)",
pages = "740-750",
year = 2014
}
@inproceedings{socher-2013-su-rnn,
title = "Parsing with compositional vector grammars",
author = "Socher, Richard and Bauer, John and Manning, Christopher D
and others",
booktitle = "Proceedings of the 51st Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "455-465",
year = 2013
}
@inproceedings{levy-2003-harder-parse,
title = "Is it harder to parse Chinese, or the Chinese Treebank?",
author = "Levy, Roger and Manning, Christopher",
booktitle = "Proceedings of the 41st Annual Meeting on Association for
Computational Linguistics-Volume 1",
pages = "439-446",
year = 2003,
organization = "Association for Computational Linguistics"
}
@inproceedings{chang-2009-discriminative-reorder,
title = "Discriminative reordering with Chinese grammatical relations
features",
author = "Chang, Pi-Chuan and Tseng, Huihsin and Jurafsky, Dan and
Manning, Christopher D",
booktitle = "Proceedings of the Third Workshop on Syntax and Structure in
Statistical Translation",
pages = "51-59",
year = 2009,
organization = "Association for Computational Linguistics"
}
@inproceedings{zhu-2013-fast-acc,
title = "Fast and Accurate Shift-Reduce Constituent Parsing.",
author = "Zhu, Muhua and Zhang, Yue and Chen, Wenliang and Zhang, Min
and Zhu, Jingbo",
booktitle = "ACL (1)",
pages = "434-443",
year = 2013
}
@inproceedings{klein-2003-accurate-unlex,
title = "Accurate unlexicalized parsing",
author = "Klein, Dan and Manning, Christopher D",
booktitle = "Proceedings of the 41st Annual Meeting on Association for
Computational Linguistics-Volume 1",
pages = "423-430",
year = 2003,
organization = "Association for Computational Linguistics"
}
@inproceedings{klein-2003-fast-exact,
title = "Fast exact inference with a factored model for natural
language parsing",
author = "Klein, Dan and Manning, Christopher D",
booktitle = "Advances in neural information processing systems",
pages = "3-10",
year = 2003
}
@inproceedings{nivre-2016-universal-depend,
title = "Universal Dependencies v1: A Multilingual Treebank
Collection.",
author = "Nivre, Joakim and de Marneffe, Marie-Catherine and Ginter,
Filip and Goldberg, Yoav and Hajic, Jan and Manning,
Christopher D and McDonald, Ryan T and Petrov, Slav and
Pyysalo, Sampo and Silveira, Natalia and others",
booktitle = "LREC",
year = 2016
}
@inproceedings{de-2006-generating-typed,
title = "Generating typed dependency parses from phrase structure
parses",
author = "De Marneffe, Marie-Catherine and MacCartney, Bill and
Manning, Christopher D and others",
booktitle = "Proceedings of LREC",
volume = 6,
number = 2006,
pages = "449-454",
year = 2006,
organization = "Genoa Italy"
}
@ARTICLE{Krotov-1999-compact-penn,
author = "{Krotov}, A. and {Hepple}, M. and {Gaizauskas}, R. and
{Wilks}, Y.",
title = "{Compacting the Penn Treebank Grammar}",
journal = "eprint arXiv:cs/9902001",
eprint = "cs/9902001",
keywords = "Computer Science - Computation and Language, I.2.7",
year = 1999,
month = jan,
adsurl = "http://adsabs.harvard.edu/abs/1999cs........2001K",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{toutanova-2000-enriching-knowledge,
title = "Enriching the knowledge sources used in a maximum entropy
part-of-speech tagger",
author = "Toutanova, Kristina and Manning, Christopher D",
booktitle = "Proceedings of the 2000 Joint SIGDAT conference on Empirical
methods in natural language processing and very large
corpora: held in conjunction with the 38th Annual Meeting of
the Association for Computational Linguistics-Volume 13",
pages = "63-70",
year = 2000,
organization = "Association for Computational Linguistics"
}
@inproceedings{toutanova-2003-feature-rich,
title = "Feature-rich part-of-speech tagging with a cyclic dependency
network",
author = "Toutanova, Kristina and Klein, Dan and Manning, Christopher D
and Singer, Yoram",
booktitle = "Proceedings of the 2003 Conference of the North American
Chapter of the Association for Computational Linguistics on
Human Language Technology-Volume 1",
pages = "173-180",
year = 2003,
organization = "Association for Computational Linguistics"
}
@ARTICLE{chen-2016-thoroug-examin,
author = "{Chen}, D. and {Bolton}, J. and {Manning}, C.~D.",
title = "{A Thorough Examination of the Cnn/daily Mail Reading
Comprehension Task}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160602858C",
archivePrefix= "arXiv",
eprint = "1606.02858",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence",
month = jun,
primaryClass = "cs.CL"
}
@ARTICLE{dhingra-2016-gated-atten,
author = "{Dhingra}, B. and {Liu}, H. and {Yang}, Z. and {Cohen},
W.~W. and {Salakhutdinov}, R.",
title = "{Gated-Attention Readers for Text Comprehension}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160601549D",
archivePrefix= "arXiv",
eprint = "1606.01549",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jun,
primaryClass = "cs.CL"
}
@ARTICLE{kadlec-2016-text-under,
author = "{Kadlec}, R. and {Schmid}, M. and {Bajgar}, O. and
{Kleindienst}, J.",
title = "{Text Understanding With the Attention Sum Reader Network}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160301547K",
archivePrefix= "arXiv",
eprint = "1603.01547",
keywords = "Computer Science - Computation and Language",
month = mar,
primaryClass = "cs.CL"
}
@ARTICLE{tseng-2016-towar-machin,
author = "{Tseng}, B.-H. and {Shen}, S.-S. and {Lee}, H.-Y. and {Lee},
L.-S.",
title = "{Towards Machine Comprehension of Spoken Content: Initial
Toefl Listening Comprehension Test By Machine}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160806378T",
archivePrefix= "arXiv",
eprint = "1608.06378",
keywords = "Computer Science - Computation and Language",
month = aug,
primaryClass = "cs.CL"
}
@ARTICLE{cui-2016-consen-atten,
author = "{Cui}, Y. and {Liu}, T. and {Chen}, Z. and {Wang}, S. and
{Hu}, G.",
title = "{Consensus Attention-Based Neural Networks for Chinese
Reading Comprehension}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160702250C",
archivePrefix= "arXiv",
eprint = "1607.02250",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = jul,
primaryClass = "cs.CL"
}
@ARTICLE{cui-2016-atten-over,
author = "{Cui}, Y. and {Chen}, Z. and {Wei}, S. and {Wang}, S. and
{Liu}, T. and {Hu}, G.",
title = "{Attention-Over-Attention Neural Networks for Reading
Comprehension}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160704423C",
archivePrefix= "arXiv",
eprint = "1607.04423",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = jul,
primaryClass = "cs.CL"
}
@ARTICLE{wang-2016-machin-compr,
author = "{Wang}, S. and {Jiang}, J.",
title = "{Machine Comprehension Using Match-Lstm and Answer Pointer}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160807905W",
archivePrefix= "arXiv",
eprint = "1608.07905",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence",
month = aug,
primaryClass = "cs.CL"
}
@ARTICLE{sordoni-2016-iterat-alter,
author = "{Sordoni}, A. and {Bachman}, P. and {Trischler}, A. and
{Bengio}, Y.",
title = "{Iterative Alternating Neural Attention for Machine Reading}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160602245S",
archivePrefix= "arXiv",
eprint = "1606.02245",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = jun,
primaryClass = "cs.CL"
}
@inproceedings{kobayashi-2016-dynamic-entity,
title = "Dynamic Entity Representation with Max-pooling Improves
Machine Reading",
author = "Kobayashi, Sosuke and Tian, Ran and Okazaki, Naoaki and Inui,
Kentaro",
booktitle = "Proceedings of NAACL-HLT",
pages = "850-855",
year = 2016
}
@ARTICLE{trischler-2016-natur-languag,
author = "{Trischler}, A. and {Ye}, Z. and {Yuan}, X. and {Suleman},
K.",
title = "{Natural Language Comprehension With the EpiReader}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160602270T",
archivePrefix= "arXiv",
eprint = "1606.02270",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL"
}
@ARTICLE{creswell-2017-gener-adver-networ,
author = "{Creswell}, A. and {White}, T. and {Dumoulin}, V. and
{Arulkumaran}, K. and {Sengupta}, B. and {Bharath}, A.~A",
title = "{Generative Adversarial Networks: An Overview}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171007035C",
archivePrefix= "arXiv",
eprint = "1710.07035",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = oct,
primaryClass = "cs.CV"
}
@ARTICLE{weston-2014-memor-networ,
author = "{Weston}, J. and {Chopra}, S. and {Bordes}, A.",
title = "{Memory Networks}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1410.3916W",
archivePrefix= "arXiv",
eprint = "1410.3916",
keywords = "Computer Science - Artificial Intelligence, Computer Science
- Computation and Language, Statistics - Machine Learning",
month = oct,
primaryClass = "cs.AI"
}
@ARTICLE{munkhdalai-2016-neural-seman-encod,
author = "{Munkhdalai}, T. and {Yu}, H.",
title = "{Neural Semantic Encoders}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160704315M",
archivePrefix= "arXiv",
eprint = "1607.04315",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Statistics - Machine Learning",
month = jul,
primaryClass = "cs.LG"
}
@ARTICLE{nickel-2017-poinc-embed,
author = "{Nickel}, M. and {Kiela}, D.",
title = "{Poincar$\backslash$'e Embeddings for Learning Hierarchical
Representations}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170508039N",
archivePrefix= "arXiv",
eprint = "1705.08039",
keywords = "Computer Science - Artificial Intelligence, Computer Science
- Learning, Statistics - Machine Learning",
month = may,
primaryClass = "cs.AI"
}
@ARTICLE{weston-2015-towar-ai,
author = "{Weston}, J. and {Bordes}, A. and {Chopra}, S. and {Rush},
A.~M. and {van Merri{\"e}nboer}, B. and {Joulin}, A. and
{Mikolov}, T.",
title = "{Towards Ai-Complete Question Answering: A Set of
Prerequisite Toy Tasks}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150205698W",
archivePrefix= "arXiv",
eprint = "1502.05698",
keywords = "Computer Science - Artificial Intelligence, Computer Science
- Computation and Language, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.AI"
}
@ARTICLE{sabour-2017-dynam-routin-between-capsul,
author = "{Sabour}, S. and {Frosst}, N. and {E Hinton}, G.",
title = "{Dynamic Routing Between Capsules}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171009829S",
archivePrefix= "arXiv",
eprint = "1710.09829",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
month = oct,
primaryClass = "cs.CV"
}
@ARTICLE{lu-2017-depth-creat,
author = "{Lu}, H. and {Kawaguchi}, K.",
title = "{Depth Creates No Bad Local Minima}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170208580L",
archivePrefix= "arXiv",
eprint = "1702.08580",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing, Mathematics - Optimization and
Control, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.LG"
}
@ARTICLE{kawaguchi-2017-gener-deep-learn,
author = "{Kawaguchi}, K. and {Pack Kaelbling}, L. and {Bengio}, Y.",
title = "{Generalization in Deep Learning}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171005468K",
archivePrefix= "arXiv",
eprint = "1710.05468",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing",
month = oct,
primaryClass = "stat.ML"
}
@article{wolpert-1997-no-free-lunch,
title = "No free lunch theorems for optimization",
author = "Wolpert, David H and Macready, William G",
journal = "IEEE transactions on evolutionary computation",
volume = 1,
number = 1,
pages = "67-82",
year = 1997,
publisher = "IEEE"
}
@inproceedings{recasens-2013-life-death,
title = "The Life and Death of Discourse Entities: Identifying
Singleton Mentions.",
author = "Recasens, Marta and de Marneffe, Marie-Catherine and Potts,
Christopher",
year = 2013,
booktitle = "''"
}
@inproceedings{lee-2011-stanford-multi-pass,
title = "Stanford's multi-pass sieve coreference resolution system at
the CoNLL-2011 shared task",
author = "Lee, Heeyoung and Peirsman, Yves and Chang, Angel and
Chambers, Nathanael and Surdeanu, Mihai and Jurafsky, Dan",
booktitle = "Proceedings of the fifteenth conference on computational
natural language learning: Shared task",
pages = "28-34",
year = 2011,
organization = "Association for Computational Linguistics"
}
@inproceedings{raghunathan-2010-multi-pass-sieve,
title = "A multi-pass sieve for coreference resolution",
author = "Raghunathan, Karthik and Lee, Heeyoung and Rangarajan,
Sudarshan and Chambers, Nathanael and Surdeanu, Mihai and
Jurafsky, Dan and Manning, Christopher",
booktitle = "Proceedings of the 2010 Conference on Empirical Methods in
Natural Language Processing",
pages = "492-501",
year = 2010,
organization = "Association for Computational Linguistics"
}
@article{lee-2013-deterministic-coreference,
title = "Deterministic coreference resolution based on entity-centric,
precision-ranked rules",
author = "Lee, Heeyoung and Chang, Angel and Peirsman, Yves and
Chambers, Nathanael and Surdeanu, Mihai and Jurafsky, Dan",
journal = "Computational Linguistics",
volume = 39,
number = 4,
pages = "885-916",
year = 2013,
publisher = "MIT Press"
}
@inproceedings{clark-2015-entity-centric,
title = "Entity-Centric Coreference Resolution with Model Stacking.",
author = "Clark, Kevin and Manning, Christopher D",
booktitle = "ACL (1)",
pages = "1405-1415",
year = 2015
}
@ARTICLE{clark-2016-rl-for-cr,
author = "{Clark}, K. and {Manning}, C.~D.",
title = "{Deep Reinforcement Learning for Mention-Ranking Coreference
Models}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160908667C",
archivePrefix= "arXiv",
eprint = "1609.08667",
keywords = "Computer Science - Computation and Language",
month = sep,
primaryClass = "cs.CL"
}
@ARTICLE{clark-2016-improv-coref,
author = "{Clark}, K. and {Manning}, C.~D.",
title = "{Improving Coreference Resolution By Learning Entity-Level
Distributed Representations}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160601323C",
archivePrefix= "arXiv",
eprint = "1606.01323",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL"
}
@InProceedings{recasens-2013-same-referent,
author = "Recasens, Marta and Can, Matthew and Jurafsky, Daniel",
title = "Same Referent, Different Words: Unsupervised Mining of Opaque
Coreferent Mentions",
booktitle = "Proceedings of the 2013 Conference of the North American
Chapter of the Association for Computational Linguistics:
Human Language Technologies",
year = 2013,
publisher = "Association for Computational Linguistics",
pages = "897-906",
location = "Atlanta, Georgia",
url =
"http://aclanthology.coli.uni-saarland.de/pdf/N/N13/N13-1110.pdf"
}
@inproceedings{lee-2012-joint-entity,
title = "Joint entity and event coreference resolution across
documents",
author = "Lee, Heeyoung and Recasens, Marta and Chang, Angel and
Surdeanu, Mihai and Jurafsky, Dan",
booktitle = "Proceedings of the 2012 Joint Conference on Empirical Methods
in Natural Language Processing and Computational Natural
Language Learning",
pages = "489-500",
year = 2012,
organization = "Association for Computational Linguistics"
}
@ARTICLE{lee-2017-end-to,
author = "{Lee}, K. and {He}, L. and {Lewis}, M. and {Zettlemoyer}, L.",
title = "{End-To-End Neural Coreference Resolution}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170707045L",
archivePrefix= "arXiv",
eprint = "1707.07045",
keywords = "Computer Science - Computation and Language",
month = jul,
primaryClass = "cs.CL"
}
@ARTICLE{radford-2017-learn-to,
author = "{Radford}, A. and {Jozefowicz}, R. and {Sutskever}, I.",
title = "{Learning To Generate Reviews and Discovering Sentiment}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170401444R",
archivePrefix= "arXiv",
eprint = "1704.01444",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Computer Science - Neural and Evolutionary
Computing",
month = apr,
primaryClass = "cs.LG"
}
@ARTICLE{felbo-2017-using-million,
author = "{Felbo}, B. and {Mislove}, A. and {S{\o}gaard}, A. and
{Rahwan}, I. and {Lehmann}, S.",
title = "{Using Millions of Emoji Occurrences To Learn Any-Domain
Representations for Detecting Sentiment, Emotion and
sarcasm}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170800524F",
archivePrefix= "arXiv",
eprint = "1708.00524",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = aug,
primaryClass = "stat.ML"
}
@PHDTHESIS{hamdan-2016-under-coupl,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016PhDT........44H",
author = "{Hamdan}, L.",
school = "West Virginia University",
title = "{Understanding Coupling of Global and Diffuse Solar Radiation
with Climatic Variability}",
year = 2016
}
@ARTICLE{zhang-2017-which-encod,
author = "{Zhang}, X. and {LeCun}, Y.",
title = "{Which Encoding Is the Best for Text Classification in
Chinese, English, Japanese and Korean?}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170802657Z",
archivePrefix= "arXiv",
eprint = "1708.02657",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = aug,
primaryClass = "cs.CL"
}
@article{liu-2012-sentiment-analysis,
title = "Sentiment analysis and opinion mining",
author = "Liu, Bing",
journal = "Synthesis lectures on human language technologies",
volume = 5,
number = 1,
pages = "1-167",
year = 2012,
publisher = "Morgan \& Claypool Publishers"
}
@article{pang-2008-opinion-mining,
title = "Opinion mining and sentiment analysis",
author = "Pang, Bo and Lee, Lillian and others",
journal = "Foundations and Trends{\textregistered} in Information
Retrieval",
volume = 2,
number = "1--2",
pages = "1-135",
year = 2008,
publisher = "Now Publishers, Inc."
}
@ARTICLE{rajpurkar-2016-squad,
author = "{Rajpurkar}, P. and {Zhang}, J. and {Lopyrev}, K. and
{Liang}, P.",
title = "{SQuAD: 100,000+ Questions for Machine Comprehension of
Text}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160605250R",
archivePrefix= "arXiv",
eprint = "1606.05250",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL"
}
@ARTICLE{miwa-2016-end-to,
author = "{Miwa}, M. and {Bansal}, M.",
title = "{End-To-End Relation Extraction Using Lstms on Sequences and
Tree Structures}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160100770M",
archivePrefix= "arXiv",
eprint = "1601.00770",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = jan,
primaryClass = "cs.CL"
}
@ARTICLE{kumar-2017-survey-deep,
author = "{Kumar}, S.",
title = "{A Survey of Deep Learning Methods for Relation Extraction}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170503645K",
archivePrefix= "arXiv",
eprint = "1705.03645",
keywords = "Computer Science - Computation and Language",
month = may,
primaryClass = "cs.CL"
}
@inproceedings{lin-2016-neural-relation,
title = "Neural Relation Extraction with Selective Attention over
Instances.",
author = "Lin, Yankai and Shen, Shiqi and Liu, Zhiyuan and Luan, Huanbo
and Sun, Maosong",
booktitle = "ACL (1)",
year = 2016
}
@inproceedings{wu-2017-adversarial-train,
title = "Adversarial Training for Relation Extraction",
author = "Wu, Yi and Bamman, David and Russell, Stuart",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
pages = "1779-1784",
year = 2017
}
@ARTICLE{lei-2017-train-rnns,
author = "{Lei}, T. and {Zhang}, Y.",
title = "{Training Rnns As Fast As CNNs}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170902755L",
archivePrefix= "arXiv",
eprint = "1709.02755",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing",
month = sep,
primaryClass = "cs.CL"
}
@ARTICLE{rocktaeschel-2015-reason-about,
author = "{Rockt{\"a}schel}, T. and {Grefenstette}, E. and {Hermann},
K.~M. and {Ko{\v c}isk{\'y}}, T. and {Blunsom}, P.",
title = "{Reasoning About Entailment With Neural Attention}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150906664R",
archivePrefix= "arXiv",
eprint = "1509.06664",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing, 68T50, I.2.6, I.2.7",
month = sep,
primaryClass = "cs.CL"
}
@inproceedings{bowman-2015-large-annotated,
Author = "Bowman, Samuel R. and Angeli, Gabor and Potts, Christopher,
and Manning, Christopher D.",
Booktitle = "Proceedings of the 2015 Conference on Empirical Methods in
Natural Language Processing (EMNLP)",
Publisher = "Association for Computational Linguistics",
Title = "A large annotated corpus for learning natural language
inference",
Year = 2015
}
@ARTICLE{zolna-2017-fraternal-dropout,
author = "{Zolna}, K. and {Arpit}, D. and {Suhubdy}, D. and {Bengio},
Y.",
title = "{Fraternal Dropout}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171100066Z",
archivePrefix= "arXiv",
eprint = "1711.00066",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Learning",
month = oct,
primaryClass = "stat.ML"
}
@ARTICLE{vinyals-2015-order-matter,
author = "{Vinyals}, O. and {Bengio}, S. and {Kudlur}, M.",
title = "{Order Matters: Sequence To Sequence for sets}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151106391V",
archivePrefix= "arXiv",
eprint = "1511.06391",
keywords = "Statistics - Machine Learning, Computer Science - Computation
and Language, Computer Science - Learning",
month = nov,
primaryClass = "stat.ML"
}
@ARTICLE{clauset-2007-power-law,
author = "{Clauset}, A. and {Rohilla Shalizi}, C. and {Newman},
M.~E.~J.",
title = "{Power-Law Distributions in Empirical data}",
journal = "ArXiv e-prints",
year = 2007,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2007arXiv0706.1062C",
archivePrefix= "arXiv",
eprint = "0706.1062",
keywords = "Physics - Data Analysis, Statistics and Probability,
Condensed Matter - Disordered Systems and Neural Networks,
Statistics - Applications, Statistics - Methodology",
month = jun,
primaryClass = "physics.data-an"
}
@ARTICLE{gu-2016-incor-copyin,
author = "{Gu}, J. and {Lu}, Z. and {Li}, H. and {Li}, V.~O.~K.",
title = "{Incorporating Copying Mechanism in Sequence-To-Sequence
Learning}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160306393G",
archivePrefix= "arXiv",
eprint = "1603.06393",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing",
month = mar,
primaryClass = "cs.CL"
}
@ARTICLE{see-2017-get-to-point,
author = "{See}, A. and {Liu}, P.~J. and {Manning}, C.~D.",
title = "{Get To The Point: Summarization With Pointer-Generator
Networks}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170404368S",
archivePrefix= "arXiv",
eprint = "1704.04368",
keywords = "Computer Science - Computation and Language",
month = apr,
primaryClass = "cs.CL"
}
@inproceedings{he-2017-generating-natural,
title = "Generating natural answers by incorporating copying and
retrieving mechanisms in sequence-to-sequence learning",
author = "He, Shizhu and Liu, Cao and Liu, Kang and Zhao, Jun",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "199-208",
year = 2017
}
@ARTICLE{wang-2015-learn-natur,
author = "{Wang}, S. and {Jiang}, J.",
title = "{Learning Natural Language Inference With LSTM}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151208849W",
archivePrefix= "arXiv",
eprint = "1512.08849",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Neural and Evolutionary
Computing",
month = dec,
primaryClass = "cs.CL"
}
@ARTICLE{yu-2016-seqgan,
author = "{Yu}, L. and {Zhang}, W. and {Wang}, J. and {Yu}, Y.",
title = "{SeqGAN: Sequence Generative Adversarial Nets With Policy
Gradient}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160905473Y",
archivePrefix= "arXiv",
eprint = "1609.05473",
keywords = "Computer Science - Learning, Computer Science - Artificial
Intelligence",
month = sep,
primaryClass = "cs.LG"
}
@ARTICLE{gulrajani-2017-improv-train-wasser-gans,
author = "{Gulrajani}, I. and {Ahmed}, F. and {Arjovsky}, M. and
{Dumoulin}, V. and {Courville}, A.",
title = "{Improved Training of Wasserstein GANs}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170400028G",
archivePrefix= "arXiv",
eprint = "1704.00028",
keywords = "Computer Science - Learning, Statistics - Machine Learning",
month = mar,
primaryClass = "cs.LG"
}
@ARTICLE{dauphin-2016-languag-model,
author = "{Dauphin}, Y.~N. and {Fan}, A. and {Auli}, M. and {Grangier},
D.",
title = "{Language Modeling With Gated Convolutional Networks}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161208083D",
archivePrefix= "arXiv",
eprint = "1612.08083",
keywords = "Computer Science - Computation and Language",
month = dec,
primaryClass = "cs.CL"
}
@ARTICLE{kuchaiev-2017-factor-trick-lstm,
author = "{Kuchaiev}, O. and {Ginsburg}, B.",
title = "{Factorization Tricks for Lstm networks}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170310722K",
archivePrefix= "arXiv",
eprint = "1703.10722",
keywords = "Computer Science - Computation and Language, Computer Science
- Neural and Evolutionary Computing, Statistics - Machine Learning",
month = mar,
primaryClass = "cs.CL"
}
@ARTICLE{artetxe-2017-unsup-neural-machin-trans,
author = "{Artetxe}, M. and {Labaka}, G. and {Agirre}, E. and {Cho},
K.",
title = "{Unsupervised Neural Machine Translation}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171011041A",
archivePrefix= "arXiv",
eprint = "1710.11041",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Learning",
month = oct,
primaryClass = "cs.CL"
}
@inproceedings{artetxe-2016-learning-principled,
author = "Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko",
title = "Learning principled bilingual mappings of word embeddings
while preserving monolingual invariance",
booktitle = "Proceedings of the 2016 Conference on Empirical Methods in
Natural Language Processing",
year = 2016,
pages = "2289-2294"
}
@inproceedings{artetxe-2017-learning-bilingual,
author = "Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko",
title = "Learning bilingual word embeddings with (almost) no bilingual
data",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
year = 2017,
pages = "451-462"
}
@ARTICLE{sagun-2016-eigen-hessian-deep-learn,
author = "{Sagun}, L. and {Bottou}, L. and {LeCun}, Y.",
title = "{Eigenvalues of the Hessian in Deep Learning: Singularity and
Beyond}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161107476S",
archivePrefix= "arXiv",
eprint = "1611.07476",
keywords = "Computer Science - Learning",
month = nov,
primaryClass = "cs.LG"
}
@ARTICLE{zhou-2017-incep-score,
author = "{Zhou}, Z. and {Zhang}, W. and {Wang}, J.",
title = "{Inception Score, Label Smoothing, Gradient Vanishing and
-log(D(x)) Alternative}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170801729Z",
archivePrefix= "arXiv",
eprint = "1708.01729",
keywords = "Computer Science - Learning, Computer Science - Artificial
Intelligence, Computer Science - Computer Vision and Pattern
Recognition, Statistics - Machine Learning",
month = aug,
primaryClass = "cs.LG"
}
@ARTICLE{dauphin-2014-ident-attac,
author = "{Dauphin}, Y. and {Pascanu}, R. and {Gulcehre}, C. and {Cho},
K. and {Ganguli}, S. and {Bengio}, Y.",
title = "{Identifying and Attacking the Saddle Point Problem in
High-Dimensional Non-Convex optimization}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1406.2572D",
archivePrefix= "arXiv",
eprint = "1406.2572",
keywords = "Computer Science - Learning, Mathematics - Optimization and
Control, Statistics - Machine Learning",
month = jun,
primaryClass = "cs.LG"
}
@ARTICLE{bottou-2016-optim-method,
author = "{Bottou}, L. and {Curtis}, F.~E. and {Nocedal}, J.",
title = "{Optimization Methods for Large-Scale Machine Learning}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160604838B",
archivePrefix= "arXiv",
eprint = "1606.04838",
keywords = "Statistics - Machine Learning, Computer Science - Learning,
Mathematics - Optimization and Control",
month = jun,
primaryClass = "stat.ML"
}
@ARTICLE{berahas-2016-multi-batch,
author = "{Berahas}, A.~S. and {Nocedal}, J. and {Tak{\'a}{\v c}}, M.",
title = "{A Multi-Batch L-Bfgs Method for Machine Learning}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160506049B",
archivePrefix= "arXiv",
eprint = "1605.06049",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning, Statistics - Machine Learning",
month = may,
primaryClass = "math.OC"
}
@phdthesis{martens-2016-second-order,
title = "Second-order optimization for neural networks",
author = "Martens, James",
year = 2016,
school = "University of Toronto (Canada)"
}
@ARTICLE{mahsereci-2015-probab-line,
author = "{Mahsereci}, M. and {Hennig}, P.",
title = "{Probabilistic Line Searches for Stochastic Optimization}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150202846M",
archivePrefix= "arXiv",
eprint = "1502.02846",
keywords = "Computer Science - Learning, Mathematics - Optimization and
Control, Statistics - Machine Learning",
month = feb,
primaryClass = "cs.LG"
}
@ARTICLE{tan-2016-barzilai-borwein,
author = "{Tan}, C. and {Ma}, S. and {Dai}, Y.-H. and {Qian}, Y.",
title = "{Barzilai-Borwein Step Size for Stochastic Gradient Descent}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1605.04131",
primaryClass = "math.OC",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning, Statistics - Machine Learning",
year = 2016,
month = may,
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160504131T",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{mass-2015-speed-learn,
author = "{Mass{\'e}}, P.-Y. and {Ollivier}, Y.",
title = "{Speed learning on the fly}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1511.02540",
primaryClass = "math.OC",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning, Statistics - Machine Learning",
year = 2015,
month = nov,
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151102540M",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{moritz-2015-linear-conver,
author = "{Moritz}, P. and {Nishihara}, R. and {Jordan}, M.~I.",
title = "{A Linearly-Convergent Stochastic L-Bfgs Algorithm}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150802087M",
archivePrefix= "arXiv",
eprint = "1508.02087",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning, Mathematics - Numerical Analysis, Statistics -
Computation, Statistics - Machine Learning",
month = aug,
primaryClass = "math.OC"
}
@ARTICLE{byrd-2014-stoch-quasi,
author = "{Byrd}, R.~H. and {Hansen}, S.~L. and {Nocedal}, J. and
{Singer}, Y.",
title = "{A Stochastic Quasi-Newton Method for Large-Scale
Optimization}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1401.7020B",
archivePrefix= "arXiv",
eprint = "1401.7020",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning, Statistics - Machine Learning",
month = jan,
primaryClass = "math.OC"
}
@article{pearlmutter-1994-fast-exact,
title = "Fast exact multiplication by the Hessian",
author = "Pearlmutter, Barak A",
journal = "Neural computation",
volume = 6,
number = 1,
pages = "147-160",
year = 1994,
publisher = "MIT Press"
}
@ARTICLE{agarwal-2016-second-order,
author = "{Agarwal}, N. and {Bullins}, B. and {Hazan}, E.",
title = "{Second Order Stochastic Optimization in Linear Time}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1602.03943",
primaryClass = "stat.ML",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
year = 2016,
month = feb,
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160203943A",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{pascanu-2014-saddl-point,
author = "{Pascanu}, R. and {Dauphin}, Y.~N. and {Ganguli}, S. and
{Bengio}, Y.",
title = "{On the Saddle Point Problem for Non-Convex optimization}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1405.4604P",
archivePrefix= "arXiv",
eprint = "1405.4604",
keywords = "Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = may,
primaryClass = "cs.LG"
}
@ARTICLE{looks-2017-deep-learn,
author = "{Looks}, M. and {Herreshoff}, M. and {Hutchins}, D. and
{Norvig}, P.",
title = "{Deep Learning With Dynamic Computation Graphs}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170202181L",
archivePrefix= "arXiv",
eprint = "1702.02181",
keywords = "Computer Science - Neural and Evolutionary Computing,
Computer Science - Learning, Statistics - Machine Learning",
month = feb
}
@ARTICLE{neubig-2017-fly-operat,
author = "{Neubig}, G. and {Goldberg}, Y. and {Dyer}, C.",
title = "{On-The-Fly Operation Batching in Dynamic Computation
Graphs}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170507860N",
archivePrefix= "arXiv",
eprint = "1705.07860",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language, Statistics - Machine Learning",
month = may,
primaryClass = "cs.LG"
}
@ARTICLE{klein-2017-openn,
author = "{Klein}, G. and {Kim}, Y. and {Deng}, Y. and {Senellart},
J. and {Rush}, A.~M.",
title = "{OpenNMT: Open-Source Toolkit for Neural Machine
Translation}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170102810K",
archivePrefix= "arXiv",
eprint = "1701.02810",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Neural and Evolutionary
Computing",
month = jan,
primaryClass = "cs.CL"
}
@inproceedings{gatys-2016-image-style,
title = "Image style transfer using convolutional neural networks",
author = "Gatys, Leon A and Ecker, Alexander S and Bethge, Matthias",
booktitle = "Proceedings of the IEEE Conference on Computer Vision and
Pattern Recognition",
pages = "2414-2423",
year = 2016
}
@ARTICLE{kingma-2014-adam,
author = "{Kingma}, D.~P. and {Ba}, J.",
title = "{Adam: A Method for Stochastic Optimization}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1412.6980K",
archivePrefix= "arXiv",
eprint = "1412.6980",
keywords = "Computer Science - Learning",
month = dec,
primaryClass = "cs.LG"
}
@ARTICLE{zhang-2016-under-deep,
author = "{Zhang}, C. and {Bengio}, S. and {Hardt}, M. and {Recht},
B. and {Vinyals}, O.",
title = "{Understanding Deep Learning Requires Rethinking
generalization}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161103530Z",
archivePrefix= "arXiv",
eprint = "1611.03530",
keywords = "Computer Science - Learning",
month = nov,
primaryClass = "cs.LG"
}
@article{duchi-2011-adaptive-subgrad,
title = "Adaptive subgradient methods for online learning and
stochastic optimization",
author = "Duchi, John and Hazan, Elad and Singer, Yoram",
journal = "Journal of Machine Learning Research",
volume = 12,
number = "Jul",
pages = "2121-2159",
year = 2011
}
@inproceedings{roth-2004-feature-selection,
title = "Feature selection in clustering problems",
author = "Roth, Volker and Lange, Tilman",
booktitle = "Advances in neural information processing systems",
pages = "473-480",
year = 2004
}
@ARTICLE{liu-2017-gener-adver,
author = "{Liu}, L. and {Lu}, Y. and {Yang}, M. and {Qu}, Q. and {Zhu},
J. and {Li}, H.",
title = "{Generative Adversarial Network for Abstractive Text
Summarization}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171109357L",
archivePrefix= "arXiv",
eprint = "1711.09357",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence",
month = nov,
primaryClass = "cs.CL"
}
@ARTICLE{moussallem-2017-machin-trans,
author = "{Moussallem}, D. and {Wauer}, M. and {Ngonga Ngomo}, A.-C.",
title = "{Machine Translation Using Semantic Web Technologies: A
Survey}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171109476M",
archivePrefix= "arXiv",
eprint = "1711.09476",
keywords = "Computer Science - Computation and Language",
month = nov,
primaryClass = "cs.CL"
}
@ARTICLE{smith-2015-cyclic-lr,
author = "{Smith}, L.~N.",
title = "{Cyclical Learning Rates for Training Neural Networks}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150601186S",
archivePrefix= "arXiv",
eprint = "1506.01186",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Learning, Computer Science - Neural and
Evolutionary Computing",
month = jun,
primaryClass = "cs.CV"
}
@ARTICLE{bubeck-2014-convex-optim,
author = "{Bubeck}, S.",
title = "{Convex Optimization: Algorithms and Complexity}",
journal = "ArXiv e-prints",
year = 2014,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1405.4980B",
archivePrefix= "arXiv",
eprint = "1405.4980",
keywords = "Mathematics - Optimization and Control, Computer Science -
Computational Complexity, Computer Science - Learning,
Computer Science - Numerical Analysis, Statistics - Machine
Learning",
month = may,
primaryClass = "math.OC"
}
@ARTICLE{gu-2017-non-autor,
author = "{Gu}, J. and {Bradbury}, J. and {Xiong}, C. and {Li},
V.~O.~K. and {Socher}, R.",
title = "{Non-Autoregressive Neural Machine Translation}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171102281G",
archivePrefix= "arXiv",
eprint = "1711.02281",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = nov,
primaryClass = "cs.CL"
}
@article{kalman-1996-singularly-valuable,
title = {A singularly valuable decomposition: the SVD of a matrix},
author = {Kalman, Dan},
journal = "",
year = 1996
}
@ARTICLE{niu-2011-hogwil,
author = "{Niu}, F. and {Recht}, B. and {Re}, C. and {Wright}, S.~J.",
title = "{HOGWILD!: A Lock-Free Approach To Parallelizing Stochastic
Gradient Descent}",
journal = "ArXiv e-prints",
year = 2011,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2011arXiv1106.5730N",
archivePrefix= "arXiv",
eprint = "1106.5730",
keywords = "Mathematics - Optimization and Control, Computer Science -
Learning",
month = jun,
primaryClass = "math.OC"
}
@ARTICLE{theis-2015-note-evaluat-gener,
author = "{Theis}, L. and {van den Oord}, A. and {Bethge}, M.",
title = "{A Note on the Evaluation of Generative models}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv151101844T",
archivePrefix= "arXiv",
eprint = "1511.01844",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
month = nov,
primaryClass = "stat.ML"
}
@ARTICLE{sutherland-2016-gener-model,
author = "{Sutherland}, D.~J. and {Tung}, H.-Y. and {Strathmann},
H. and {De}, S. and {Ramdas}, A. and {Smola}, A. and
{Gretton}, A.",
title = "{Generative Models and Model Criticism Via Optimized Maximum
Mean Discrepancy}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv161104488S",
archivePrefix= "arXiv",
eprint = "1611.04488",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Learning, Computer Science -
Neural and Evolutionary Computing, Statistics - Methodology",
month = nov,
primaryClass = "stat.ML"
}
@ARTICLE{yang-2016-multi-task,
author = "{Yang}, Z. and {Salakhutdinov}, R. and {Cohen}, W.",
title = "{Multi-Task Cross-Lingual Sequence Tagging From Scratch}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160306270Y",
archivePrefix= "arXiv",
eprint = "1603.06270",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning",
month = mar,
primaryClass = "cs.CL"
}
@ARTICLE{dhingra-2016-tweet,
author = "{Dhingra}, B. and {Zhou}, Z. and {Fitzpatrick}, D. and
{Muehl}, M. and {Cohen}, W.~W.",
title = "{Tweet2Vec: Character-Based Distributed Representations for
Social Media}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160503481D",
archivePrefix= "arXiv",
eprint = "1605.03481",
keywords = "Computer Science - Learning, Computer Science - Computation
and Language",
month = may,
primaryClass = "cs.LG"
}
@inproceedings{coates-2011-text-detection,
title = "Text detection and character recognition in scene images with
unsupervised feature learning",
author = "Coates, Adam and Carpenter, Blake and Case, Carl and
Satheesh, Sanjeev and Suresh, Bipin and Wang, Tao and Wu,
David J and Ng, Andrew Y",
booktitle = "Document Analysis and Recognition (ICDAR), 2011 International
Conference on",
pages = "440-445",
year = 2011,
organization = "IEEE"
}
@inproceedings{zhang-2013-hmsearch,
title = "Hmsearch: An efficient hamming distance query processing
algorithm",
author = "Zhang, Xiaoyang and Qin, Jianbin and Wang, Wei and Sun,
Yifang and Lu, Jiaheng",
booktitle = "Proceedings of the 25th International Conference on
Scientific and Statistical Database Management",
pages = 19,
year = 2013,
organization = "ACM"
}
@misc{mueen-2017-fastest-similarity,
title = "The Fastest Similarity Search Algorithm for Time Series
Subsequences under Euclidean Distance",
author = "Mueen, Abdullah and Zhu, Yan and Yeh, Michael and Kamgar,
Kaveh and Viswanathan, Krishnamurthy and Gupta, Chetan and
Keogh, Eamonn",
year = 2017,
month = "August",
note =
"\url{http://www.cs.unm.edu/~mueen/FastestSimilaritySearch.html}"
}
@article{hyyro-2001-explaining-extending,
title = {Explaining and extending the bit-parallel approximate string
matching algorithm of Myers},
author = {Hyyr{\"o}, Heikki},
year = 2001,
publisher = {Citeseer},
journal = ""
}
@inproceedings{askitis-2007-hat-trie,
title = "HAT-trie: a cache-conscious trie-based data structure for
strings",
author = "Askitis, Nikolas and Sinha, Ranjan",
booktitle = "Proceedings of the thirtieth Australasian conference on
Computer science-Volume 62",
pages = "97-105",
year = 2007,
organization = "Australian Computer Society, Inc."
}
@techreport{bagwell-2001-ideal-hash-trees,
title = {Ideal hash trees},
author = {Bagwell, Phil},
year = 2001,
institution = ""
}
@article{van-2014-accelerating-t-sne,
title = "Accelerating t-SNE using tree-based algorithms.",
author = "Van Der Maaten, Laurens",
journal = "Journal of machine learning research",
volume = 15,
number = 1,
pages = "3221-3245",
year = 2014
}
@article{tibshirani-2001-estimating-number,
title = "Estimating the number of clusters in a data set via the gap
statistic",
author = "Tibshirani, Robert and Walther, Guenther and Hastie, Trevor",
journal = "Journal of the Royal Statistical Society: Series B
(Statistical Methodology)",
volume = 63,
number = 2,
pages = "411-423",
year = 2001,
publisher = "Wiley Online Library"
}
@article{schmidhuber-1992-learning-factorial-codes,
title = "Learning factorial codes by predictability minimization",
author = "Schmidhuber, J{\"u}rgen",
journal = "Neural Computation",
volume = 4,
number = 6,
pages = "863-879",
year = 1992,
publisher = "MIT Press"
}
@article{maaten-2008-visualizing-data,
title = "Visualizing data using t-SNE",
author = "Maaten, Laurens van der and Hinton, Geoffrey",
journal = "Journal of machine learning research",
volume = 9,
number = "Nov",
pages = "2579-2605",
year = 2008
}
@ARTICLE{kingma-2013-auto-encoding,
author = "{Kingma}, D.~P and {Welling}, M.",
title = "{Auto-Encoding Variational Bayes}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1312.6114",
primaryClass = "stat.ML",
keywords = "Statistics - Machine Learning, Computer Science - Learning",
year = 2013,
month = dec,
adsurl = "http://adsabs.harvard.edu/abs/2013arXiv1312.6114K",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{rezende-2014-stochastic-backpropagation,
author = "{Jimenez Rezende}, D. and {Mohamed}, S. and {Wierstra}, D.",
title = "{Stochastic Backpropagation and Approximate Inference in Deep
Generative Models}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1401.4082",
primaryClass = "stat.ML",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Learning, Statistics -
Computation, Statistics - Methodology",
year = 2014,
month = jan,
adsurl = "http://adsabs.harvard.edu/abs/2014arXiv1401.4082J",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{roberts-2017-cross-validation,
title = "Cross-validation strategies for data with temporal, spatial,
hierarchical, or phylogenetic structure",
author = "Roberts, David R and Bahn, Volker and Ciuti, Simone and
Boyce, Mark S and Elith, Jane and Guillera-Arroita, Gurutzeta
and Hauenstein, Severin and Lahoz-Monfort, Jos{\'e} J and
Schr{\"o}der, Boris and Thuiller, Wilfried and others",
journal = "Ecography",
volume = 40,
number = 8,
pages = "913-929",
year = 2017,
publisher = "Wiley Online Library"
}
@inproceedings{zhang-2004-optimality-navie,
  title = "The optimality of naive Bayes",
  author = "Zhang, Harry",
  booktitle = "Proceedings of the Seventeenth International Florida
               Artificial Intelligence Research Society Conference
               (FLAIRS)",
  year = 2004
}
@inproceedings{zheng-2013-deep-learning,
title = "Deep learning for Chinese word segmentation and POS tagging",
author = "Zheng, Xiaoqing and Chen, Hanyang and Xu, Tianyu",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in
Natural Language Processing",
pages = "647-657",
year = 2013
}
@article{黄昌宁-2007-中文分词十年回顾,
title = "中文分词十年回顾",
author = "黄昌宁 and 赵海 and others",
journal = "中文信息学报",
volume = 21,
number = 3,
pages = "8-19",
year = 2007
}
@article{张博-2006-对互联网环境下中文分词系统的一种架构改进,
title = "对互联网环境下中文分词系统的一种架构改进",
author = "张博 and 姜建国 and 万平国",
journal = "计算机应用研究",
volume = 11,
pages = "176-178",
year = 2006
}
@phdthesis{孙茂松-2001-汉语自动分词研究评述,
title = {汉语自动分词研究评述},
author = {孙茂松 and 邹嘉彦},
school = "",
year = 2001
}
@article{赵伟-2004-一种规则与统计相结合的汉语分词方法,
title = "一种规则与统计相结合的汉语分词方法",
author = "赵伟 and 戴新宇 and 尹存燕 and 陈家骏",
journal = "计算机应用研究",
volume = 21,
number = 3,
pages = "23-25",
year = 2004
}
@article{张华平-2004-基于角色标注的中国人名自动识别研究,
title = "基于角色标注的中国人名自动识别研究",
author = "张华平 and 刘群",
journal = "计算机学报",
volume = 27,
number = 1,
year = 2004
}
@article{孙宾-2003-现代汉语文本的词语切分技术,
title = "现代汉语文本的词语切分技术",
author = "孙宾",
journal = "技术报告, 北京大学计算语言学研究所",
year = 2003
}
@article{foo-2004-chinese-word,
title = "Chinese word segmentation and its effect on information
retrieval",
author = "Foo, Schubert and Li, Hui",
journal = "Information processing \& management",
volume = 40,
number = 1,
pages = "161-190",
year = 2004,
publisher = "Elsevier"
}
@inproceedings{peng-2004-chinese-segmentation,
title = "Chinese segmentation and new word detection using conditional
random fields",
author = "Peng, Fuchun and Feng, Fangfang and McCallum, Andrew",
booktitle = "Proceedings of the 20th international conference on
Computational Linguistics",
pages = 562,
year = 2004,
organization = "Association for Computational Linguistics"
}
@article{huang-2003-applying-machine,
title = "Applying machine learning to text segmentation for
information retrieval",
author = "Huang, Xiangji and Peng, Fuchun and Schuurmans, Dale and
Cercone, Nick and Robertson, Stephen E",
journal = "Information Retrieval",
volume = 6,
number = "3-4",
pages = "333-362",
year = 2003,
publisher = "Springer"
}
@inproceedings{jiang-2009-automatic-adaptation,
title = "Automatic adaptation of annotation standards: Chinese word
segmentation and POS tagging: a case study",
author = "Jiang, Wenbin and Huang, Liang and Liu, Qun",
booktitle = "Proceedings of the Joint Conference of the 47th Annual
Meeting of the ACL and the 4th International Joint Conference
on Natural Language Processing of the AFNLP: Volume 1-Volume
1",
pages = "522-530",
year = 2009,
organization = "Association for Computational Linguistics"
}
@inproceedings{sun-1998-chinese-word,
title = "Chinese word segmentation without using lexicon and
hand-crafted training data",
author = "Maosong, Sun and Dayang, Shen and Tsou, Benjamin K",
booktitle = "Proceedings of the 36th Annual Meeting of the Association for
Computational Linguistics and 17th International Conference
on Computational Linguistics-Volume 2",
pages = "1265-1271",
year = 1998,
organization = "Association for Computational Linguistics"
}
@article{俞士汶-2002-北京大学现代汉语语料库基本加工规范,
title = "北京大学现代汉语语料库基本加工规范 (续)",
author = "俞士汶 and 段慧明 and 朱学锋 and 孙斌",
journal = "中文信息学报",
volume = 16,
number = 6,
pages = "59-65",
year = 2002
}
@article{宋柔-1997-关于分词规范的探讨,
title = "关于分词规范的探讨",
author = "宋柔",
journal = "语言文字应用",
number = 3,
pages = "113-114",
year = 1997
}
@article{孙茂松-2001-信息处理用词汇研究,
title = "信息处理用现代汉语分词词表",
author = "孙茂松 and 王洪君 and 李行健 and 富丽 and 黄昌宁 and 陈松岑
and谢自立 and 张卫国",
journal = "语言文字应用",
number = 4,
pages = "84-89",
year = 2001
}
@article{李玉梅-2007-分词规范亟需补充的三方面内容,
title = "分词规范亟需补充的三方面内容",
author = "李玉梅 and 陈晓 and 姜自霞 and 易江燕 and 靳光瑾 and 黄昌宁",
journal = "中文信息学报",
volume = 21,
number = 5,
pages = "1-7",
year = 2007
}
@article{刘荣-2011-利用统计量和语言学规则提取多字词表达,
title = "利用统计量和语言学规则提取多字词表达",
author = "刘荣 and 王奕凯",
journal = "太原理工大學學報",
volume = 42,
number = 2,
pages = "133-137",
year = 2011,
publisher = "太原理工大學學報編輯部"
}
@inproceedings{zhao-2017-ngram2vec,
title = "Ngram2vec: Learning Improved Word Representations from Ngram
Co-occurrence Statistics",
author = "Zhao, Zhe and Liu, Tao and Li, Shen and Li, Bofang and Du,
Xiaoyong",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
pages = "244-253",
year = 2017
}
@ARTICLE{pawar-2017-relation-extraction,
author = "{Pawar}, S. and {Palshikar}, G.~K. and {Bhattacharyya}, P.",
title = "{Relation Extraction : A Survey}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1712.05191",
primaryClass = "cs.CL",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Information Retrieval",
year = 2017,
month = dec,
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171205191P",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{mintz-2009-distant-supervision,
title = "Distant supervision for relation extraction without labeled
data",
author = "Mintz, Mike and Bills, Steven and Snow, Rion and Jurafsky,
Dan",
booktitle = "Proceedings of the Joint Conference of the 47th Annual
Meeting of the ACL and the 4th International Joint Conference
on Natural Language Processing of the AFNLP: Volume 2-Volume
2",
pages = "1003-1011",
year = 2009,
organization = "Association for Computational Linguistics"
}
@article{王丽杰-2009-基于SVMTool的中文词性标注,
author = "王丽杰 and 车万翔 and 刘挺",
title = "基于SVMTool的中文词性标注",
publisher = "中文信息学报",
year = 2009,
journal = "中文信息学报",
volume = 23,
number = 4,
eid = 16,
numpages = 6,
pages = 16,
keywords = "计算机应用;中文信息处理;词性标注;SVMTool;未登录词;偏旁部首",
url = "http://jcip.cipsc.org.cn/CN/abstract/article_1212.shtml"
}
@ARTICLE{sutton-2010-intro-cond,
author = "{Sutton}, C. and {McCallum}, A.",
title = "{An Introduction to Conditional Random Fields}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1011.4088",
primaryClass = "stat.ML",
keywords = "Statistics - Machine Learning",
year = 2010,
month = nov,
adsurl = "http://adsabs.harvard.edu/abs/2010arXiv1011.4088S",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{lafferty-2001-cond-rand,
author = "Lafferty, John D. and McCallum, Andrew and Pereira, Fernando
C. N.",
title = "Conditional Random Fields: Probabilistic Models for
Segmenting and Labeling Sequence Data",
booktitle = "Proceedings of the Eighteenth International Conference on
Machine Learning",
series = "ICML '01",
year = 2001,
isbn = "1-55860-778-1",
pages = "282-289",
numpages = 8,
url = "http://dl.acm.org/citation.cfm?id=645530.655813",
acmid = 655813,
publisher = "Morgan Kaufmann Publishers Inc.",
address = "San Francisco, CA, USA"
}
@inproceedings{sha-2003-shallow-parsing,
author = "Sha, Fei and Pereira, Fernando",
title = "Shallow Parsing with Conditional Random Fields",
booktitle = "Proceedings of the 2003 Conference of the North American
Chapter of the Association for Computational Linguistics on
Human Language Technology - Volume 1",
series = "NAACL '03",
year = 2003,
location = "Edmonton, Canada",
pages = "134-141",
numpages = 8,
url = "https://doi.org/10.3115/1073445.1073473",
doi = "10.3115/1073445.1073473",
acmid = 1073473,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@article{刘炜-2017-一种面向突发事件的文本语料自动标注方法,
author = "刘炜 and 王旭 and 张雨嘉 and 刘宗田",
title = "一种面向突发事件的文本语料自动标注方法",
publisher = "中文信息学报",
year = 2017,
journal = "中文信息学报",
volume = 31,
number = 2,
eid = 76,
numpages = 9,
pages = 76,
keywords = "突发事件;语料库;自动标注",
url = "http://jcip.cipsc.org.cn/CN/abstract/article_2360.shtml"
}
@ARTICLE{huang-2015-bidirect-lstm-crf,
author = "{Huang}, Z. and {Xu}, W. and {Yu}, K.",
title = "{Bidirectional LSTM-CRF Models for Sequence Tagging}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1508.01991",
primaryClass = "cs.CL",
keywords = "Computer Science - Computation and Language",
year = 2015,
month = aug,
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150801991H",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{askitis-2005-cache-conscious,
title = "Cache-conscious collision resolution in string hash tables",
author = "Askitis, Nikolas and Zobel, Justin",
booktitle = "International Symposium on String Processing and Information
Retrieval",
pages = "91-102",
year = 2005,
organization = "Springer"
}
@article{王厚峰-2002-指代消解的基本方法和实现技术,
title = "指代消解的基本方法和实现技术",
author = "王厚峰",
journal = "中文信息学报",
volume = 16,
number = 6,
pages = "10-18",
year = 2002
}
@inproceedings{ahn-2006-stage-event,
author = "Ahn, David",
title = "The Stages of Event Extraction",
booktitle = "Proceedings of the Workshop on Annotating and Reasoning About
Time and Events",
series = "ARTE '06",
year = 2006,
isbn = "1-932432-81-7",
location = "Sydney, Australia",
pages = "1-8",
numpages = 8,
url = "http://dl.acm.org/citation.cfm?id=1629235.1629236",
acmid = 1629236,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@article{赵妍妍-2008-中文事件抽取技术研究,
title = "中文事件抽取技术研究",
author = "赵妍妍 and 秦兵 and 车万翔 and 刘挺",
journal = "中文信息学报",
volume = 22,
number = 1,
pages = "3-8",
year = 2008
}
@article{李颖-2017-中文开放式多元实体关系抽取,
title = "中文开放式多元实体关系抽取",
author = "李颖 and 郝晓燕 and 王勇",
journal = "计算机科学",
number = "S1",
pages = "80-83",
year = 2017
}
@inproceedings{takamatsu-2012-reducing-wrong,
author = "Takamatsu, Shingo and Sato, Issei and Nakagawa, Hiroshi",
title = "Reducing Wrong Labels in Distant Supervision for Relation
Extraction",
booktitle = "Proceedings of the 50th Annual Meeting of the Association for
Computational Linguistics: Long Papers - Volume 1",
series = "ACL '12",
year = 2012,
location = "Jeju Island, Korea",
pages = "721-729",
numpages = 9,
url = "http://dl.acm.org/citation.cfm?id=2390524.2390626",
acmid = 2390626,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@inproceedings{yao-2010-collective-cross,
title = "Collective cross-document relation extraction without
labelled data",
author = "Yao, Limin and Riedel, Sebastian and McCallum, Andrew",
booktitle = "Proceedings of the 2010 Conference on Empirical Methods in
Natural Language Processing",
pages = "1013-1023",
year = 2010,
organization = "Association for Computational Linguistics"
}
@inproceedings{berant-2013-semantic-parsing,
title = "Semantic parsing on freebase from question-answer pairs",
author = "Berant, Jonathan and Chou, Andrew and Frostig, Roy and Liang,
Percy",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in
Natural Language Processing",
pages = "1533-1544",
year = 2013
}
@inproceedings{hoffmann-2011-knowledge-based,
title = "Knowledge-based weak supervision for information extraction
of overlapping relations",
author = "Hoffmann, Raphael and Zhang, Congle and Ling, Xiao and
Zettlemoyer, Luke and Weld, Daniel S",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies-Volume
1",
pages = "541-550",
year = 2011,
organization = "Association for Computational Linguistics"
}
@inproceedings{surdeanu-2012-multi-instance,
title = "Multi-instance multi-label learning for relation extraction",
author = "Surdeanu, Mihai and Tibshirani, Julie and Nallapati, Ramesh
and Manning, Christopher D",
booktitle = "Proceedings of the 2012 joint conference on empirical methods
in natural language processing and computational natural
language learning",
pages = "455-465",
year = 2012,
organization = "Association for Computational Linguistics"
}
@InProceedings{lin-2013-conv-neural,
author = "Liu, ChunYang and Sun, WenBo and Chao, WenHan and Che,
WanXiang",
editor = "Motoda, Hiroshi and Wu, Zhaohui and Cao, Longbing and Zaiane,
Osmar and Yao, Min and Wang, Wei",
title = "Convolution Neural Network for Relation Extraction",
booktitle = "Advanced Data Mining and Applications",
year = 2013,
publisher = "Springer Berlin Heidelberg",
address = "Berlin, Heidelberg",
pages = "231-242",
abstract = "Deep Neural Network has been applied to many Natural Language
Processing tasks. Instead of building hand-craft features,
DNN builds features by automatic learning, fitting different
domains well. In this paper, we propose a novel convolution
network, incorporating lexical features, applied to Relation
Extraction. Since many current deep neural networks use word
embedding by word table, which, however, neglects semantic
meaning among words, we import a new coding method, which
coding input words by synonym dictionary to integrate
semantic knowledge into the neural network. We compared our
Convolution Neural Network (CNN) on relation extraction with
the state-of-art tree kernel approach, including Typed
Dependency Path Kernel and Shortest Dependency Path Kernel
and Context-Sensitive tree kernel, resulting in a 9{\%}
improvement competitive performance on ACE2005 data
set. Also, we compared the synonym coding with the one-hot
coding, and our approach got 1.6{\%} improvement. Moreover,
we also tried other coding method, such as hypernym coding,
and give some discussion according the result.",
isbn = "978-3-642-53917-6"
}
@inproceedings{zeng-2014-relation-classification,
title = "Relation classification via convolutional deep neural
network",
author = "Zeng, Daojian and Liu, Kang and Lai, Siwei and Zhou, Guangyou
and Zhao, Jun",
booktitle = "Proceedings of COLING 2014, the 25th International Conference
on Computational Linguistics: Technical Papers",
pages = "2335-2344",
year = 2014
}
@inproceedings{nguyen-2015-relation-extraction,
title = "Relation extraction: Perspective from convolutional neural
networks",
author = "Nguyen, Thien Huu and Grishman, Ralph",
booktitle = "Proceedings of the 1st Workshop on Vector Space Modeling for
Natural Language Processing",
pages = "39-48",
year = 2015
}
@ARTICLE{nogueira-2015-class-relat,
author = "{Nogueira dos Santos}, C. and {Xiang}, B. and {Zhou}, B.",
title = "{Classifying Relations By Ranking With Convolutional Neural
Networks}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150406580N",
archivePrefix= "arXiv",
eprint = "1504.06580",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing",
month = apr,
primaryClass = "cs.CL"
}
@ARTICLE{zhang-2015-relat-class,
author = "{Zhang}, D. and {Wang}, D.",
title = "{Relation Classification Via Recurrent Neural Network}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150801006Z",
archivePrefix= "arXiv",
eprint = "1508.01006",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing",
month = aug,
primaryClass = "cs.CL"
}
@inproceedings{zhou-2016-attention-based,
title = "Attention-based bidirectional long short-term memory networks
for relation classification",
author = "Zhou, Peng and Shi, Wei and Tian, Jun and Qi, Zhenyu and Li,
Bingchen and Hao, Hongwei and Xu, Bo",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 2: Short Papers)",
volume = 2,
pages = "207-212",
year = 2016
}
@inproceedings{wang-2016-relation-classification,
title = "Relation classification via multi-level attention cnns",
author = "Wang, Linlin and Cao, Zhu and de Melo, Gerard and Liu,
Zhiyuan",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "1298-1307",
year = 2016
}
@inproceedings{zeng-2015-distant-supervision,
title = "Distant supervision for relation extraction via piecewise
convolutional neural networks",
author = "Zeng, Daojian and Liu, Kang and Chen, Yubo and Zhao, Jun",
booktitle = "Proceedings of the 2015 Conference on Empirical Methods in
Natural Language Processing",
pages = "1753-1762",
year = 2015
}
@inproceedings{jiang-2016-relation-extraction,
title = "Relation extraction with multi-instance multi-label
convolutional neural networks",
author = "Jiang, Xiaotian and Wang, Quan and Li, Peng and Wang, Bin",
booktitle = "Proceedings of COLING 2016, the 26th International Conference
on Computational Linguistics: Technical Papers",
pages = "1471-1480",
year = 2016
}
@inproceedings{ji-2017-distan-super,
title = "Distant Supervision for Relation Extraction with
Sentence-Level Attention and Entity Descriptions",
author = "Guoliang Ji and Kang Liu and Shizhu He and Jun Zhao",
booktitle = "AAAI",
year = 2017
}
@article{漆桂林-2017-知识图谱研究进展,
title = "知识图谱研究进展",
author = "漆桂林 and 高桓 and 吴天星",
journal = "情报工程",
volume = 3,
number = 1,
pages = "4-25",
year = 2017
}
@inproceedings{brin-1998-extrac,
title = "Extracting patterns and relations from the world wide web",
author = "Brin, Sergey",
booktitle = "International Workshop on The World Wide Web and Databases",
pages = "172-183",
year = 1998,
organization = "Springer"
}
@inproceedings{agichtein-2000-snowball,
title = "Snowball: Extracting relations from large plain-text
collections",
author = "Agichtein, Eugene and Gravano, Luis",
booktitle = "Proceedings of the fifth ACM conference on Digital libraries",
pages = "85-94",
year = 2000,
organization = "ACM"
}
@inproceedings{yates-2007-textrunner,
title = "Textrunner: open information extraction on the web",
author = "Yates, Alexander and Cafarella, Michael and Banko, Michele
and Etzioni, Oren and Broadhead, Matthew and Soderland,
Stephen",
booktitle = "Proceedings of Human Language Technologies: The Annual
Conference of the North American Chapter of the Association
for Computational Linguistics: Demonstrations",
pages = "25-26",
year = 2007,
organization = "Association for Computational Linguistics"
}
@inproceedings{bollegala-2009-measur-simil,
author = "Bollegala, Danushka T. and Matsuo, Yutaka and Ishizuka,
Mitsuru",
title = "Measuring the Similarity Between Implicit Semantic Relations
from the Web",
booktitle = "Proceedings of the 18th International Conference on World
Wide Web",
series = "WWW '09",
year = 2009,
isbn = "978-1-60558-487-4",
location = "Madrid, Spain",
pages = "651-660",
numpages = 10,
url = "http://doi.acm.org/10.1145/1526709.1526797",
doi = "10.1145/1526709.1526797",
acmid = 1526797,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "natural language processing, relational similarity, web
mining"
}
@inproceedings{bollegala-2010-relat-dualit,
author = "Bollegala, Danushka Tarupathi and Matsuo, Yutaka and
Ishizuka, Mitsuru",
title = "Relational Duality: Unsupervised Extraction of Semantic
Relations Between Entities on the Web",
booktitle = "Proceedings of the 19th International Conference on World
Wide Web",
series = "WWW '10",
year = 2010,
isbn = "978-1-60558-799-8",
location = "Raleigh, North Carolina, USA",
pages = "151-160",
numpages = 10,
url = "http://doi.acm.org/10.1145/1772690.1772707",
doi = "10.1145/1772690.1772707",
acmid = 1772707,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "relation extraction, relational duality, relational
similarity, web mining"
}
@inproceedings{batista-2015-semi-supervised,
title = "Semi-supervised bootstrapping of relationship extractors with
distributional semantics",
author = "Batista, David S and Martins, Bruno and Silva, M{\'a}rio J",
booktitle = "Proceedings of the 2015 Conference on Empirical Methods in
Natural Language Processing",
pages = "499-504",
year = 2015
}
@inproceedings{zhu-2009-statsnowball,
author = "Zhu, Jun and Nie, Zaiqing and Liu, Xiaojiang and Zhang, Bo
and Wen, Ji-Rong",
title = "StatSnowball: A Statistical Approach to Extracting Entity
Relationships",
booktitle = "Proceedings of the 18th International Conference on World
Wide Web",
series = "WWW '09",
year = 2009,
isbn = "978-1-60558-487-4",
location = "Madrid, Spain",
pages = "101-110",
numpages = 10,
url = "http://doi.acm.org/10.1145/1526709.1526724",
doi = "10.1145/1526709.1526724",
acmid = 1526724,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "Markov logic networks, relationship extraction, statistical
models"
}
@article{车万翔-2005-实体关系自动抽取,
title = "实体关系自动抽取",
author = "车万翔 and 刘挺 and 李生",
journal = "中文信息学报",
volume = 19,
number = 2,
pages = "2-7",
year = 2005
}
@article{田久乐-2010-基于同义词词林的词语相似度计算方法,
title = "基于同义词词林的词语相似度计算方法",
author = "田久乐 and 赵蔚",
journal = "吉林大学学报: 信息科学版",
number = 6,
pages = "602-608",
year = 2010
}
@ARTICLE{ratner-2017-snorkel,
author = "{Ratner}, A. and {Bach}, S.~H. and {Ehrenberg}, H. and
{Fries}, J. and {Wu}, S. and {R{\'e}}, C.",
title = "{Snorkel: Rapid Training Data Creation with Weak
Supervision}",
journal = "ArXiv e-prints",
archivePrefix= "arXiv",
eprint = "1711.10160",
primaryClass = "cs.LG",
keywords = "Computer Science - Learning, Statistics - Machine Learning",
year = 2017,
month = nov,
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv171110160R",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{bach-2017-learn-struc,
author = "{Bach}, S.~H. and {He}, B. and {Ratner}, A. and {R{\'e}}, C.",
title = "{Learning the Structure of Generative Models Without Labeled
Data}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170300854B",
archivePrefix= "arXiv",
eprint = "1703.00854",
keywords = "Computer Science - Learning, Statistics - Machine Learning",
month = mar,
primaryClass = "cs.LG"
}
@ARTICLE{ratner-2016-data-progr,
author = "{Ratner}, A. and {De Sa}, C. and {Wu}, S. and {Selsam},
D. and {R{\'e}}, C.",
title = "{Data Programming: Creating Large Training Sets, Quickly}",
journal = "ArXiv e-prints",
year = 2016,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160507723R",
archivePrefix= "arXiv",
eprint = "1605.07723",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Learning",
month = may,
primaryClass = "stat.ML"
}
@article{刘宗田-2009-面向事件的本体研究,
title = "面向事件的本体研究",
author = "刘宗田 and 黄美丽 and 周文 and 仲兆满 and 付剑锋 and 单建芳
and 智慧来",
journal = "计算机科学",
volume = 36,
number = 11,
pages = "189-192",
year = 2009
}
@article{pei-2004-mining,
title = "Mining sequential patterns by pattern-growth: The prefixspan
approach",
author = "Pei, Jian and Han, Jiawei and Mortazavi-Asl, Behzad and Wang,
Jianyong and Pinto, Helen and Chen, Qiming and Dayal,
Umeshwar and Hsu, Mei-Chun",
journal = "IEEE Transactions on knowledge and data engineering",
volume = 16,
number = 11,
pages = "1424-1440",
year = 2004,
publisher = "IEEE"
}
@article{李明耀-2016-基于依存分析的开放式中文实体关系抽取方法,
title = "基于依存分析的开放式中文实体关系抽取方法",
author = "李明耀 and 杨静",
journal = "计算机工程",
volume = 42,
number = 6,
pages = "201-207",
year = 2016
}
@inproceedings{ratinov-2009-design-challenges,
title = "Design challenges and misconceptions in named entity
recognition",
author = "Ratinov, Lev and Roth, Dan",
booktitle = "Proceedings of the Thirteenth Conference on Computational
Natural Language Learning",
pages = "147-155",
year = 2009,
organization = "Association for Computational Linguistics"
}
@article{dai-2015-enhan,
  title = "Enhancing of chemical compound and drug name recognition
           using representative tag scheme and fine-grained
           tokenization",
  author = "Hong-Jie Dai and Po-Ting Lai and Yung-Chun Chang and Richard
            Tzong-Han Tsai",
  journal = "Journal of Cheminformatics",
  year = 2015
}
@ARTICLE{dyer-2015-stack-lstm,
author = "{Dyer}, C. and {Ballesteros}, M. and {Ling}, W. and
{Matthews}, A. and {Smith}, N.~A.",
title = "{Transition-Based Dependency Parsing With Stack Long
Short-Term Memory}",
journal = "ArXiv e-prints",
year = 2015,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2015arXiv150508075D",
archivePrefix= "arXiv",
eprint = "1505.08075",
keywords = "Computer Science - Computation and Language, Computer Science
- Learning, Computer Science - Neural and Evolutionary Computing",
month = may,
primaryClass = "cs.CL"
}
@inproceedings{ling-2015-not-all,
title = "Not all contexts are created equal: Better word
representations with variable attention",
author = "Ling, Wang and Tsvetkov, Yulia and Amir, Silvio and
Fermandez, Ramon and Dyer, Chris and Black, Alan W and
Trancoso, Isabel and Lin, Chu-Cheng",
booktitle = "Proceedings of the 2015 Conference on Empirical Methods in
Natural Language Processing",
pages = "1367-1372",
year = 2015
}
@article{koppel-2006-importance-neutral,
title = "The importance of neutral examples for learning sentiment",
author = "Koppel, Moshe and Schler, Jonathan",
journal = "Computational Intelligence",
volume = 22,
number = 2,
pages = "100-109",
year = 2006,
publisher = "Wiley Online Library"
}
@article{berger-1996-maximum-entropy,
title = "A maximum entropy approach to natural language processing",
author = "Berger, Adam L and Pietra, Vincent J Della and Pietra,
Stephen A Della",
journal = "Computational linguistics",
volume = 22,
number = 1,
pages = "39-71",
year = 1996,
publisher = "MIT Press"
}
@ARTICLE{Prescher-2004-tutoral,
author = "{Prescher}, D.",
title = "{A Tutorial on the Expectation-Maximization Algorithm
Including Maximum-Likelihood Estimation and EM Training of
Probabilistic Context-Free Grammars}",
journal = "eprint arXiv:cs/0412015",
eprint = "cs/0412015",
keywords = "Computer Science - Computation and Language",
year = 2004,
month = dec,
adsurl = "http://adsabs.harvard.edu/abs/2004cs.......12015P",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{berger-1997-improved-iterative,
title = "The improved iterative scaling algorithm: A gentle
introduction",
author = "Berger, Adam",
journal = "Unpublished manuscript",
year = 1997
}
@inproceedings{curran-2003-inves-gis,
author = "Curran, James R. and Clark, Stephen",
title = "Investigating GIS and Smoothing for Maximum Entropy Taggers",
booktitle = "Proceedings of the Tenth Conference on European Chapter of
the Association for Computational Linguistics - Volume 1",
series = "EACL '03",
year = 2003,
isbn = "1-333-56789-0",
location = "Budapest, Hungary",
pages = "91-98",
numpages = 8,
url = "https://doi.org/10.3115/1067807.1067821",
doi = "10.3115/1067807.1067821",
acmid = 1067821,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@article{张华平-2002-基于N-最短路径方法的中文词语粗分模型,
title = "基于N-最短路径方法的中文词语粗分模型",
author = "张华平 and 刘群",
journal = "中文信息学报",
volume = 16,
number = 5,
pages = "3-9",
year = 2002
}
@article{秦兵-2015-无指导的中文开放式实体关系抽取,
title = "无指导的中文开放式实体关系抽取",
author = "秦兵 and 刘安安 and 刘挺 and others",
journal = "计算机研究与发展",
volume = 52,
number = 5,
year = 2015,
pages = "1029-1035"
}
@inproceedings{li-2013-joint-event,
title = "Joint event extraction via structured prediction with global
features",
author = "Li, Qi and Ji, Heng and Huang, Liang",
booktitle = "Proceedings of the 51st Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "73-82",
year = 2013
}
@article{chen-2012-joint-modeling,
title = "Joint modeling for chinese event extraction with rich
linguistic features",
author = "Chen, Chen and Ng, Vincent",
journal = "Proceedings of COLING 2012",
pages = "529-544",
year = 2012
}
@inproceedings{singh-2013-joint-infer,
author = "Singh, Sameer and Riedel, Sebastian and Martin, Brian and
Zheng, Jiaping and McCallum, Andrew",
title = "Joint Inference of Entities, Relations, and Coreference",
booktitle = "Proceedings of the 2013 Workshop on Automated Knowledge Base
Construction",
series = "AKBC '13",
year = 2013,
isbn = "978-1-4503-2411-3",
location = "San Francisco, California, USA",
pages = "1-6",
numpages = 6,
url = "http://doi.acm.org/10.1145/2509558.2509559",
doi = "10.1145/2509558.2509559",
acmid = 2509559,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "coreference resolution, information extraction, joint
inference, named entity recognition, relation extraction"
}
@inproceedings{riedel-2011-fast-robust,
author = "Riedel, Sebastian and McCallum, Andrew",
title = "Fast and Robust Joint Models for Biomedical Event Extraction",
booktitle = "Proceedings of the Conference on Empirical Methods in Natural
Language Processing",
series = "EMNLP '11",
year = 2011,
isbn = "978-1-937284-11-4",
location = "Edinburgh, United Kingdom",
pages = "1-12",
numpages = 12,
url = "http://dl.acm.org/citation.cfm?id=2145432.2145434",
acmid = 2145434,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@article{何馨宇-2017-基于双向LSTM和两阶段方法的触发词识别,
author = "何馨宇 and 李丽双",
title = "基于双向LSTM和两阶段方法的触发词识别",
publisher = "中文信息学报",
year = 2017,
journal = "中文信息学报",
volume = 31,
number = 6,
eid = 147,
numpages = 7,
pages = 147,
keywords = "触发词识别;两阶段方法;双向LSTM;依存词向量",
url = "http://jcip.cipsc.org.cn/CN/abstract/article_2482.shtml"
}
@ARTICLE{cai-2017-fast-accur,
author = "{Cai}, D. and {Zhao}, H. and {Zhang}, Z. and {Xin}, Y. and
{Wu}, Y. and {Huang}, F.",
title = "{Fast and Accurate Neural Word Segmentation for Chinese}",
journal = "ArXiv e-prints",
year = 2017,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2017arXiv170407047C",
archivePrefix= "arXiv",
eprint = "1704.07047",
keywords = "Computer Science - Computation and Language",
month = apr,
primaryClass = "cs.CL"
}
@article{陈自岩-2016-一种非监督的事件触发词检测和分类方法,
title = "一种非监督的事件触发词检测和分类方法",
author = "陈自岩 and 黄宇 and 王洋 and 傅兴玉 and 付琨",
journal = "国外电子测量技术",
number = 7,
pages = "91-95",
year = 2016
}
@inproceedings{li-2014-increm,
title = "Incremental joint extraction of entity mentions and
relations",
author = "Li, Qi and Ji, Heng",
booktitle = "Proceedings of the 52nd Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "402-412",
year = 2014
}
@inproceedings{liu-2016-lever,
title = "Leveraging framenet to improve automatic event detection",
author = "Liu, Shulin and Chen, Yubo and He, Shizhu and Liu, Kang and
Zhao, Jun",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "2134-2143",
year = 2016
}
@article{ji-2008-refin,
title = "Refining event extraction through cross-document inference",
author = "Ji, Heng and Grishman, Ralph",
journal = "Proceedings of ACL-08: HLT",
pages = "254-262",
year = 2008
}
@inproceedings{chen-2009-language-specific,
title = "Language specific issue and feature exploration in Chinese
event extraction",
author = "Chen, Zheng and Ji, Heng",
booktitle = "Proceedings of Human Language Technologies: The 2009 Annual
Conference of the North American Chapter of the Association
for Computational Linguistics, Companion Volume: Short
Papers",
pages = "209-212",
year = 2009,
organization = "Association for Computational Linguistics"
}
@inproceedings{li-2012-employ-compositional,
title = "Employing compositional semantics and discourse consistency
in Chinese event extraction",
author = "Li, Peifeng and Zhou, Guodong and Zhu, Qiaoming and Hou,
Libin",
booktitle = "Proceedings of the 2012 Joint Conference on Empirical Methods
in Natural Language Processing and Computational Natural
Language Learning",
pages = "1006-1016",
year = 2012,
organization = "Association for Computational Linguistics"
}
@inproceedings{liao-2010-using,
title = "Using document level cross-event inference to improve event
extraction",
author = "Liao, Shasha and Grishman, Ralph",
booktitle = "Proceedings of the 48th Annual Meeting of the Association for
Computational Linguistics",
pages = "789-797",
year = 2010,
organization = "Association for Computational Linguistics"
}
@inproceedings{hong-2011-using,
title = "Using cross-entity inference to improve event extraction",
author = "Hong, Yu and Zhang, Jianfeng and Ma, Bin and Yao, Jianmin and
Zhou, Guodong and Zhu, Qiaoming",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies-Volume
1",
pages = "1127-1136",
year = 2011,
organization = "Association for Computational Linguistics"
}
@inproceedings{liu-2016-probab-soft,
author = "Liu, Shulin and Liu, Kang and He, Shizhu and Zhao, Jun",
title = "A Probabilistic Soft Logic Based Approach to Exploiting
Latent and Global Information in Event Classification",
booktitle = "Proceedings of the Thirtieth AAAI Conference on Artificial
Intelligence",
series = "AAAI'16",
year = 2016,
location = "Phoenix, Arizona",
pages = "2993-2999",
numpages = 7,
url = "http://dl.acm.org/citation.cfm?id=3016100.3016321",
acmid = 3016321,
publisher = "AAAI Press"
}
@article{kim-2000-subject-object,
title = "Subject/object drop in the acquisition of Korean: A
cross-linguistic comparison",
author = "Kim, Young-Joo",
journal = "Journal of East Asian Linguistics",
volume = 9,
number = 4,
pages = "325-351",
year = 2000,
publisher = "Springer"
}
@inproceedings{tan-2008-ident-chines,
author = "Tan, Hongye and Zhao, Tiejun and Zheng, Jiaheng",
title = "Identification of Chinese Event and Their Argument Roles",
booktitle = "Proceedings of the 2008 IEEE 8th International Conference on
Computer and Information Technology Workshops",
series = "CITWORKSHOPS '08",
year = 2008,
isbn = "978-0-7695-3242-4",
pages = "14-19",
numpages = 6,
url = "http://dx.doi.org/10.1109/CIT.2008.Workshops.54",
doi = "10.1109/CIT.2008.Workshops.54",
acmid = 1381056,
publisher = "IEEE Computer Society",
address = "Washington, DC, USA"
}
@inproceedings{fader-2013-parap,
title = "Paraphrase-driven learning for open question answering",
author = "Fader, Anthony and Zettlemoyer, Luke and Etzioni, Oren",
booktitle = "Proceedings of the 51st Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "1608-1618",
year = 2013
}
@article{陈箫箫-2016-微博中的开放域事件抽取,
title = "微博中的开放域事件抽取",
author = "陈箫箫 and 刘波",
journal = "计算机应用与软件",
volume = 33,
number = 8,
pages = "18-22",
year = 2016
}
@article{李江龙-2017-金融领域的事件句抽取,
title = "金融领域的事件句抽取",
author = "李江龙 and 吕学强 and 周建设 and 刘秀磊",
journal = "计算机应用研究",
volume = 34,
number = 10,
pages = "2915-2918",
year = 2017
}
@article{马晨曦-2018-基于递归神经网络的中文事件检测,
title = "基于递归神经网络的中文事件检测",
author = "马晨曦 and 陈兴蜀 and 王文贤 and 王海舟",
journal = "信息网络安全",
number = 5,
pages = "75-81",
year = 2018
}
@inproceedings{mcclosky-2011-event-extract,
author = "McClosky, David and Surdeanu, Mihai and Manning, Christopher
D.",
title = "Event Extraction As Dependency Parsing",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies -
Volume 1",
series = "HLT '11",
year = 2011,
isbn = "978-1-932432-87-9",
location = "Portland, Oregon",
pages = "1626-1635",
numpages = 10,
url = "http://dl.acm.org/citation.cfm?id=2002472.2002667",
acmid = 2002667,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@inproceedings{reschke-2014-event-extrac,
  title = "Event Extraction Using Distant Supervision",
  author = "Reschke, Kevin and Jankowiak, Martin and Surdeanu, Mihai and
            Manning, Christopher D and Jurafsky, Daniel",
  booktitle = "Proceedings of the Ninth International Conference on
               Language Resources and Evaluation (LREC 2014)",
  year = 2014
}
@inproceedings{riedel-2013-relation-extraction,
title = "Relation extraction with matrix factorization and universal
schemas",
author = "Riedel, Sebastian and Yao, Limin and McCallum, Andrew and
Marlin, Benjamin M",
booktitle = "Proceedings of the 2013 Conference of the North American
Chapter of the Association for Computational Linguistics:
Human Language Technologies",
pages = "74-84",
year = 2013
}
@inproceedings{toutanova-2015-representing-text,
title = "Representing text for joint embedding of text and knowledge
bases",
author = "Toutanova, Kristina and Chen, Danqi and Pantel, Patrick and
Poon, Hoifung and Choudhury, Pallavi and Gamon, Michael",
booktitle = "Proceedings of the 2015 Conference on Empirical Methods in
Natural Language Processing",
pages = "1499-1509",
year = 2015
}
@inproceedings{tang-2005-email,
title = "Email data cleaning",
author = "Tang, Jie and Li, Hang and Cao, Yunbo and Tang, Zhaohui",
booktitle = "Proceedings of the eleventh ACM SIGKDD international
conference on Knowledge discovery in data mining",
pages = "489-498",
year = 2005,
organization = "ACM"
}
@inproceedings{smith-2007-tesser-ocr,
title = "An overview of the Tesseract OCR engine",
author = "Smith, Ray",
booktitle = "Document Analysis and Recognition, 2007. ICDAR 2007. Ninth
International Conference on",
volume = 2,
pages = "629-633",
year = 2007,
organization = "IEEE"
}
@inproceedings{smith-2009-hybrid-page,
title = "Hybrid Page Layout Analysis via Tab-Stop Detection",
author = "Ray Smith",
year = 2009,
URL = "http://www.cvc.uab.es/icdar2009/papers/3725a241.pdf",
booktitle = "Proceedings of the 10th international conference on document
analysis and recognition"
}
@inproceedings{epshtein-2010-detect-text,
title = "Detecting text in natural scenes with stroke width transform",
author = "Epshtein, Boris and Ofek, Eyal and Wexler, Yonatan",
booktitle = "Computer Vision and Pattern Recognition (CVPR), 2010 IEEE
Conference on",
pages = "2963-2970",
year = 2010,
organization = "IEEE"
}
@Article{ramakrishnan-2012-layout-pdf,
author = "Ramakrishnan, Cartic and Patnia, Abhishek and Hovy, Eduard
and Burns, Gully APC",
title = "Layout-aware text extraction from full-text PDF of scientific
articles",
journal = "Source Code for Biology and Medicine",
year = 2012,
month = "May",
day = 28,
volume = 7,
number = 1,
pages = 7,
abstract = "The Portable Document Format (PDF) is the most commonly used
file format for online scientific publications. The absence
of effective means to extract text from these PDF files in a
layout-aware manner presents a significant challenge for
developers of biomedical text mining or biocuration
informatics systems that use published literature as an
information source. In this paper we introduce the
`Layout-Aware PDF Text Extraction' (LA-PDFText) system to
facilitate accurate extraction of text from PDF files of
research articles for use in text mining applications.",
issn = "1751-0473",
doi = "10.1186/1751-0473-7-7",
url = "https://doi.org/10.1186/1751-0473-7-7"
}
@ARTICLE{niklaus-2018-survey-open-infor-extrac,
author = "{Niklaus}, C. and {Cetto}, M. and {Freitas}, A. and
{Handschuh}, S.",
title = "{A Survey on Open Information Extraction}",
journal = "ArXiv e-prints",
year = 2018,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2018arXiv180605599N",
archivePrefix= "arXiv",
eprint = "1806.05599",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL"
}
@Article{nesterov-2015-univer,
author = "Nesterov, Yu",
title = "Universal gradient methods for convex optimization problems",
journal = "Mathematical Programming",
year = 2015,
month = "Aug",
day = 01,
volume = 152,
number = 1,
pages = "381-404",
abstract = "In this paper, we present new methods for black-box convex
minimization. They do not need to know in advance the actual
level of smoothness of the objective function. Their only
essential input parameter is the required accuracy of the
solution. At the same time, for each particular problem class
they automatically ensure the best possible rate of
convergence. We confirm our theoretical results by
encouraging numerical experiments, which demonstrate that the
fast rate of convergence, typical for the smooth optimization
problems, sometimes can be achieved even on nonsmooth problem
instances.",
issn = "1436-4646",
doi = "10.1007/s10107-014-0790-0",
url = "https://doi.org/10.1007/s10107-014-0790-0"
}
@inproceedings{nothman-2018-stop-word,
title = "Stop Word Lists in Free Open-source Software Packages",
author = "Nothman, Joel and Qin, Hanmin and Yurchak, Roman",
booktitle = "Proceedings of Workshop for NLP Open Source Software
(NLP-OSS)",
pages = "7-12",
year = 2018
}
@inproceedings{shi-2009-hash,
title = "Hash kernels",
author = "Shi, Qinfeng and Petterson, James and Dror, Gideon and
Langford, John and Smola, Alex and Strehl, Alex and
Vishwanathan, Vishy",
booktitle = "Artificial intelligence and statistics",
pages = "496-503",
year = 2009
}
@article{weinberger-2009-featur-hashin,
author = "Kilian Q. Weinberger and Anirban Dasgupta and Josh Attenberg
and John Langford and Alexander J. Smola",
title = "Feature Hashing for Large Scale Multitask Learning",
journal = "CoRR",
volume = "abs/0902.2206",
year = 2009,
url = "http://arxiv.org/abs/0902.2206",
archivePrefix= "arXiv",
eprint = "0902.2206",
timestamp = "Mon, 13 Aug 2018 16:48:03 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-0902-2206",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{freksen-2018-fully-under-hashin-trick,
author = "Casper Benjamin Freksen and Lior Kamma and Kasper Green
Larsen",
title = "Fully Understanding the Hashing Trick",
journal = "CoRR",
volume = "abs/1805.08539",
year = 2018,
url = "http://arxiv.org/abs/1805.08539",
archivePrefix= "arXiv",
eprint = "1805.08539",
timestamp = "Mon, 13 Aug 2018 16:49:00 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1805-08539",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{peters-2018-deep,
author = "Matthew E. Peters and Mark Neumann and Mohit Iyyer and Matt
Gardner and Christopher Clark and Kenton Lee and Luke
Zettlemoyer",
title = "Deep contextualized word representations",
journal = "CoRR",
volume = "abs/1802.05365",
year = 2018,
url = "http://arxiv.org/abs/1802.05365",
archivePrefix= "arXiv",
eprint = "1802.05365",
timestamp = "Mon, 13 Aug 2018 16:48:54 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1802-05365",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{bengio-2003-neural-probab-languag-model,
author = "Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal
and Janvin, Christian",
title = "A Neural Probabilistic Language Model",
journal = "J. Mach. Learn. Res.",
issue_date = "3/1/2003",
volume = 3,
month = mar,
year = 2003,
issn = "1532-4435",
pages = "1137-1155",
numpages = 19,
url = "http://dl.acm.org/citation.cfm?id=944919.944966",
acmid = 944966,
publisher = "JMLR.org"
}
@article{devlin-2018-bert,
author = "Jacob Devlin and Ming{-}Wei Chang and Kenton Lee and Kristina
Toutanova",
title = "{BERT:} Pre-training of Deep Bidirectional Transformers for
Language Understanding",
journal = "CoRR",
volume = "abs/1810.04805",
year = 2018,
url = "http://arxiv.org/abs/1810.04805",
archivePrefix= "arXiv",
eprint = "1810.04805",
timestamp = "Tue, 30 Oct 2018 20:39:56 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1810-04805",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{jozefowicz-2016-explor-limit-languag-model,
author = "Rafal J{\'{o}}zefowicz and Oriol Vinyals and Mike Schuster
and Noam Shazeer and Yonghui Wu",
title = "Exploring the Limits of Language Modeling",
journal = "CoRR",
volume = "abs/1602.02410",
year = 2016,
url = "http://arxiv.org/abs/1602.02410",
archivePrefix= "arXiv",
eprint = "1602.02410",
timestamp = "Mon, 13 Aug 2018 16:48:43 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/JozefowiczVSSW16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{vania-2017-from-charac-words-between,
author = "Clara Vania and Adam Lopez",
title = "From Characters to Words to in Between: Do We Capture
Morphology?",
journal = "CoRR",
volume = "abs/1704.08352",
year = 2017,
url = "http://arxiv.org/abs/1704.08352",
archivePrefix= "arXiv",
eprint = "1704.08352",
timestamp = "Mon, 13 Aug 2018 16:46:32 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/VaniaL17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{lei-2017-train-rnns-fast-cnns,
author = "Tao Lei and Yu Zhang and Yoav Artzi",
title = "Training RNNs as Fast as CNNs",
journal = "CoRR",
volume = "abs/1709.02755",
year = 2017,
url = "http://arxiv.org/abs/1709.02755",
archivePrefix= "arXiv",
eprint = "1709.02755",
timestamp = "Mon, 13 Aug 2018 16:46:29 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1709-02755",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{tang-2018-why-self-atten,
author = "Gongbo Tang and Matthias M{\"{u}}ller and Annette Rios and
Rico Sennrich",
title = "Why Self-Attention? {A} Targeted Evaluation of Neural Machine
Translation Architectures",
journal = "CoRR",
volume = "abs/1808.08946",
year = 2018,
url = "http://arxiv.org/abs/1808.08946",
archivePrefix= "arXiv",
eprint = "1808.08946",
timestamp = "Mon, 03 Sep 2018 07:29:38 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1808-08946",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@InProceedings{domhan-2018-how-much,
author = "Domhan, Tobias",
title = "How Much Attention Do You Need? A Granular Analysis of Neural
Machine Translation Architectures",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
year = 2018,
publisher = "Association for Computational Linguistics",
pages = "1799-1808",
location = "Melbourne, Australia",
url = "http://aclweb.org/anthology/P18-1167"
}
@inproceedings{li-2015-word-embed-revis,
author = "Li, Yitan and Xu, Linli and Tian, Fei and Jiang, Liang and
Zhong, Xiaowei and Chen, Enhong",
title = "Word Embedding Revisited: A New Representation Learning and
Explicit Matrix Factorization Perspective",
booktitle = "Proceedings of the 24th International Conference on
Artificial Intelligence",
series = "IJCAI'15",
year = 2015,
isbn = "978-1-57735-738-4",
location = "Buenos Aires, Argentina",
pages = "3650-3656",
numpages = 7,
url = "http://dl.acm.org/citation.cfm?id=2832747.2832758",
acmid = 2832758,
publisher = "AAAI Press"
}
@article{evans-2000-frequen,
title = "Frequency versus probability formats in statistical word
problems",
journal = "Cognition",
volume = 77,
number = 3,
pages = "197-213",
year = 2000,
issn = "0010-0277",
doi = "https://doi.org/10.1016/S0010-0277(00)00098-6",
url =
"http://www.sciencedirect.com/science/article/pii/S0010027700000986",
author = "Jonathan St.B.T Evans and Simon J Handley and Nick Perham and
David E Over and Valerie A Thompson",
keywords = "Frequency, Probability, Statistical word problems",
abstract = "Three experiments examined people's ability to incorporate
base rate information when judging posterior
probabilities. Specifically, we tested the (Cosmides, L., \&
Tooby, J. (1996). Are humans good intuitive statisticians
after all? Rethinking some conclusions from the literature on
judgement under uncertainty. Cognition, 58, 1–73) conclusion
that people's reasoning appears to follow Bayesian principles
when they are presented with information in a frequency
format, but not when information is presented as one case
probabilities. First, we found that frequency formats were
not generally associated with better performance than
probability formats unless they were presented in a manner
which facilitated construction of a set inclusion mental
model. Second, we demonstrated that the use of frequency
information may promote biases in the weighting of
information. When participants are asked to express their
judgements in frequency rather than probability format, they
were more likely to produce the base rate as their answer,
ignoring diagnostic evidence."
}
@article{griffin-1999-frequen-probab-predic,
title = "Frequency, Probability, and Prediction: Easy Solutions to
Cognitive Illusions?",
journal = "Cognitive Psychology",
volume = 38,
number = 1,
pages = "48-78",
year = 1999,
issn = "0010-0285",
doi = "https://doi.org/10.1006/cogp.1998.0707",
url =
"http://www.sciencedirect.com/science/article/pii/S0010028598907071",
author = "Dale Griffin and Roger Buehler",
abstract = "Many errors in probabilistic judgment have been attributed to
people's inability to think in statistical terms when faced
with information about a single case. Prior theoretical
analyses and empirical results imply that the errors
associated with case-specific reasoning may be reduced when
people make frequentistic predictions about a set of
cases. In studies of three previously identified cognitive
biases, we find that frequency-based predictions are
different from—but no better than—case-specific judgments of
probability. First, in studies of the “planning fallacy,” we
compare the accuracy of aggregate frequency and case-specific
probability judgments in predictions of students' real-life
projects. When aggregate and single-case predictions are
collected from different respondents, there is little
difference between the two: Both are overly optimistic and
show little predictive validity. However, in within-subject
comparisons, the aggregate judgments are significantly more
conservative than the single-case predictions, though still
optimistically biased. Results from studies of overconfidence
in general knowledge and base rate neglect in categorical
prediction underline a general conclusion. Frequentistic
predictions made for sets of events are no more statistically
sophisticated, nor more accurate, than predictions made for
individual events using subjective probability."
}
@article{xie-2017-data-noisin,
author = "Ziang Xie and Sida I. Wang and Jiwei Li and Daniel L{\'{e}}vy
and Aiming Nie and Dan Jurafsky and Andrew Y. Ng",
title = "Data Noising as Smoothing in Neural Network Language Models",
journal = "CoRR",
volume = "abs/1703.02573",
year = 2017,
url = "http://arxiv.org/abs/1703.02573",
archivePrefix= "arXiv",
eprint = "1703.02573",
timestamp = "Mon, 13 Aug 2018 16:47:17 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/XieWLLNJN17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{do-2008-what,
title = "What is the expectation maximization algorithm?",
author = "Do, Chuong B and Batzoglou, Serafim",
journal = "Nature biotechnology",
volume = 26,
number = 8,
pages = "897-899",
year = 2008,
publisher = "Nature Publishing Group"
}
@article{wei-2019-eda,
author = "Jason W. Wei and Kai Zou",
title = "{EDA:} Easy Data Augmentation Techniques for Boosting
Performance on Text Classification Tasks",
journal = "CoRR",
volume = "abs/1901.11196",
year = 2019,
url = "http://arxiv.org/abs/1901.11196",
archivePrefix= "arXiv",
eprint = "1901.11196",
timestamp = "Mon, 04 Feb 2019 08:11:03 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1901-11196",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{sennrich-2015-improv,
title = "Improving neural machine translation models with monolingual
data",
author = "Sennrich, Rico and Haddow, Barry and Birch, Alexandra",
journal = "arXiv preprint arXiv:1511.06709",
year = 2015
}
@article{smith-2017-dont-decay,
author = "Samuel L. Smith and Pieter{-}Jan Kindermans and Quoc V. Le",
title = "Don't Decay the Learning Rate, Increase the Batch Size",
journal = "CoRR",
volume = "abs/1711.00489",
year = 2017,
url = "http://arxiv.org/abs/1711.00489",
archivePrefix= "arXiv",
eprint = "1711.00489",
timestamp = "Mon, 13 Aug 2018 16:46:33 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1711-00489",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{su-2016-differ-equat,
author = "Weijie Su and Stephen Boyd and Emmanuel J. C and {{\`e}}s",
title = "A Differential Equation for Modeling Nesterov's Accelerated
Gradient Method: Theory and Insights",
journal = "Journal of Machine Learning Research",
year = 2016,
volume = 17,
number = 153,
pages = "1-43",
url = "http://jmlr.org/papers/v17/15-084.html"
}
@article{arora-2012-mwum,
title = "The multiplicative weights update method: a meta-algorithm
and applications",
author = "Arora, Sanjeev and Hazan, Elad and Kale, Satyen",
journal = "Theory of Computing",
volume = 8,
number = 1,
pages = "121-164",
year = 2012,
publisher = "Theory of Computing Exchange"
}
@article{li-2018-deep-reinf-learn,
author = "Yuxi Li",
title = "Deep Reinforcement Learning",
journal = "CoRR",
volume = "abs/1810.06339",
year = 2018,
url = "http://arxiv.org/abs/1810.06339",
archivePrefix= "arXiv",
eprint = "1810.06339",
timestamp = "Tue, 30 Oct 2018 20:39:56 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1810-06339",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{balles-2017-follow-signs,
author = "Lukas Balles and Philipp Hennig",
title = "Follow the Signs for Robust Stochastic Optimization",
journal = "CoRR",
volume = "abs/1705.07774",
year = 2017,
url = "http://arxiv.org/abs/1705.07774",
archivePrefix= "arXiv",
eprint = "1705.07774",
timestamp = "Mon, 13 Aug 2018 16:48:00 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/BallesH17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{sutskever-2013-impor-initial,
author = "Sutskever, Ilya and Martens, James and Dahl, George and
Hinton, Geoffrey",
title = "On the Importance of Initialization and Momentum in Deep
Learning",
booktitle = "Proceedings of the 30th International Conference on
International Conference on Machine Learning - Volume 28",
series = "ICML'13",
year = 2013,
location = "Atlanta, GA, USA",
pages = "III-1139--III-1147",
url = "http://dl.acm.org/citation.cfm?id=3042817.3043064",
acmid = 3043064,
publisher = "JMLR.org"
}
@article{inoue-2018-data-augmen,
author = "Hiroshi Inoue",
title = "Data Augmentation by Pairing Samples for Images
Classification",
journal = "CoRR",
volume = "abs/1801.02929",
year = 2018,
url = "http://arxiv.org/abs/1801.02929",
archivePrefix= "arXiv",
eprint = "1801.02929",
timestamp = "Mon, 13 Aug 2018 16:46:20 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1801-02929",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{zhang-2017-mixup,
author = "Hongyi Zhang and Moustapha Ciss{\'{e}} and Yann N. Dauphin
and David Lopez{-}Paz",
title = "mixup: Beyond Empirical Risk Minimization",
journal = "CoRR",
volume = "abs/1710.09412",
year = 2017,
url = "http://arxiv.org/abs/1710.09412",
archivePrefix= "arXiv",
eprint = "1710.09412",
timestamp = "Mon, 13 Aug 2018 16:47:14 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1710-09412",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{cubuk-2018-autoaugment,
author = "Ekin Dogus Cubuk and Barret Zoph and Dandelion Man{\'{e}} and
Vijay Vasudevan and Quoc V. Le",
title = "AutoAugment: Learning Augmentation Policies from Data",
journal = "CoRR",
volume = "abs/1805.09501",
year = 2018,
url = "http://arxiv.org/abs/1805.09501",
archivePrefix= "arXiv",
eprint = "1805.09501",
timestamp = "Mon, 13 Aug 2018 16:48:44 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1805-09501",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{li-2018-under-dishar,
author = "Xiang Li and Shuo Chen and Xiaolin Hu and Jian Yang",
title = "Understanding the Disharmony between Dropout and Batch
Normalization by Variance Shift",
journal = "CoRR",
volume = "abs/1801.05134",
year = 2018,
url = "http://arxiv.org/abs/1801.05134",
archivePrefix= "arXiv",
eprint = "1801.05134",
timestamp = "Fri, 21 Dec 2018 14:34:10 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1801-05134",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{bergstra-2012-random-searc-hyper-optim,
author = "Bergstra, James and Bengio, Yoshua",
title = "Random Search for Hyper-parameter Optimization",
journal = "J. Mach. Learn. Res.",
issue_date = "January 2012",
volume = 13,
number = 1,
month = feb,
year = 2012,
issn = "1532-4435",
pages = "281-305",
numpages = 25,
url = "http://dl.acm.org/citation.cfm?id=2503308.2188395",
acmid = 2188395,
publisher = "JMLR.org",
keywords = "deep learning, global optimization, model selection, neural
networks, response surface modeling"
}
@article{masters-2018-revis-small,
author = "Dominic Masters and Carlo Luschi",
title = "Revisiting Small Batch Training for Deep Neural Networks",
journal = "CoRR",
volume = "abs/1804.07612",
year = 2018,
url = "http://arxiv.org/abs/1804.07612",
archivePrefix= "arXiv",
eprint = "1804.07612",
timestamp = "Mon, 13 Aug 2018 16:48:13 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-07612",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{tsuruoka-2009-sgd-l1,
title = "Stochastic gradient descent training for l1-regularized
log-linear models with cumulative penalty",
author = "Tsuruoka, Yoshimasa and Tsujii, Jun'ichi and Ananiadou,
Sophia",
booktitle = "Proceedings of the Joint Conference of the 47th Annual
Meeting of the ACL and the 4th International Joint Conference
on Natural Language Processing of the AFNLP: Volume 1-Volume
1",
pages = "477-485",
year = 2009,
organization = "Association for Computational Linguistics"
}
@inproceedings{wilson-2017-margin-value,
title = "The Marginal Value of Adaptive Gradient Methods in Machine
Learning",
author = "Wilson, Ashia C and Roelofs, Rebecca and Stern, Mitchell and
Srebro, Nati and Recht, Benjamin",
booktitle = "Advances in Neural Information Processing Systems 30",
editor = "I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett",
pages = "4148-4158",
year = 2017,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/7003-the-marginal-value-of-adaptive-gradient-methods-in-machine-learning.pdf"
}
@inproceedings{hoffer-2017-train-longer,
title = "Train longer, generalize better: closing the generalization
gap in large batch training of neural networks",
author = "Hoffer, Elad and Hubara, Itay and Soudry, Daniel",
booktitle = "Advances in Neural Information Processing Systems",
pages = "1731-1741",
year = 2017
}
@inproceedings{santurkar-2018-how,
title = "How does batch normalization help optimization?",
author = "Santurkar, Shibani and Tsipras, Dimitris and Ilyas, Andrew
and Madry, Aleksander",
booktitle = "Advances in Neural Information Processing Systems",
pages = "2483-2493",
year = 2018
}
@article{breiman-2001-statistical-modeling,
title = "Statistical modeling: The two cultures (with comments and a
rejoinder by the author)",
author = "Breiman, Leo and others",
journal = "Statistical science",
volume = 16,
number = 3,
pages = "199-231",
year = 2001,
publisher = "Institute of Mathematical Statistics"
}
@article{howard-2018-fine-lang,
author = "Jeremy Howard and Sebastian Ruder",
title = "Fine-tuned Language Models for Text Classification",
journal = "CoRR",
volume = "abs/1801.06146",
year = 2018,
url = "http://arxiv.org/abs/1801.06146",
archivePrefix= "arXiv",
eprint = "1801.06146",
timestamp = "Mon, 13 Aug 2018 16:46:54 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1801-06146",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{bai-2018-tcn,
author = "Shaojie Bai and J. Zico Kolter and Vladlen Koltun",
title = "An Empirical Evaluation of Generic Convolutional and
Recurrent Networks for Sequence Modeling",
journal = "CoRR",
volume = "abs/1803.01271",
year = 2018,
url = "http://arxiv.org/abs/1803.01271",
archivePrefix= "arXiv",
eprint = "1803.01271",
timestamp = "Mon, 13 Aug 2018 16:47:39 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1803-01271",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{nemirovski-2009-robust,
title = "Robust stochastic approximation approach to stochastic
programming",
author = "Nemirovski, Arkadi and Juditsky, Anatoli and Lan, Guanghui
and Shapiro, Alexander",
journal = "SIAM Journal on optimization",
volume = 19,
number = 4,
pages = "1574-1609",
year = 2009,
publisher = "SIAM"
}
@article{todorov-2016-optim,
year = 2006,
title = "Optimal control theory",
author = "Todorov, Emanuel",
journal = "Bayesian brain: probabilistic approaches to neural coding",
pages = "269-298",
publisher = "MIT Press Cambridge (Massachusetts)"
}
@article{zhao-2019-chines-word-segmen,
author = "Hai Zhao and Deng Cai and Changning Huang and Chunyu Kit",
title = "Chinese Word Segmentation: Another Decade Review
{(2007-2017)}",
journal = "CoRR",
volume = "abs/1901.06079",
year = 2019,
url = "http://arxiv.org/abs/1901.06079",
archivePrefix= "arXiv",
eprint = "1901.06079",
timestamp = "Fri, 01 Feb 2019 13:39:59 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1901-06079",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{johnson-2013-svrg,
title = "Accelerating stochastic gradient descent using predictive
variance reduction",
author = "Johnson, Rie and Zhang, Tong",
booktitle = "Advances in neural information processing systems",
pages = "315-323",
year = 2013
}
@article{defazio-2014-saga,
author = "Aaron Defazio and Francis R. Bach and Simon Lacoste{-}Julien",
title = "{SAGA:} {A} Fast Incremental Gradient Method With Support for
Non-Strongly Convex Composite Objectives",
journal = "CoRR",
volume = "abs/1407.0202",
year = 2014,
url = "http://arxiv.org/abs/1407.0202",
archivePrefix= "arXiv",
eprint = "1407.0202",
timestamp = "Mon, 13 Aug 2018 16:46:52 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/DefazioBL14",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{pinter-2017-mimic-word,
author = "Yuval Pinter and Robert Guthrie and Jacob Eisenstein",
title = "Mimicking Word Embeddings using Subword RNNs",
journal = "CoRR",
volume = "abs/1707.06961",
year = 2017,
url = "http://arxiv.org/abs/1707.06961",
archivePrefix= "arXiv",
eprint = "1707.06961",
timestamp = "Mon, 13 Aug 2018 16:46:53 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/PinterGE17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{kiperwasser-2016-simpl-accur,
title = "Simple and Accurate Dependency Parsing Using Bidirectional
LSTM Feature Representations",
author = "Kiperwasser, Eliyahu and Goldberg, Yoav",
journal = "Transactions of the Association for Computational
Linguistics",
volume = 4,
year = 2016,
url = "https://www.aclweb.org/anthology/Q16-1023",
pages = "313-327",
abstract = "We present a simple and effective scheme for dependency
parsing which is based on bidirectional-LSTMs (BiLSTMs). Each
sentence token is associated with a BiLSTM vector
representing the token in its sentential context, and feature
vectors are constructed by concatenating a few BiLSTM
vectors. The BiLSTM is trained jointly with the parser
objective, resulting in very effective feature extractors for
parsing. We demonstrate the effectiveness of the approach by
applying it to a greedy transition-based parser as well as to
a globally optimized graph-based parser. The resulting
parsers have very simple architectures, and match or surpass
the state-of-the-art accuracies on English and Chinese."
}
@article{baltescu-2014-pragm,
title = "Pragmatic neural language modelling in machine translation",
author = "Baltescu, Paul and Blunsom, Phil",
journal = "arXiv preprint arXiv:1412.7119",
year = 2014
}
@article{cooijmans-2016-recur-batch-normal,
author = "Tim Cooijmans and Nicolas Ballas and C{\'{e}}sar Laurent and
Aaron C. Courville",
title = "Recurrent Batch Normalization",
journal = "CoRR",
volume = "abs/1603.09025",
year = 2016,
url = "http://arxiv.org/abs/1603.09025",
archivePrefix= "arXiv",
eprint = "1603.09025",
timestamp = "Mon, 13 Aug 2018 16:48:30 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/CooijmansBLC16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{dehghani-2018-univer-trans,
author = "Mostafa Dehghani and Stephan Gouws and Oriol Vinyals and
Jakob Uszkoreit and Lukasz Kaiser",
title = "Universal Transformers",
journal = "CoRR",
volume = "abs/1807.03819",
year = 2018,
url = "http://arxiv.org/abs/1807.03819",
archivePrefix= "arXiv",
eprint = "1807.03819",
timestamp = "Mon, 13 Aug 2018 16:49:11 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1807-03819",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{dai-2019-trans-xl,
author = "Zihang Dai and Zhilin Yang and Yiming Yang and Jaime
G. Carbonell and Quoc V. Le and Ruslan Salakhutdinov",
title = "Transformer-XL: Attentive Language Models Beyond a
Fixed-Length Context",
journal = "CoRR",
volume = "abs/1901.02860",
year = 2019,
url = "http://arxiv.org/abs/1901.02860",
archivePrefix= "arXiv",
eprint = "1901.02860",
timestamp = "Fri, 01 Feb 2019 13:39:59 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1901-02860",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{popel-2018-train-tips-trans-model,
author = "Martin Popel and Ondrej Bojar",
title = "Training Tips for the Transformer Model",
journal = "CoRR",
volume = "abs/1804.00247",
year = 2018,
url = "http://arxiv.org/abs/1804.00247",
archivePrefix= "arXiv",
eprint = "1804.00247",
timestamp = "Mon, 13 Aug 2018 16:47:13 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-00247",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{liu-2019-linguis-knowl,
author = "Nelson F. Liu and Matt Gardner and Yonatan Belinkov and
Matthew Peters and Noah A. Smith",
title = "Linguistic Knowledge and Transferability of Contextual
Representations",
journal = "CoRR",
volume = "abs/1903.08855",
year = 2019,
url = "http://arxiv.org/abs/1903.08855",
archivePrefix= "arXiv",
eprint = "1903.08855",
timestamp = "Mon, 01 Apr 2019 14:07:37 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1903-08855",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{peters-2019-to-tune-not-tune,
author = "Matthew Peters and Sebastian Ruder and Noah A. Smith",
title = "To Tune or Not to Tune? Adapting Pretrained Representations
to Diverse Tasks",
journal = "CoRR",
volume = "abs/1903.05987",
year = 2019,
url = "http://arxiv.org/abs/1903.05987",
archivePrefix= "arXiv",
eprint = "1903.05987",
timestamp = "Sun, 31 Mar 2019 19:01:24 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1903-05987",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{li-2011-learn-to-rank,
title = "A short introduction to learning to rank",
author = "Li, Hang",
journal = "IEICE TRANSACTIONS on Information and Systems",
volume = 94,
number = 10,
pages = "1854-1862",
year = 2011,
publisher = "The Institute of Electronics, Information and Communication
Engineers"
}
@techreport{burges-2010-from-ranknet,
title = "From RankNet to LambdaRank to LambdaMART: An Overview",
author = "Burges, Christopher J. C.",
institution = "Microsoft Research",
number = "MSR-TR-2010-82",
year = 2010
}
@Article{geurts-2006-extreme,
author = "Geurts, Pierre and Ernst, Damien and Wehenkel, Louis",
title = "Extremely randomized trees",
journal = "Machine Learning",
year = 2006,
month = "Apr",
day = 01,
volume = 63,
number = 1,
pages = "3-42",
abstract = "This paper proposes a new tree-based ensemble method for
supervised classification and regression problems. It
essentially consists of randomizing strongly both attribute
and cut-point choice while splitting a tree node. In the
extreme case, it builds totally randomized trees whose
structures are independent of the output values of the
learning sample. The strength of the randomization can be
tuned to problem specifics by the appropriate choice of a
parameter. We evaluate the robustness of the default choice
of this parameter, and we also provide insight on how to
adjust it in particular situations. Besides accuracy, the
main strength of the resulting algorithm is computational
efficiency. A bias/variance analysis of the Extra-Trees
algorithm is also provided as well as a geometrical and a
kernel characterization of the models induced.",
issn = "1573-0565",
doi = "10.1007/s10994-006-6226-1",
url = "https://doi.org/10.1007/s10994-006-6226-1"
}
@ARTICLE{chase-2014-thres-class,
author = "{Chase Lipton}, Zachary and {Elkan}, Charles and
{Narayanaswamy}, Balakrishnan",
title = "{Thresholding Classifiers to Maximize F1 Score}",
journal = "arXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Information
Retrieval, Computer Science - Machine Learning",
year = 2014,
month = "Feb",
eid = "arXiv:1402.1892",
pages = "arXiv:1402.1892",
archivePrefix= "arXiv",
eprint = "1402.1892",
primaryClass = "stat.ML",
adsurl = "https://ui.adsabs.harvard.edu/abs/2014arXiv1402.1892C",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{baak-2018-phik,
author = "{Baak}, M. and {Koopman}, R. and {Snoek}, H. and {Klous}, S.",
title = "{A new correlation coefficient between categorical, ordinal
and interval variables with Pearson characteristics}",
journal = "arXiv e-prints",
keywords = "Statistics - Methodology",
year = 2018,
month = "Nov",
eid = "arXiv:1811.11440",
pages = "arXiv:1811.11440",
archivePrefix= "arXiv",
eprint = "1811.11440",
primaryClass = "stat.ME",
adsurl = "https://ui.adsabs.harvard.edu/abs/2018arXiv181111440B",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{li-2008-learn-rank,
title = "Learning to Rank Using Classification and Gradient Boosting",
author = "Ping Li",
booktitle = "NIPS 2008",
year = 2008
}
@inproceedings{li-2007-mcrank,
title = "McRank: Learning to Rank Using Multiple Classification and
Gradient Boosting",
author = "P. H. W. Li and Christopher J. C. Burges and Qiang Wu",
booktitle = "NIPS",
year = 2007
}
@inproceedings{ke-2017-lightgbm,
title = "Lightgbm: A highly efficient gradient boosting decision tree",
author = "Ke, Guolin and Meng, Qi and Finley, Thomas and Wang, Taifeng
and Chen, Wei and Ma, Weidong and Ye, Qiwei and Liu, Tie-Yan",
booktitle = "Advances in Neural Information Processing Systems",
pages = "3146-3154",
year = 2017
}
@incollection{schapire-2013-explain-adaboost,
title = "Explaining adaboost",
author = "Schapire, Robert E",
booktitle = "Empirical inference",
pages = "37-52",
year = 2013,
publisher = "Springer"
}
@inproceedings{pardoe-2010-boost-regres-trans,
author = "Pardoe, David and Stone, Peter",
title = "Boosting for Regression Transfer",
booktitle = "Proceedings of the 27th International Conference on
International Conference on Machine Learning",
series = "ICML'10",
year = 2010,
isbn = "978-1-60558-907-7",
location = "Haifa, Israel",
pages = "863-870",
numpages = 8,
url = "http://dl.acm.org/citation.cfm?id=3104322.3104432",
acmid = 3104432,
publisher = "Omnipress",
address = "USA"
}
@article{dorogush-2018-catboost,
author = "Anna Veronika Dorogush and Vasily Ershov and Andrey Gulin",
title = "CatBoost: gradient boosting with categorical features
support",
journal = "CoRR",
volume = "abs/1810.11363",
year = 2018,
url = "http://arxiv.org/abs/1810.11363",
archivePrefix= "arXiv",
eprint = "1810.11363",
timestamp = "Wed, 31 Oct 2018 14:24:29 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1810-11363",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{freund-1997-decis-theor,
title = "A Decision-Theoretic Generalization of On-Line Learning and
an Application to Boosting",
journal = "Journal of Computer and System Sciences",
volume = 55,
number = 1,
pages = "119-139",
year = 1997,
issn = "0022-0000",
doi = "https://doi.org/10.1006/jcss.1997.1504",
url =
"http://www.sciencedirect.com/science/article/pii/S002200009791504X",
author = "Yoav Freund and Robert E Schapire",
abstract = "In the first part of the paper we consider the problem of
dynamically apportioning resources among a set of options in
a worst-case on-line framework. The model we study can be
interpreted as a broad, abstract extension of the
well-studied on-line prediction model to a general
decision-theoretic setting. We show that the multiplicative
weight-update Littlestone–Warmuth rule can be adapted to this
model, yielding bounds that are slightly weaker in some
cases, but applicable to a considerably more general class of
learning problems. We show how the resulting learning
algorithm can be applied to a variety of problems, including
gambling, multiple-outcome prediction, repeated games, and
prediction of points in Rn. In the second part of the paper
we apply the multiplicative weight-update technique to derive
a new boosting algorithm. This boosting algorithm does not
require any prior knowledge about the performance of the weak
learning algorithm. We also study generalizations of the new
boosting algorithm to the problem of learning functions whose
range, rather than being binary, is an arbitrary finite set
or a bounded segment of the real line."
}
@inproceedings{niculescu-mizil-2005-predic,
title = "Predicting good probabilities with supervised learning",
author = "Niculescu-Mizil, Alexandru and Caruana, Rich",
booktitle = "Proceedings of the 22nd international conference on Machine
learning",
pages = "625-632",
year = 2005,
organization = "ACM"
}
@article{kaufman-2012-leakage,
title = "Leakage in data mining: Formulation, detection, and
avoidance",
author = "Kaufman, Shachar and Rosset, Saharon and Perlich, Claudia and
Stitelman, Ori",
journal = "ACM Transactions on Knowledge Discovery from Data (TKDD)",
volume = 6,
number = 4,
pages = 15,
year = 2012,
publisher = "ACM"
}
@article{micci-barreca-2001-target-encoding,
author = "Micci-Barreca, Daniele",
title = "A Preprocessing Scheme for High-cardinality Categorical
Attributes in Classification and Prediction Problems",
journal = "SIGKDD Explor. Newsl.",
issue_date = "July 2001",
volume = 3,
number = 1,
month = jul,
year = 2001,
issn = "1931-0145",
pages = "27-32",
numpages = 6,
url = "http://doi.acm.org/10.1145/507533.507538",
doi = "10.1145/507533.507538",
acmid = 507538,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "categorical attributes, empirical bayes, hierarchical
attributes, neural networks, predictive models"
}
@phdthesis{shi-2007-best,
title = "Best-first decision tree learning",
author = "Shi, Haijian",
year = 2007,
school = "The University of Waikato"
}
@article{fisher-1958-group-maxim-homog,
author = "Fisher, Walter D",
title = "On Grouping for Maximum Homogeneity",
journal = "Journal of the American statistical Association",
volume = 53,
number = 284,
pages = "789-798",
year = 1958,
publisher = "Taylor \\& Francis"
}
@article{friedman-2010-regul-paths,
author = "Friedman, Jerome and Hastie, Trevor and Tibshirani, Rob",
title = "Regularization Paths for Generalized Linear Models Via
Coordinate Descent",
journal = "Journal of statistical software",
volume = 33,
number = 1,
pages = 1,
year = 2010,
publisher = "NIH Public Access"
}
@inproceedings{kerber-1992-chimerge,
author = "Kerber, Randy",
title = "Chimerge: Discretization of numeric attributes",
booktitle = "Proceedings of the tenth national conference on Artificial
intelligence",
year = 1992,
pages = "123-128",
organization = "Aaai Press"
}
@article{harrell-2017-regres-model-strat,
author = "Harrell Jr, Frank E",
title = "Regression Modeling Strategies",
journal = "BIOS",
volume = 330,
year = 2017
}
@article{ribeiro-2016-lime,
author = "Marco T{\'{u}}lio Ribeiro and Sameer Singh and Carlos
Guestrin",
title = "``Why Should {I} Trust You?'': Explaining the Predictions of
Any Classifier",
journal = "CoRR",
volume = "abs/1602.04938",
year = 2016,
url = "http://arxiv.org/abs/1602.04938",
archivePrefix= "arXiv",
eprint = "1602.04938",
timestamp = "Mon, 13 Aug 2018 16:49:09 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/RibeiroSG16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{lei-2016-dist-free,
author = "{Lei}, Jing and {G'Sell}, Max and {Rinaldo}, Alessandro and
{Tibshirani}, Ryan J. and {Wasserman}, Larry",
title = "{Distribution-Free Predictive Inference For Regression}",
journal = "arXiv e-prints",
keywords = "Statistics - Methodology, Mathematics - Statistics Theory,
Statistics - Machine Learning",
year = 2016,
month = "Apr",
eid = "arXiv:1604.04173",
pages = "arXiv:1604.04173",
archivePrefix= "arXiv",
eprint = "1604.04173",
primaryClass = "stat.ME",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160404173L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@incollection{lundberg-2017-unified-approac,
author = "Lundberg, Scott M and Lee, Su-In",
booktitle = "Advances in Neural Information Processing Systems 30",
editor = "I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett",
pages = "4765-4774",
publisher = "Curran Associates, Inc.",
title = "A Unified Approach to Interpreting Model Predictions",
url =
"http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf",
year = 2017
}
@inproceedings{kohavi-1995-study-cross,
author = "Kohavi, Ron",
title = "A Study of Cross-validation and Bootstrap for Accuracy
Estimation and Model Selection",
booktitle = "Proceedings of the 14th International Joint Conference on
Artificial Intelligence - Volume 2",
series = "IJCAI'95",
year = 1995,
isbn = "1-55860-363-8",
location = "Montreal, Quebec, Canada",
pages = "1137-1143",
numpages = 7,
url = "http://dl.acm.org/citation.cfm?id=1643031.1643047",
acmid = 1643047,
publisher = "Morgan Kaufmann Publishers Inc.",
address = "San Francisco, CA, USA"
}
@inproceedings{kanter-2015-deep,
author = "Kanter, James Max and Veeramachaneni, Kalyan",
title = "Deep feature synthesis: Towards automating data science
endeavors",
booktitle = "2015 IEEE International Conference on Data Science and
Advanced Analytics (DSAA)",
year = 2015,
pages = "1-10",
organization = "IEEE"
}
@article{yuan-2006-group-lasso,
author = "Yuan, Ming and Lin, Yi",
title = "Model Selection and Estimation in Regression With Grouped
Variables",
journal = "Journal of the Royal Statistical Society: Series B
(Statistical Methodology)",
volume = 68,
number = 1,
pages = "49-67",
year = 2006,
publisher = "Wiley Online Library"
}
@article{tibshirani-2005-fused-lasso,
author = "Tibshirani, Robert and Saunders, Michael and Rosset, Saharon
and Zhu, Ji and Knight, Keith",
title = "Sparsity and Smoothness Via the Fused Lasso",
journal = "Journal of the Royal Statistical Society: Series B
(Statistical Methodology)",
volume = 67,
number = 1,
pages = "91-108",
year = 2005,
publisher = "Wiley Online Library"
}
@ARTICLE{gregorutti-2013-correl,
author = "{Gregorutti}, Baptiste and {Michel}, Bertrand and
{Saint-Pierre}, Philippe",
title = "{Correlation and variable importance in random forests}",
journal = "arXiv e-prints",
keywords = "Statistics - Methodology",
year = 2013,
month = "Oct",
eid = "arXiv:1310.5726",
pages = "arXiv:1310.5726",
archivePrefix= "arXiv",
eprint = "1310.5726",
primaryClass = "stat.ME",
adsurl = "https://ui.adsabs.harvard.edu/abs/2013arXiv1310.5726G",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{he-2008-adasyn,
author = "He, Haibo and Bai, Yang and Garcia, Edwardo A and Li, Shutao",
title = "ADASYN: Adaptive synthetic sampling approach for imbalanced
learning",
booktitle = "2008 IEEE International Joint Conference on Neural Networks
(IEEE World Congress on Computational Intelligence)",
year = 2008,
pages = "1322-1328",
organization = "IEEE"
}
@inproceedings{han-2005-border-smote,
author = "Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan",
title = "Borderline-SMOTE: a new over-sampling method in imbalanced
data sets learning",
booktitle = "International conference on intelligent computing",
year = 2005,
pages = "878-887",
organization = "Springer"
}
@inproceedings{nguyen-2009-border,
author = "Nguyen, Hien M and Cooper, Eric W and Kamei, Katsuari",
title = "Borderline over-sampling for imbalanced data classification",
booktitle = "Proceedings: Fifth International Workshop on Computational
             Intelligence \& Applications",
year = 2009,
volume = 2009,
number = 1,
pages = "24-29",
organization = "IEEE SMC Hiroshima Chapter"
}
@article{last-2017-overs-imbal,
author = "Felix Last and Georgios Douzas and Fernando
Ba{\c{c}}{\~{a}}o",
title = "Oversampling for Imbalanced Learning Based on K-Means and
{SMOTE}",
journal = "CoRR",
volume = "abs/1711.00837",
year = 2017,
url = "http://arxiv.org/abs/1711.00837",
archivePrefix= "arXiv",
eprint = "1711.00837",
timestamp = "Wed, 10 Oct 2018 15:58:34 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1711-00837",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{mani-2003-knn,
author = "Mani, Inderjeet and Zhang, I",
title = "kNN approach to unbalanced data distributions: a case study
involving information extraction",
booktitle = "Proceedings of workshop on learning from imbalanced datasets",
year = 2003,
volume = 126
}
@article{tomek-1976-two-modif-cnn,
added-at = "2007-08-22T12:37:55.000+0200",
author = "Tomek, I.",
biburl =
"https://www.bibsonomy.org/bibtex/2523c1d70243d3fe9035269af8f6f5ecd/bsmyth",
description = "AI 2001 Elizabeth McKenna Barry Smyth",
interhash = "379fe276cf4a77f8fba21a949b2d72d6",
intrahash = "523c1d70243d3fe9035269af8f6f5ecd",
journal = "{IEEE Transactions on Systems, Man, and Cybernetics}",
keywords = "imported",
pages = "679-772",
timestamp = "2007-08-22T12:37:55.000+0200",
title = "{Two Modifications of CNN}",
volume = "7(2)",
year = 1976
}
@ARTICLE{wilson-1972-asymp-proper,
author = "D. L. {Wilson}",
journal = "IEEE Transactions on Systems, Man, and Cybernetics",
title = "Asymptotic Properties of Nearest Neighbor Rules Using Edited
Data",
year = 1972,
volume = "SMC-2",
number = 3,
pages = "408-421",
keywords = "Nearest neighbor searches;Random
variables;Convergence;Character recognition;Decoding;Pattern
recognition",
doi = "10.1109/TSMC.1972.4309137",
ISSN = "0018-9472",
month = "July"
}
@article{hand-1978-exper,
title = "Experiments on the edited condensed nearest neighbor rule",
journal = "Information Sciences",
volume = 14,
number = 3,
pages = "171-180",
year = 1978,
issn = "0020-0255",
doi = "https://doi.org/10.1016/0020-0255(78)90040-3",
url =
"http://www.sciencedirect.com/science/article/pii/0020025578900403",
author = "D.J. Hand and B.G. Batchelor",
abstract = "Tomek's preprocessing scheme is discussed for editing the
training set prior to analyzing it by Hart's condensed
nearest neighbor technique. Preprocessing was performed by a
κ-nearest-neighbor pdf estimation scheme, although other
methods are suggested in this paper. The procedure was
studied experimentally and was found to achieve a significant
reduction in the storage requirements of the CNN method while
maintaining approximately the same error rate, or even
improving it."
}
@article{hart-2006-conden-neares,
author = "Hart, P.",
title = "The Condensed Nearest Neighbor Rule (Corresp.)",
journal = "IEEE Trans. Inf. Theor.",
issue_date = "May 1968",
volume = 14,
number = 3,
month = may,
year = 1968,
issn = "0018-9448",
pages = "515-516",
numpages = 2,
url = "https://doi.org/10.1109/TIT.1968.1054155",
doi = "10.1109/TIT.1968.1054155",
acmid = 2267647,
publisher = "IEEE Press",
address = "Piscataway, NJ, USA"
}
@inproceedings{kubat-1997-addres-curse,
title = "Addressing the Curse of Imbalanced Training Sets: One-Sided
Selection",
author = "Miroslav Kubat and Stan Matwin",
booktitle = "ICML",
year = 1997
}
@inproceedings{laurikkala-2001-improv,
title = "Improving identification of difficult small classes by
balancing class distribution",
author = "Laurikkala, Jorma",
booktitle = "Conference on Artificial Intelligence in Medicine in Europe",
pages = "63-66",
year = 2001,
organization = "Springer"
}
@article{smith-2014-instan-level,
author = "Smith, Michael R. and Martinez, Tony and Giraud-Carrier,
Christophe",
title = "An Instance Level Analysis of Data Complexity",
journal = "Mach. Learn.",
issue_date = "May 2014",
volume = 95,
number = 2,
month = may,
year = 2014,
issn = "0885-6125",
pages = "225-256",
numpages = 32,
url = "https://doi.org/10.1007/s10994-013-5422-z",
doi = "10.1007/s10994-013-5422-z",
acmid = 2843686,
publisher = "Kluwer Academic Publishers",
address = "Norwell, MA, USA",
keywords = "Data complexity, Dataset hardness, Instance hardness"
}
@article{batista-2004-study-behav,
author = "Batista, Gustavo EAPA and Prati, Ronaldo C and Monard, Maria
Carolina",
title = "A Study of the Behavior of Several Methods for Balancing
Machine Learning Training Data",
journal = "ACM SIGKDD explorations newsletter",
volume = 6,
number = 1,
pages = "20-29",
year = 2004,
publisher = "ACM"
}
@MISC{batista-2003-balan-train,
author = "Gustavo E. A. P. A. Batista and Ana L. C. Bazzan and Maria
Carolina Monard",
title = "Balancing Training Data for Automated Annotation of Keywords:
a Case Study",
year = 2003
}
@Article{andrieu-2003-introd-mcmc-machin-learn,
author = "Andrieu, Christophe and de Freitas, Nando and Doucet, Arnaud
and Jordan, Michael I.",
title = "An Introduction to MCMC for Machine Learning",
journal = "Machine Learning",
year = 2003,
month = "Jan",
day = 01,
volume = 50,
number = 1,
pages = "5-43",
abstract = "This purpose of this introductory paper is threefold. First,
it introduces the Monte Carlo method with emphasis on
probabilistic machine learning. Second, it reviews the main
building blocks of modern Markov chain Monte Carlo
simulation, thereby providing and introduction to the
remaining papers of this special issue. Lastly, it discusses
new interesting research horizons.",
issn = "1573-0565",
doi = "10.1023/A:1020281327116",
url = "https://doi.org/10.1023/A:1020281327116"
}
@article{scholkopf-2000-new-suppor-vector-algor,
title = "New Support Vector Algorithms",
author = "Sch{\\\"o}lkopf, Bernhard and Smola, Alex J and Williamson,
Robert C and Bartlett, Peter L",
journal = "Neural computation",
volume = 12,
number = 5,
pages = "1207-1245",
year = 2000,
publisher = "MIT Press"
}
@article{scholkopf-2001-estim-suppor,
title = "Estimating the Support of a High-Dimensional Distribution",
author = "Sch{\\\"o}lkopf, Bernhard and Platt, John C and Shawe-Taylor,
John and Smola, Alex J and Williamson, Robert C",
journal = "Neural computation",
volume = 13,
number = 7,
pages = "1443-1471",
year = 2001,
publisher = "MIT Press"
}
@article{lampert-2009-kernel-method-comput-vision,
author = "Lampert, Christoph H and others",
title = "Kernel Methods in Computer Vision",
journal = "Foundations and Trends{\\textregistered} in Computer Graphics
and Vision",
volume = 4,
number = 3,
pages = "193-285",
year = 2009,
publisher = "Now Publishers, Inc."
}
@article{tax-2004-suppor-vector-data-descr,
author = "Tax, David MJ and Duin, Robert PW",
title = "Support Vector Data Description",
journal = "Machine learning",
volume = 54,
number = 1,
pages = "45-66",
year = 2004,
publisher = "Springer"
}
@inproceedings{liu-2008-isolat,
title = "Isolation forest",
author = "Liu, Fei Tony and Ting, Kai Ming and Zhou, Zhi-Hua",
booktitle = "2008 Eighth IEEE International Conference on Data Mining",
pages = "413-422",
year = 2008,
organization = "IEEE"
}
@inproceedings{breunig-2000-lof,
title = "LOF: identifying density-based local outliers",
author = "Breunig, Markus M and Kriegel, Hans-Peter and Ng, Raymond T
and Sander, J{\"o}rg",
booktitle = "ACM sigmod record",
volume = 29,
number = 2,
pages = "93-104",
year = 2000,
organization = "ACM"
}
@article{goyal-2017-accur-large-minib-sgd,
author = "Priya Goyal and Piotr Doll{\'{a}}r and Ross B. Girshick and
Pieter Noordhuis and Lukasz Wesolowski and Aapo Kyrola and
Andrew Tulloch and Yangqing Jia and Kaiming He",
title = "Accurate, Large Minibatch {SGD:} Training ImageNet in 1 Hour",
journal = "CoRR",
volume = "abs/1706.02677",
year = 2017,
url = "http://arxiv.org/abs/1706.02677",
archivePrefix= "arXiv",
eprint = "1706.02677",
timestamp = "Mon, 13 Aug 2018 16:49:10 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/GoyalDGNWKTJH17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{howard-2018-univer-languag,
author = "{Howard}, Jeremy and {Ruder}, Sebastian",
title = "{Universal Language Model Fine-tuning for Text
Classification}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Machine Learning, Statistics - Machine Learning",
year = 2018,
month = "Jan",
eid = "arXiv:1801.06146",
pages = "arXiv:1801.06146",
archivePrefix= "arXiv",
eprint = "1801.06146",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2018arXiv180106146H",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{griffiths-2004-finding,
author = "Griffiths, Thomas L. and Steyvers, Mark",
title = "Finding scientific topics",
volume = 101,
number = "suppl 1",
pages = "5228-5235",
year = 2004,
doi = "10.1073/pnas.0307752101",
publisher = "National Academy of Sciences",
abstract = "A first step in identifying the content of a document is
determining which topics that document addresses. We describe
a generative model for documents, introduced by Blei, Ng, and
Jordan [Blei, D. M., Ng, A. Y. \& Jordan, M. I. (2003)
J. Machine Learn. Res. 3, 993-1022], in which each document
is generated by choosing a distribution over topics and then
choosing each word in the document from a topic selected
according to this distribution. We then present a Markov
chain Monte Carlo algorithm for inference in this model. We
use this algorithm to analyze abstracts from PNAS by using
Bayesian model selection to establish the number of
topics. We show that the extracted topics capture meaningful
structure in the data, consistent with the class designations
provided by the authors of the articles, and outline further
applications of this analysis, including identifying
{\textquotedblleft}hot topics{\textquotedblright} by
examining temporal dynamics and tagging abstracts to
illustrate semantic content.",
issn = "0027-8424",
URL = "https://www.pnas.org/content/101/suppl_1/5228",
eprint = "https://www.pnas.org/content/101/suppl_1/5228.full.pdf",
journal = "Proceedings of the National Academy of Sciences"
}
@article{cui-2016-multi-scale,
author = "Zhicheng Cui and Wenlin Chen and Yixin Chen",
title = "Multi-Scale Convolutional Neural Networks for Time Series
Classification",
journal = "CoRR",
volume = "abs/1603.06995",
year = 2016,
url = "http://arxiv.org/abs/1603.06995",
archivePrefix= "arXiv",
eprint = "1603.06995",
timestamp = "Mon, 13 Aug 2018 16:47:13 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/CuiCC16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{liu-2019-roberta,
author = "Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and
Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and
Luke Zettlemoyer and Veselin Stoyanov",
title = "RoBERTa: {A} Robustly Optimized {BERT} Pretraining Approach",
journal = "CoRR",
volume = "abs/1907.11692",
year = 2019,
url = "http://arxiv.org/abs/1907.11692",
archivePrefix= "arXiv",
eprint = "1907.11692",
timestamp = "Thu, 01 Aug 2019 08:59:33 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1907-11692",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{zhao-2012-moodlens,
author = "Zhao, Jichang and Dong, Li and Wu, Junjie and Xu, Ke",
title = "Moodlens: an emoticon-based sentiment analysis system for
chinese tweets",
booktitle = "Proceedings of the 18th ACM SIGKDD international conference
on Knowledge discovery and data mining",
year = 2012,
pages = "1528-1531",
organization = "ACM"
}
@article{hsu-2002-comparison-multi-svm,
title = "A comparison of methods for multiclass support vector
machines",
author = "Hsu, Chih-Wei and Lin, Chih-Jen",
journal = "IEEE transactions on Neural Networks",
volume = 13,
number = 2,
pages = "415-425",
year = 2002,
publisher = "IEEE"
}
@article{conneau-2018-what,
author = "Alexis Conneau and Germ{\'{a}}n Kruszewski and Guillaume
Lample and Lo{\"{\i}}c Barrault and Marco Baroni",
title = "What you can cram into a single vector: Probing sentence
embeddings for linguistic properties",
journal = "CoRR",
volume = "abs/1805.01070",
year = 2018,
url = "http://arxiv.org/abs/1805.01070",
archivePrefix= "arXiv",
eprint = "1805.01070",
timestamp = "Mon, 13 Aug 2018 16:48:39 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1805-01070",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@techreport{ester-1996-dbscan,
author = "Ester, M and Kriegel, HP and Sander, J and Xiaowei, Xu",
institution = "AAAI Press, Menlo Park, CA (United States)",
title = "A density-based algorithm for discovering clusters in large
spatial databases with noise",
year = 1996
}
@article{schubert-2017-dbscan-revisit,
author = "Schubert, Erich and Sander, J``{o}rg and Ester, Martin and
Kriegel, Hans Peter and Xu, Xiaowei",
title = "DBSCAN Revisited, Revisited: Why and How You Should (Still)
Use DBSCAN",
journal = "ACM Trans. Database Syst.",
issue_date = "August 2017",
volume = 42,
number = 3,
month = jul,
year = 2017,
issn = "0362-5915",
pages = "19:1--19:21",
articleno = 19,
numpages = 21,
url = "http://doi.acm.org/10.1145/3068335",
doi = "10.1145/3068335",
acmid = 3068335,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "DBSCAN, density-based clustering, range-search complexity"
}
@inproceedings{ng-2002-dis-vs-gen,
author = "Ng, Andrew Y and Jordan, Michael I",
title = "On discriminative vs. generative classifiers: A comparison of
logistic regression and naive bayes",
booktitle = "Advances in neural information processing systems",
year = 2002,
pages = "841-848"
}
@article{joulin-2016-fasttext-zip,
author = "Armand Joulin and Edouard Grave and Piotr Bojanowski and
Matthijs Douze and Herv{\'{e}} J{\'{e}}gou and Tomas Mikolov",
title = "FastText.zip: Compressing text classification models",
journal = "CoRR",
volume = "abs/1612.03651",
year = 2016,
url = "http://arxiv.org/abs/1612.03651",
archivePrefix= "arXiv",
eprint = "1612.03651",
timestamp = "Mon, 13 Aug 2018 16:48:53 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/JoulinGBDJM16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{weston-2014-tagspace,
title = "{\#}{T}ag{S}pace: Semantic Embeddings from Hashtags",
author = "Weston, Jason and Chopra, Sumit and Adams, Keith",
booktitle = "Proceedings of the 2014 Conference on Empirical Methods in
Natural Language Processing ({EMNLP})",
month = oct,
year = 2014,
address = "Doha, Qatar",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D14-1194",
doi = "10.3115/v1/D14-1194",
pages = "1822-1827"
}
@article{li-2015-compon-enhan,
author = "Yanran Li and Wenjie Li and Fei Sun and Sujian Li",
title = "Component-Enhanced Chinese Character Embeddings",
journal = "CoRR",
volume = "abs/1508.06669",
year = 2015,
url = "http://arxiv.org/abs/1508.06669",
archivePrefix= "arXiv",
eprint = "1508.06669",
timestamp = "Mon, 13 Aug 2018 16:47:49 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/LiLSL15",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{chen-2015-joint-learn,
author = "Chen, Xinxiong and Xu, Lei and Liu, Zhiyuan and Sun, Maosong
and Luan, Huanbo",
title = "Joint Learning of Character and Word Embeddings",
booktitle = "Proceedings of the 24th International Conference on
Artificial Intelligence",
series = "IJCAI'15",
year = 2015,
isbn = "978-1-57735-738-4",
location = "Buenos Aires, Argentina",
pages = "1236-1242",
numpages = 7,
url = "http://dl.acm.org/citation.cfm?id=2832415.2832421",
acmid = 2832421,
publisher = "AAAI Press"
}
@article{kudo-2018-subword-regularization,
author = "Taku Kudo",
title = "Subword Regularization: Improving Neural Network Translation
Models with Multiple Subword Candidates",
journal = "CoRR",
volume = "abs/1804.10959",
year = 2018,
url = "http://arxiv.org/abs/1804.10959",
archivePrefix= "arXiv",
eprint = "1804.10959",
timestamp = "Mon, 13 Aug 2018 16:48:57 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-10959",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{wang-2016-atae-lstm,
author = "Wang, Yequan and Huang, Minlie and Zhao, Li and others",
title = "Attention-based LSTM for aspect-level sentiment
classification",
booktitle = "Proceedings of the 2016 conference on empirical methods in
natural language processing",
year = 2016,
pages = "606-615"
}
@article{tang-2015-td-lstm,
author = "Duyu Tang and Bing Qin and Xiaocheng Feng and Ting Liu",
title = "Target-Dependent Sentiment Classification with Long Short
Term Memory",
journal = "CoRR",
volume = "abs/1512.01100",
year = 2015,
url = "http://arxiv.org/abs/1512.01100",
archivePrefix= "arXiv",
eprint = "1512.01100",
timestamp = "Mon, 13 Aug 2018 16:46:55 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/TangQFL15",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{pang-2018-learn-repres,
author = "Guansong Pang and Longbing Cao and Ling Chen and Huan Liu",
title = "Learning Representations of Ultrahigh-dimensional Data for
Random Distance-based Outlier Detection",
journal = "CoRR",
volume = "abs/1806.04808",
year = 2018,
url = "http://arxiv.org/abs/1806.04808",
archivePrefix= "arXiv",
eprint = "1806.04808",
timestamp = "Mon, 13 Aug 2018 16:46:25 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1806-04808",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{liu-2003-build,
title = "Building text classifiers using positive and unlabeled
examples",
author = "Bing Liu and Yang Dai and Xiaoli Li and Wee Sun Lee and
Philip S. Yu",
journal = "Third IEEE International Conference on Data Mining",
year = 2003,
pages = "179-186"
}
@InProceedings{li-2005-pu-learning,
author = "Li, Xiao-Li and Liu, Bing",
editor = "Gama, Jo{\~a}o and Camacho, Rui and Brazdil, Pavel B. and
Jorge, Al{\'i}pio M{\'a}rio and Torgo, Lu{\'i}s",
title = "Learning from Positive and Unlabeled Examples with Different
Data Distributions",
booktitle = "Machine Learning: ECML 2005",
year = 2005,
publisher = "Springer Berlin Heidelberg",
address = "Berlin, Heidelberg",
pages = "218-229",
abstract = "We study the problem of learning from positive and unlabeled
examples. Although several techniques exist for dealing with
this problem, they all assume that positive examples in the
positive set P and the positive examples in the unlabeled set
U are generated from the same distribution. This assumption
may be violated in practice. For example, one wants to
collect all printer pages from the Web. One can use the
printer pages from one site as the set P of positive pages
and use product pages from another site as U. One wants to
classify the pages in U into printer pages and non-printer
pages. Although printer pages from the two sites have many
similarities, they can also be quite different because
different sites often present similar products in different
styles and have different focuses. In such cases, existing
methods perform poorly. This paper proposes a novel technique
A-EM to deal with the problem. Experiment results with
product page classification demonstrate the effectiveness of
the proposed technique.",
isbn = "978-3-540-31692-3"
}
@inproceedings{liu-2002-partial-super,
author = "Liu, Bing and Lee, Wee Sun and Yu, Philip S. and Li, Xiaoli",
title = "Partially Supervised Classification of Text Documents",
booktitle = "Proceedings of the Nineteenth International Conference on
Machine Learning",
series = "ICML '02",
year = 2002,
isbn = "1-55860-873-7",
pages = "387-394",
numpages = 8,
url = "http://dl.acm.org/citation.cfm?id=645531.656022",
acmid = 656022,
publisher = "Morgan Kaufmann Publishers Inc.",
address = "San Francisco, CA, USA"
}
@inproceedings{wilson-2005-recog-contex,
author = "Wilson, Theresa and Wiebe, Janyce and Hoffmann, Paul",
title = "Recognizing Contextual Polarity in Phrase-level Sentiment
Analysis",
booktitle = "Proceedings of the Conference on Human Language Technology
and Empirical Methods in Natural Language Processing",
series = "HLT '05",
year = 2005,
location = "Vancouver, British Columbia, Canada",
pages = "347-354",
numpages = 8,
url = "https://doi.org/10.3115/1220575.1220619",
doi = "10.3115/1220575.1220619",
acmid = 1220619,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@incollection{liu-2010-sentim,
author = "Bing Liu",
title = "Sentiment analysis and subjectivity",
booktitle = "Handbook of Natural Language Processing, Second
Edition. Taylor and Francis Group, Boca",
year = 2010
}
@Inbook{liu-2012-survey-opinion,
author = "Liu, Bing and Zhang, Lei",
title = "A Survey of Opinion Mining and Sentiment Analysis",
bookTitle = "Mining Text Data",
year = 2012,
publisher = "Springer US",
address = "Boston, MA",
pages = "415-463",
chapter = 1,
abstract = "Sentiment analysis or opinion mining is the computational
study of people's opinions, appraisals, attitudes, and
emotions toward entities, individuals, issues, events, topics
and their attributes. The task is technically challenging and
practically very useful. For example, businesses always want
to find public or consumer opinions about their products and
services. Potential customers also want to know the opinions
of existing users before they use a service or purchase a
product.",
isbn = "978-1-4614-3223-4",
doi = "10.1007/978-1-4614-3223-4_13",
url = "https://doi.org/10.1007/978-1-4614-3223-4_13"
}
@InProceedings{conneau-2018-xnli,
author = "Conneau, Alexis and Rinott, Ruty and Lample, Guillaume and
Williams, Adina and Bowman, Samuel R. and Schwenk, Holger
and Stoyanov, Veselin",
title = "XNLI: Evaluating Cross-lingual Sentence Representations",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
year = 2018,
publisher = "Association for Computational Linguistics",
location = "Brussels, Belgium"
}
@article{lample-2019-xlms,
author = "Guillaume Lample and Alexis Conneau",
title = "Cross-lingual Language Model Pretraining",
journal = "CoRR",
volume = "abs/1901.07291",
year = 2019,
url = "http://arxiv.org/abs/1901.07291",
archivePrefix= "arXiv",
eprint = "1901.07291",
timestamp = "Fri, 01 Feb 2019 13:39:59 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1901-07291",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{schuster-2012-japan-korean,
title = "Japanese and korean voice search",
author = "Schuster, Mike and Nakajima, Kaisuke",
booktitle = "2012 IEEE International Conference on Acoustics, Speech and
Signal Processing (ICASSP)",
pages = "5149-5152",
year = 2012,
organization = "IEEE"
}
@article{shaw-2018-self-atten,
author = "Peter Shaw and Jakob Uszkoreit and Ashish Vaswani",
title = "Self-Attention with Relative Position Representations",
journal = "CoRR",
volume = "abs/1803.02155",
year = 2018,
url = "http://arxiv.org/abs/1803.02155",
archivePrefix= "arXiv",
eprint = "1803.02155",
timestamp = "Mon, 13 Aug 2018 16:46:37 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1803-02155",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{al-rfou-2018-charac-level,
author = "Rami Al{-}Rfou and Dokook Choe and Noah Constant and Mandy
Guo and Llion Jones",
title = "Character-Level Language Modeling with Deeper Self-Attention",
journal = "CoRR",
volume = "abs/1808.04444",
year = 2018,
url = "http://arxiv.org/abs/1808.04444",
archivePrefix= "arXiv",
eprint = "1808.04444",
timestamp = "Sun, 02 Sep 2018 15:01:55 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1808-04444",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{fan-2017-matchzoo,
author = "Yixing Fan and Liang Pang and Jianpeng Hou and Jiafeng Guo
and Yanyan Lan and Xueqi Cheng",
title = "MatchZoo: {A} Toolkit for Deep Text Matching",
journal = "CoRR",
volume = "abs/1707.07270",
year = 2017,
url = "http://arxiv.org/abs/1707.07270",
archivePrefix= "arXiv",
eprint = "1707.07270",
timestamp = "Mon, 13 Aug 2018 16:48:14 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/FanPHGLC17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{wang-2018-ripplenet,
author = "Hongwei Wang and Fuzheng Zhang and Jialin Wang and Miao Zhao
and Wenjie Li and Xing Xie and Minyi Guo",
title = "Ripple Network: Propagating User Preferences on the Knowledge
Graph for Recommender Systems",
journal = "CoRR",
volume = "abs/1803.03467",
year = 2018,
url = "http://arxiv.org/abs/1803.03467",
archivePrefix= "arXiv",
eprint = "1803.03467",
timestamp = "Mon, 13 Aug 2018 16:48:19 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1803-03467",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{song-2019-mass,
author = "Kaitao Song and Xu Tan and Tao Qin and Jianfeng Lu and
Tie{-}Yan Liu",
title = "{MASS:} Masked Sequence to Sequence Pre-training for Language
Generation",
journal = "CoRR",
volume = "abs/1905.02450",
year = 2019,
url = "http://arxiv.org/abs/1905.02450",
archivePrefix= "arXiv",
eprint = "1905.02450",
timestamp = "Mon, 27 May 2019 13:15:00 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1905-02450",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{huang-2013-dssm,
title = "Learning deep structured semantic models for web search using
clickthrough data",
author = "Huang, Po-Sen and He, Xiaodong and Gao, Jianfeng and Deng, Li
and Acero, Alex and Heck, Larry",
booktitle = "Proceedings of the 22nd ACM international conference on
Information \& Knowledge Management",
pages = "2333-2338",
year = 2013,
organization = "ACM"
}
@InProceedings{shen-2014-cnn-dssm,
author = "Shen, Yelong and He, Xiaodong and Gao, Jianfeng and Deng, Li
and Mesnil, Gregoire",
title = "A Latent Semantic Model with Convolutional-Pooling Structure
for Information Retrieval",
booktitle = "CIKM",
year = 2014,
month = "November",
abstract = "In this paper, we propose a new latent semantic model that
incorporates a convolutional-pooling structure over word
sequences to learn low-dimensional, semantic vector
representations for search queries and Web documents. In
order to capture the rich contextual structures in a query or
a document, we start with each word within a temporal context
window in a word sequence to directly capture contextual
features at the word n-gram level. Next, the salient word
n-gram features in the word sequence are discovered by the
model and are then aggregated to form a sentence-level
feature vector. Finally, a non-linear transformation is
applied to extract high-level semantic information to
generate a continuous vector representation for the full text
string. The proposed convolutional latent semantic model
(CLSM) is trained on clickthrough data and is evaluated on a
Web document ranking task using a large-scale, real-world
data set. Results show that the proposed model effectively
captures salient semantic information in queries and
documents for the task while significantly outperforming
previous state-of-the-art semantic models.",
url =
"https://www.microsoft.com/en-us/research/publication/a-latent-semantic-model-with-convolutional-pooling-structure-for-information-retrieval/"
}
@article{palangi-2014-lstm-dssm,
title = "Semantic modelling with long-short-term memory for
information retrieval",
author = "Palangi, Hamid and Deng, Li and Shen, Yelong and Gao,
Jianfeng and He, Xiaodong and Chen, Jianshu and Song, Xinying
and Ward, R",
journal = "arXiv preprint arXiv:1412.6629",
year = 2014
}
@inproceedings{elkahky-2015-mv-dssm,
title = "A multi-view deep learning approach for cross domain user
modeling in recommendation systems",
author = "Elkahky, Ali Mamdouh and Song, Yang and He, Xiaodong",
booktitle = "Proceedings of the 24th International Conference on World
Wide Web",
pages = "278-288",
year = 2015,
organization = "International World Wide Web Conferences Steering Committee"
}
@inproceedings{qiu-2015-cntn,
title = "Convolutional neural tensor network architecture for
community-based question answering",
author = "Qiu, Xipeng and Huang, Xuanjing",
booktitle = "Twenty-Fourth International Joint Conference on Artificial
Intelligence",
year = 2015
}
@article{庞亮-2017-深度文本匹配综述,
title = "深度文本匹配综述",
author = "庞亮 and 兰艳艳 and 徐君 and 郭嘉丰 and 万圣贤 and 程学旗",
journal = "计算机学报",
volume = 40,
number = 4,
pages = "985-1003",
year = 2017
}
@INPROCEEDINGS{chopra-2005-siamese,
author = "S. {Chopra} and R. {Hadsell} and Y. {LeCun}",
booktitle = "2005 IEEE Computer Society Conference on Computer Vision and
Pattern Recognition (CVPR'05)",
title = "Learning a similarity metric discriminatively, with
application to face verification",
year = 2005,
volume = 1,
pages = "539-546 vol. 1",
keywords = "face recognition;learning (artificial
intelligence);similarity metric learning;face
verification;face recognition;L1 norm;semantic distance
approximation;discriminative loss function;geometric
distortion;Character generation;Drives;Robustness;System
testing;Spatial databases;Glass;Artificial neural
networks;Support vector machines;Support vector machine
classification;Face recognition",
doi = "10.1109/CVPR.2005.202",
month = "June"
}
@inproceedings{zhai-2016-deepintent,
author = "Zhai, Shuangfei and Chang, Keng-hao and Zhang, Ruofei and
Zhang, Zhongfei Mark",
title = "Deepintent: Learning attentions for online advertising with
recurrent neural networks",
booktitle = "Proceedings of the 22nd ACM SIGKDD international conference
on knowledge discovery and data mining",
year = 2016,
pages = "1295-1304",
organization = "ACM"
}
@inproceedings{mitra-2017-learn-to-match,
author = "Mitra, Bhaskar and Diaz, Fernando and Craswell, Nick",
title = "Learning to match using local and distributed representations
of text for web search",
booktitle = "Proceedings of the 26th International Conference on World
Wide Web",
year = 2017,
pages = "1291-1299",
organization = "International World Wide Web Conferences Steering Committee"
}
@inproceedings{tan-2016-improve,
title = "Improved representation learning for question answer
matching",
author = "Tan, Ming and Dos Santos, Cicero and Xiang, Bing and Zhou,
Bowen",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
pages = "464-473",
year = 2016
}
@incollection{hu-2014-arc-i,
title = "Convolutional Neural Network Architectures for Matching
Natural Language Sentences",
author = "Hu, Baotian and Lu, Zhengdong and Li, Hang and Chen, Qingcai",
booktitle = "Advances in Neural Information Processing Systems 27",
editor = "Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence
and K. Q. Weinberger",
pages = "2042-2050",
year = 2014,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/5550-convolutional-neural-network-architectures-for-matching-natural-language-sentences.pdf"
}
@inproceedings{yin-2015-multigrancnn,
title = "Multigrancnn: An architecture for general matching of text
chunks on multiple levels of granularity",
author = "Yin, Wenpeng and Sch{\"u}tze, Hinrich",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics and the 7th International Joint
Conference on Natural Language Processing (Volume 1: Long
Papers)",
pages = "63-73",
year = 2015
}
@article{pang-2016-matchpyramid,
author = "Liang Pang and Yanyan Lan and Jiafeng Guo and Jun Xu and
Shengxian Wan and Xueqi Cheng",
title = "Text Matching as Image Recognition",
journal = "CoRR",
volume = "abs/1602.06359",
year = 2016,
url = "http://arxiv.org/abs/1602.06359",
archivePrefix= "arXiv",
eprint = "1602.06359",
timestamp = "Mon, 13 Aug 2018 16:47:25 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/PangLGXWC16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@incollection{lu-2013-deepmatch,
title = "A Deep Architecture for Matching Short Texts",
author = "Lu, Zhengdong and Li, Hang",
booktitle = "Advances in Neural Information Processing Systems 26",
editor = "C. J. C. Burges and L. Bottou and M. Welling and
Z. Ghahramani and K. Q. Weinberger",
pages = "1367-1375",
year = 2013,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/5019-a-deep-architecture-for-matching-short-texts.pdf"
}
@inproceedings{zhang-2017-aicnn,
title = "Attentive interactive neural networks for answer selection in
community question answering",
author = "Zhang, Xiaodong and Li, Sujian and Sha, Lei and Wang,
Houfeng",
booktitle = "Thirty-First AAAI Conference on Artificial Intelligence",
year = 2017
}
@inproceedings{sha-2018-mvfnn,
title = "A multi-view fusion neural network for answer selection",
author = "Sha, Lei and Zhang, Xiaodong and Qian, Feng and Chang, Baobao
and Sui, Zhifang",
booktitle = "Thirty-Second AAAI Conference on Artificial Intelligence",
year = 2018
}
@inproceedings{zhang-2018-dqi,
title = "Duplicate question identification by integrating framenet
with neural networks",
author = "Zhang, Xiaodong and Sun, Xu and Wang, Houfeng",
booktitle = "Thirty-Second AAAI Conference on Artificial Intelligence",
year = 2018
}
@article{wan-2016-match-srnn,
author = "Shengxian Wan and Yanyan Lan and Jun Xu and Jiafeng Guo and
Liang Pang and Xueqi Cheng",
title = "Match-SRNN: Modeling the Recursive Matching Structure with
Spatial {RNN}",
journal = "CoRR",
volume = "abs/1604.04378",
year = 2016,
url = "http://arxiv.org/abs/1604.04378",
archivePrefix= "arXiv",
eprint = "1604.04378",
timestamp = "Mon, 13 Aug 2018 16:47:12 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/WanLXGPC16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{tan-2015-qa-lstm,
author = "Ming Tan and Bing Xiang and Bowen Zhou",
title = "LSTM-based Deep Learning Models for non-factoid answer
selection",
journal = "CoRR",
volume = "abs/1511.04108",
year = 2015,
url = "http://arxiv.org/abs/1511.04108",
archivePrefix= "arXiv",
eprint = "1511.04108",
timestamp = "Mon, 13 Aug 2018 16:46:33 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/TanXZ15",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{xiong-2017-k-nrm,
author = "Chenyan Xiong and Zhuyun Dai and Jamie Callan and Zhiyuan Liu
and Russell Power",
title = "End-to-End Neural Ad-hoc Ranking with Kernel Pooling",
journal = "CoRR",
volume = "abs/1706.06613",
year = 2017,
url = "http://arxiv.org/abs/1706.06613",
archivePrefix= "arXiv",
eprint = "1706.06613",
timestamp = "Mon, 13 Aug 2018 16:49:10 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/XiongDCLP17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{chen-2018-mix,
title = "Mix: Multi-channel information crossing for text matching",
author = "Chen, Haolan and Han, Fred X and Niu, Di and Liu, Dong and
Lai, Kunfeng and Wu, Chenglin and Xu, Yu",
booktitle = "Proceedings of the 24th ACM SIGKDD International Conference
on Knowledge Discovery \& Data Mining",
pages = "110-119",
year = 2018,
organization = "ACM"
}
@inproceedings{zhang-2003-quest-class,
author = "Zhang, Dell and Lee, Wee Sun",
title = "Question Classification Using Support Vector Machines",
booktitle = "Proceedings of the 26th Annual International ACM SIGIR
Conference on Research and Development in Information
Retrieval",
series = "SIGIR '03",
year = 2003,
isbn = "1-58113-646-3",
location = "Toronto, Canada",
pages = "26-32",
numpages = 7,
url = "http://doi.acm.org/10.1145/860435.860443",
doi = "10.1145/860435.860443",
acmid = 860443,
publisher = "ACM",
address = "New York, NY, USA",
keywords = "kernel method, machine learning, question answering, support
vector machine, text classification"
}
@inproceedings{li-2002-learn-quest-class,
author = "Li, Xin and Roth, Dan",
title = "Learning Question Classifiers",
booktitle = "Proceedings of the 19th International Conference on
Computational Linguistics - Volume 1",
series = "COLING '02",
year = 2002,
location = "Taipei, Taiwan",
pages = "1-7",
numpages = 7,
url = "https://doi.org/10.3115/1072228.1072378",
doi = "10.3115/1072228.1072378",
acmid = 1072378,
publisher = "Association for Computational Linguistics",
address = "Stroudsburg, PA, USA"
}
@inproceedings{cui-2004-unsup,
title = "Unsupervised learning of soft patterns for generating
definitions from online news",
author = "Cui, Hang and Kan, Min-Yen and Chua, Tat-Seng",
booktitle = "Proceedings of the 13th international conference on World
Wide Web",
pages = "90-99",
year = 2004,
organization = "ACM"
}
@inproceedings{unger-2012-template-based,
title = "Template-based question answering over RDF data",
author = "Unger, Christina and B{\"u}hmann, Lorenz and Lehmann, Jens
and Ngonga Ngomo, Axel-Cyrille and Gerber, Daniel and
Cimiano, Philipp",
booktitle = "Proceedings of the 21st international conference on World
Wide Web",
pages = "639-648",
year = 2012,
organization = "ACM"
}
@inproceedings{abujabal-2017-autom-templ,
author = "Abujabal, Abdalghani and Yahya, Mohamed and Riedewald, Mirek
and Weikum, Gerhard",
title = "Automated Template Generation for Question Answering over
Knowledge Graphs",
booktitle = "Proceedings of the 26th International Conference on World
Wide Web",
series = "WWW '17",
year = 2017,
isbn = "978-1-4503-4913-0",
location = "Perth, Australia",
pages = "1191-1200",
numpages = 10,
url = "https://doi.org/10.1145/3038912.3052583",
doi = "10.1145/3038912.3052583",
acmid = 3052583,
publisher = "International World Wide Web Conferences Steering Committee",
address = "Republic and Canton of Geneva, Switzerland",
keywords = "knowledge graphs, question answering, semantic parsing"
}
@inproceedings{riedel-2010-model,
title = "Modeling relations and their mentions without labeled text",
author = "Riedel, Sebastian and Yao, Limin and McCallum, Andrew",
booktitle = "Joint European Conference on Machine Learning and Knowledge
Discovery in Databases",
pages = "148-163",
year = 2010,
organization = "Springer"
}
@inproceedings{liu-2017-soft-label,
title = "A soft-label method for noise-tolerant distantly supervised
relation extraction",
author = "Liu, Tianyu and Wang, Kexiang and Chang, Baobao and Sui,
Zhifang",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
pages = "1790-1795",
year = 2017
}
@article{feng-2018-reinf-learn,
author = "Jun Feng and Minlie Huang and Li Zhao and Yang Yang and
Xiaoyan Zhu",
title = "Reinforcement Learning for Relation Classification from Noisy
Data",
journal = "CoRR",
volume = "abs/1808.08013",
year = 2018,
url = "http://arxiv.org/abs/1808.08013",
archivePrefix= "arXiv",
eprint = "1808.08013",
timestamp = "Tue, 03 Sep 2019 20:11:19 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1808-08013",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@phdthesis{zhang-2015-deepdive,
title = "DeepDive: A Data Management System for Automatic Knowledge
Base Construction",
author = "Zhang, Ce",
year = 2015,
school = "UW-Madison"
}
@inproceedings{yao-2014-infor-extrac-struc-data,
title = "Information Extraction over Structured Data: Question
Answering with {F}reebase",
author = "Yao, Xuchen and Van Durme, Benjamin",
booktitle = "Proceedings of the 52nd Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jun,
year = 2014,
address = "Baltimore, Maryland",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P14-1090",
doi = "10.3115/v1/P14-1090",
pages = "956-966"
}
@article{bordes-2014-quest-answer-subgr-embed,
author = "Antoine Bordes and Sumit Chopra and Jason Weston",
title = "Question Answering with Subgraph Embeddings",
journal = "CoRR",
volume = "abs/1406.3676",
year = 2014,
url = "http://arxiv.org/abs/1406.3676",
archivePrefix= "arXiv",
eprint = "1406.3676",
timestamp = "Mon, 13 Aug 2018 16:46:20 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/BordesCW14",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{dong-2015-quest,
title = "Question answering over freebase with multi-column
convolutional neural networks",
author = "Dong, Li and Wei, Furu and Zhou, Ming and Xu, Ke",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics and the 7th International Joint
Conference on Natural Language Processing (Volume 1: Long
Papers)",
pages = "260-269",
year = 2015
}
@inproceedings{yih-2015-query-graph,
title = "Semantic Parsing via Staged Query Graph Generation: Question
Answering with Knowledge Base",
author = "Yih, Wen-tau and Chang, Ming-Wei and He, Xiaodong and Gao,
Jianfeng",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics and the 7th International Joint
Conference on Natural Language Processing (Volume 1: Long
Papers)",
pages = "1321-1331",
year = 2015
}
@article{chen-2017-drqa,
author = "Danqi Chen and Adam Fisch and Jason Weston and Antoine
Bordes",
title = "Reading Wikipedia to Answer Open-Domain Questions",
journal = "CoRR",
volume = "abs/1704.00051",
year = 2017,
url = "http://arxiv.org/abs/1704.00051",
archivePrefix= "arXiv",
eprint = "1704.00051",
timestamp = "Mon, 13 Aug 2018 16:47:17 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ChenFWB17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{miwa-2014-model,
title = "Modeling joint entity and relation extraction with table
representation",
author = "Miwa, Makoto and Sasaki, Yutaka",
booktitle = "Proceedings of the 2014 Conference on Empirical Methods in
Natural Language Processing (EMNLP)",
pages = "1858-1869",
year = 2014
}
@article{zheng-2017-joint-extrac,
author = "Suncong Zheng and Feng Wang and Hongyun Bao and Yuexing Hao
and Peng Zhou and Bo Xu",
title = "Joint Extraction of Entities and Relations Based on a Novel
Tagging Scheme",
journal = "CoRR",
volume = "abs/1706.05075",
year = 2017,
url = "http://arxiv.org/abs/1706.05075",
archivePrefix= "arXiv",
eprint = "1706.05075",
timestamp = "Tue, 25 Jun 2019 17:27:14 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ZhengWBHZX17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{luo-2017-learn-noise,
author = "Bingfeng Luo and Yansong Feng and Zheng Wang and Zhanxing Zhu
and Songfang Huang and Rui Yan and Dongyan Zhao",
title = "Learning with Noise: Enhance Distantly Supervised Relation
Extraction with Dynamic Transition Matrix",
journal = "CoRR",
volume = "abs/1705.03995",
year = 2017,
url = "http://arxiv.org/abs/1705.03995",
archivePrefix= "arXiv",
eprint = "1705.03995",
timestamp = "Sat, 31 Aug 2019 16:23:05 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/LuoFWZHYZ17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{feng-2017-effec-deep,
author = "Xiaocheng Feng and Jiang Guo and Bing Qin and Ting Liu and
Yongjie Liu",
title = "Effective Deep Memory Networks for Distant Supervised
Relation Extraction",
booktitle = "Proceedings of the Twenty-Sixth International Joint
Conference on Artificial Intelligence, {IJCAI-17}",
pages = "4002-4008",
year = 2017,
doi = "10.24963/ijcai.2017/559",
url = "https://doi.org/10.24963/ijcai.2017/559"
}
@inproceedings{bordes-2013-transe,
title = "Translating embeddings for modeling multi-relational data",
author = "Bordes, Antoine and Usunier, Nicolas and Garcia-Duran,
Alberto and Weston, Jason and Yakhnenko, Oksana",
booktitle = "Advances in neural information processing systems",
pages = "2787-2795",
year = 2013
}
@inproceedings{wang-2014-transh,
title = "Knowledge graph embedding by translating on hyperplanes",
author = "Wang, Zhen and Zhang, Jianwen and Feng, Jianlin and Chen,
Zheng",
booktitle = "Twenty-Eighth AAAI conference on artificial intelligence",
year = 2014
}
@inproceedings{lin-2015-transr,
title = "Learning entity and relation embeddings for knowledge graph
completion",
author = "Lin, Yankai and Liu, Zhiyuan and Sun, Maosong and Liu, Yang
and Zhu, Xuan",
booktitle = "Proceedings of the Twenty-Ninth AAAI Conference on Artificial
Intelligence",
pages = "2181-2187",
year = 2015,
organization = "AAAI Press"
}
@inproceedings{ji-2015-transd,
title = "Knowledge graph embedding via dynamic mapping matrix",
author = "Ji, Guoliang and He, Shizhu and Xu, Liheng and Liu, Kang and
Zhao, Jun",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics and the 7th International Joint
Conference on Natural Language Processing (Volume 1: Long
Papers)",
pages = "687-696",
year = 2015
}
@article{xiao-2015-transa,
author = "Han Xiao and Minlie Huang and Yu Hao and Xiaoyan Zhu",
title = "TransA: An Adaptive Approach for Knowledge Graph Embedding",
journal = "CoRR",
volume = "abs/1509.05490",
year = 2015,
url = "http://arxiv.org/abs/1509.05490",
archivePrefix= "arXiv",
eprint = "1509.05490",
timestamp = "Tue, 03 Sep 2019 20:11:19 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/0005HHZ15a",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{ji-2016-transparse,
title = "Knowledge graph completion with adaptive sparse transfer
matrix",
author = "Ji, Guoliang and Liu, Kang and He, Shizhu and Zhao, Jun",
booktitle = "Thirtieth AAAI Conference on Artificial Intelligence",
year = 2016
}
@inproceedings{xiao-2016-transg,
title = "TransG: A generative model for knowledge graph embedding",
author = "Xiao, Han and Huang, Minlie and Zhu, Xiaoyan",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
volume = 1,
pages = "2316-2325",
year = 2016
}
@inproceedings{he-2015-kg2e,
title = "Learning to represent knowledge graphs with gaussian
embedding",
author = "He, Shizhu and Liu, Kang and Ji, Guoliang and Zhao, Jun",
booktitle = "Proceedings of the 24th ACM International on Conference on
Information and Knowledge Management",
pages = "623-632",
year = 2015,
organization = "ACM"
}
@inproceedings{jia-2016-transa+,
title = "Locally adaptive translation for knowledge graph embedding",
author = "Jia, Yantao and Wang, Yuanzhuo and Lin, Hailun and Jin,
Xiaolong and Cheng, Xueqi",
booktitle = "Thirtieth AAAI conference on artificial intelligence",
year = 2016
}
@inproceedings{shi-2017-proje,
title = "ProjE: Embedding projection for knowledge graph completion",
author = "Shi, Baoxu and Weninger, Tim",
booktitle = "Thirty-First AAAI Conference on Artificial Intelligence",
year = 2017
}
@inproceedings{krompass-2015-type,
title = "Type-constrained representation learning in knowledge graphs",
author = "Krompa{\ss}, Denis and Baier, Stephan and Tresp, Volker",
booktitle = "International semantic web conference",
pages = "640-655",
year = 2015,
organization = "Springer"
}
@inproceedings{niu-2011-zhishi,
title = "Zhishi.me-weaving chinese linking open data",
author = "Niu, Xing and Sun, Xinruo and Wang, Haofen and Rong, Shu and
Qi, Guilin and Yu, Yong",
booktitle = "International Semantic Web Conference",
pages = "205-220",
year = 2011,
organization = "Springer"
}
@incollection{bizer-2011-linked,
title = "Linked data: The story so far",
author = "Bizer, Christian and Heath, Tom and Berners-Lee, Tim",
booktitle = "Semantic services, interoperability and web applications:
emerging concepts",
pages = "205-227",
year = 2011,
publisher = "IGI Global"
}
@inproceedings{liu-2017-unsup,
title = "Unsupervised image-to-image translation networks",
author = "Liu, Ming-Yu and Breuel, Thomas and Kautz, Jan",
booktitle = "Advances in neural information processing systems",
pages = "700-708",
year = 2017
}
@inproceedings{cao-2018-cw2vec,
title = "cw2vec: Learning chinese word embeddings with stroke n-gram
information",
author = "Cao, Shaosheng and Lu, Wei and Zhou, Jun and Li, Xiaolong",
booktitle = "Thirty-Second AAAI Conference on Artificial Intelligence",
year = 2018
}
@ARTICLE{yu-2015-multi-scale,
author = "{Yu}, Fisher and {Koltun}, Vladlen",
title = "{Multi-Scale Context Aggregation by Dilated Convolutions}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition",
year = 2015,
month = "Nov",
eid = "arXiv:1511.07122",
pages = "arXiv:1511.07122",
archivePrefix= "arXiv",
eprint = "1511.07122",
primaryClass = "cs.CV",
adsurl = "https://ui.adsabs.harvard.edu/abs/2015arXiv151107122Y",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{chiu-2016-lstm-cnn,
title = "Named Entity Recognition with Bidirectional {LSTM}-{CNN}s",
author = "Chiu, Jason P.C. and Nichols, Eric",
journal = "Transactions of the Association for Computational
Linguistics",
volume = 4,
year = 2016,
url = "https://www.aclweb.org/anthology/Q16-1026",
doi = "10.1162/tacl_a_00104",
pages = "357-370",
abstract = "Named entity recognition is a challenging task that has
traditionally required large amounts of knowledge in the form
of feature engineering and lexicons to achieve high
performance. In this paper, we present a novel neural network
architecture that automatically detects word- and
character-level features using a hybrid bidirectional LSTM
and CNN architecture, eliminating the need for most feature
engineering. We also propose a novel method of encoding
partial lexicon matches in neural networks and compare it to
existing approaches. Extensive evaluation shows that, given
only tokenized text and publicly available word embeddings,
our system is competitive on the CoNLL-2003 dataset and
surpasses the previously reported state of the art
performance on the OntoNotes 5.0 dataset by 2.13 F1
points. By using two lexicons constructed from
publicly-available sources, we establish new state of the art
performance with an F1 score of 91.62 on CoNLL-2003 and 86.28
on OntoNotes, surpassing systems that employ heavy feature
engineering, proprietary lexicons, and rich entity linking
information."
}
@inproceedings{zhang-2018-lattice-lstm,
title = "{C}hinese {NER} Using Lattice {LSTM}",
author = "Zhang, Yue and Yang, Jie",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1144",
doi = "10.18653/v1/P18-1144",
pages = "1554-1564",
abstract = "We investigate a lattice-structured LSTM model for Chinese
NER, which encodes a sequence of input characters as well as
all potential words that match a lexicon. Compared with
character-based methods, our model explicitly leverages word
and word sequence information. Compared with word-based
methods, lattice LSTM does not suffer from segmentation
errors. Gated recurrent cells allow our model to choose the
most relevant characters and words from a sentence for better
NER results. Experiments on various datasets show that
lattice LSTM outperforms both word-based and character-based
LSTM baselines, achieving the best results."
}
@article{shang-2018-autoner,
author = "Jingbo Shang and Liyuan Liu and Xiang Ren and Xiaotao Gu and
Teng Ren and Jiawei Han",
title = "Learning Named Entity Tagger using Domain-Specific
Dictionary",
journal = "CoRR",
volume = "abs/1809.03599",
year = 2018,
url = "http://arxiv.org/abs/1809.03599",
archivePrefix= "arXiv",
eprint = "1809.03599",
timestamp = "Fri, 05 Oct 2018 11:34:52 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1809-03599",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{yadav-2018-survey-ner,
title = "A Survey on Recent Advances in Named Entity Recognition from
Deep Learning models",
author = "Yadav, Vikas and Bethard, Steven",
booktitle = "Proceedings of the 27th International Conference on
Computational Linguistics",
month = aug,
year = 2018,
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1182",
pages = "2145-2158",
abstract = "Named Entity Recognition (NER) is a key component in NLP
systems for question answering, information retrieval,
relation extraction, etc. NER systems have been studied and
developed widely for decades, but accurate systems using deep
neural networks (NN) have only been introduced in the last
few years. We present a comprehensive survey of deep neural
network architectures for NER, and contrast them with
previous approaches to NER based on feature engineering and
other supervised or semi-supervised learning algorithms. Our
results highlight the improvements achieved by neural
networks, and show how incorporating some of the lessons
learned from past work on feature-based NER systems can yield
further improvements."
}
@ARTICLE{li-2016-webqa,
author = "{Li}, Peng and {Li}, Wei and {He}, Zhengyan and {Wang},
Xuguang and {Cao}, Ying and {Zhou}, Jie and {Xu}, Wei",
title = "{Dataset and Neural Recurrent Sequence Labeling Model for
Open-Domain Factoid Question Answering}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Neural and Evolutionary
Computing",
year = 2016,
month = "Jul",
eid = "arXiv:1607.06275",
pages = "arXiv:1607.06275",
archivePrefix= "arXiv",
eprint = "1607.06275",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160706275L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{wang-2003-risk-score,
author = "Wang, Thomas J. and Massaro, Joseph M. and Levy, Daniel and
Vasan, Ramachandran S. and Wolf, Philip A. and D'Agostino,
Ralph B. and Larson, Martin G. and Kannel, William B. and
Benjamin, Emelia J.",
title = "{A Risk Score for Predicting Stroke or Death in Individuals
With New-Onset Atrial Fibrillation in the CommunityThe
Framingham Heart Study}",
journal = "JAMA",
volume = 290,
number = 8,
pages = "1049-1056",
year = 2003,
month = 08,
abstract = "{ContextPrior risk stratification schemes for atrial
fibrillation (AF) have been based on randomized trial cohorts
or Medicare administrative databases, have included patients
with established AF, and have focused on stroke as the
principal outcome.ObjectiveTo derive risk scores for stroke
alone and stroke or death in community-based individuals with
new-onset AF.Design, Setting, and ParticipantsProspective,
community-based, observational cohort in Framingham, Mass.
We identified 868 participants with new-onset AF, 705 of whom
were not treated with warfarin at baseline. Risk scores for
stroke (ischemic or hemorrhagic) and stroke or death were
developed with censoring when warfarin initiation occurred
during follow-up. Event rates were examined in low-risk
individuals, as defined by the risk score and 4 previously
published risk schemes.Main Outcome MeasuresStroke and the
combination of stroke or death.ResultsDuring a mean follow-up
of 4.0 years free of warfarin use, stroke alone occurred in
83 participants and stroke or death occurred in 382
participants. A risk score for stroke was derived that
included the following risk predictors: advancing age, female
sex, increasing systolic blood pressure, prior stroke or
transient ischemic attack, and diabetes. With the risk score,
14.3\\% of the cohort had a predicted 5-year stroke rate
≤7.5\\% (average annual rate ≤1.5\\%), and 30.6\\% of the
cohort had a predicted 5-year stroke rate ≤10\\% (average
annual rate ≤2\\%). Actual stroke rates in these low-risk
groups were 1.1 and 1.5 per 100 person-years,
respectively. Previous risk schemes classified 6.4\\% to
17.3\\% of subjects as low risk, with actual stroke rates of
0.9 to 2.3 per 100 person-years. A risk score for stroke or
death is also presented.ConclusionThese risk scores can be
used to estimate the absolute risk of an adverse event in
individuals with AF, which may be helpful in counseling
patients and making treatment decisions.}",
issn = "0098-7484",
doi = "10.1001/jama.290.8.1049",
url = "https://doi.org/10.1001/jama.290.8.1049",
eprint =
"https://jamanetwork.com/journals/jama/articlepdf/197176/joc30626.pdf"
}
@inproceedings{khosla-2010-integrated,
title = "An integrated machine learning approach to stroke prediction",
author = "Khosla, Aditya and Cao, Yu and Lin, Cliff Chiung-Yu and Chiu,
Hsu-Kuang and Hu, Junling and Lee, Honglak",
booktitle = "Proceedings of the 16th ACM SIGKDD international conference
on Knowledge discovery and data mining",
pages = "183-192",
year = 2010,
organization = "ACM"
}
@inproceedings{cheng-2016-risk,
title = "Risk prediction with electronic health records: A deep
learning approach",
author = "Cheng, Yu and Wang, Fei and Zhang, Ping and Hu, Jianying",
booktitle = "Proceedings of the 2016 SIAM International Conference on Data
Mining",
pages = "432-440",
year = 2016,
organization = "SIAM"
}
@article{choi-2016-using,
title = "Using recurrent neural network models for early detection of
heart failure onset",
author = "Choi, Edward and Schuetz, Andy and Stewart, Walter F and Sun,
Jimeng",
journal = "Journal of the American Medical Informatics Association",
volume = 24,
number = 2,
pages = "361-370",
year = 2016,
publisher = "Oxford University Press"
}
@article{rajkomar-2018-scalable,
title = "Scalable and accurate deep learning with electronic health
records",
author = "Rajkomar, Alvin and Oren, Eyal and Chen, Kai and Dai, Andrew
M and Hajaj, Nissan and Hardt, Michaela and Liu, Peter J and
Liu, Xiaobing and Marcus, Jake and Sun, Mimi and others",
journal = "NPJ Digital Medicine",
volume = 1,
number = 1,
pages = 18,
year = 2018,
publisher = "Nature Publishing Group"
}
@article{shickel-2017-deep-ehr,
author = "Benjamin Shickel and Patrick Tighe and Azra Bihorac and
Parisa Rashidi",
title = "Deep {EHR:} {A} Survey of Recent Advances on Deep Learning
Techniques for Electronic Health Record {(EHR)} Analysis",
journal = "CoRR",
volume = "abs/1706.03446",
year = 2017,
url = "http://arxiv.org/abs/1706.03446",
archivePrefix= "arXiv",
eprint = "1706.03446",
timestamp = "Mon, 13 Aug 2018 16:46:19 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ShickelTBR17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{chen-2016-enhan-combin,
author = "Qian Chen and Xiaodan Zhu and Zhen{-}Hua Ling and Si Wei and
Hui Jiang",
title = "Enhancing and Combining Sequential and Tree {LSTM} for
Natural Language Inference",
journal = "CoRR",
volume = "abs/1609.06038",
year = 2016,
url = "http://arxiv.org/abs/1609.06038",
archivePrefix= "arXiv",
eprint = "1609.06038",
timestamp = "Mon, 13 Aug 2018 16:48:17 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ChenZLWJ16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{neculoiu-2016-learn-text,
title = "Learning Text Similarity with {S}iamese Recurrent Networks",
author = "Neculoiu, Paul and Versteegh, Maarten and Rotaru, Mihai",
booktitle = "Proceedings of the 1st Workshop on Representation Learning
for {NLP}",
month = aug,
year = 2016,
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W16-1617",
doi = "10.18653/v1/W16-1617",
pages = "148-157"
}
@inproceedings{wang-2018-learn-ask,
title = "Learning to Ask Questions in Open-domain Conversational
Systems with Typed Decoders",
author = "Wang, Yansen and Liu, Chenyi and Huang, Minlie and Nie,
Liqiang",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1204",
doi = "10.18653/v1/P18-1204",
pages = "2193-2203",
abstract = "Asking good questions in open-domain conversational systems
is quite significant but rather untouched. This task,
substantially different from traditional question generation,
requires to question not only with various patterns but also
on diverse and relevant topics. We observe that a good
question is a natural composition of interrogatives, topic
words, and ordinary words. Interrogatives lexicalize the
pattern of questioning, topic words address the key
information for topic transition in dialogue, and ordinary
words play syntactical and grammatical roles in making a
natural sentence. We devise two typed decoders (soft typed
decoder and hard typed decoder) in which a type distribution
over the three types is estimated and the type distribution
is used to modulate the final generation
distribution. Extensive experiments show that the typed
decoders outperform state-of-the-art baselines and can
generate more meaningful questions."
}
@article{seo-2016-bidaf,
author = "Min Joon Seo and Aniruddha Kembhavi and Ali Farhadi and
Hannaneh Hajishirzi",
title = "Bidirectional Attention Flow for Machine Comprehension",
journal = "CoRR",
volume = "abs/1611.01603",
year = 2016,
url = "http://arxiv.org/abs/1611.01603",
archivePrefix= "arXiv",
eprint = "1611.01603",
timestamp = "Mon, 13 Aug 2018 16:46:34 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/SeoKFH16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{yu-2018-qanet,
author = "Adams Wei Yu and David Dohan and Minh{-}Thang Luong and Rui
Zhao and Kai Chen and Mohammad Norouzi and Quoc V. Le",
title = "QANet: Combining Local Convolution with Global Self-Attention
for Reading Comprehension",
journal = "CoRR",
volume = "abs/1804.09541",
year = 2018,
url = "http://arxiv.org/abs/1804.09541",
archivePrefix= "arXiv",
eprint = "1804.09541",
timestamp = "Mon, 13 Aug 2018 16:48:18 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-09541",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{ture-2017-no-need-pay-atten,
title = "No Need to Pay Attention: Simple Recurrent Neural Networks
Work!",
author = "Ture, Ferhan and Jojic, Oliver",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1307",
doi = "10.18653/v1/D17-1307",
pages = "2866-2872",
abstract = "First-order factoid question answering assumes that the
question can be answered by a single fact in a knowledge base
(KB). While this does not seem like a challenging task, many
recent attempts that apply either complex linguistic
reasoning or deep neural networks achieve 65{\%}{--}76{\%}
accuracy on benchmark sets. Our approach formulates the task
as two machine learning problems: detecting the entities in
the question, and classifying the question as one of the
relation types in the KB. We train a recurrent neural network
to solve each problem. On the SimpleQuestions dataset, our
approach yields substantial improvements over previously
published results {---} even neural networks based on much
more complex architectures. The simplicity of our approach
also has practical advantages, such as efficiency and
modularity, that are valuable especially in an industry
setting. In fact, we present a preliminary analysis of the
performance of our model on real queries from Comcast{'}s X1
entertainment platform with millions of users every day."
}
@inproceedings{yu-2017-improv-neural,
title = "Improved Neural Relation Detection for Knowledge Base
Question Answering",
author = "Yu, Mo and Yin, Wenpeng and Hasan, Kazi Saidul and dos
Santos, Cicero and Xiang, Bing and Zhou, Bowen",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2017,
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1053",
doi = "10.18653/v1/P17-1053",
pages = "571-581",
abstract = "Relation detection is a core component of many NLP
applications including Knowledge Base Question Answering
(KBQA). In this paper, we propose a hierarchical recurrent
neural network enhanced by residual learning which detects KB
relations given an input question. Our method uses deep
residual bidirectional LSTMs to compare questions and
relation names via different levels of
abstraction. Additionally, we propose a simple KBQA system
that integrates entity linking and our proposed relation
detector to make the two components enhance each other. Our
experimental results show that our approach not only achieves
outstanding relation detection performance, but more
importantly, it helps our KBQA system achieve
state-of-the-art accuracy for both single-relation
(SimpleQuestions) and multi-relation (WebQSP) QA benchmarks."
}
@inproceedings{he-2017-gener-natur,
title = "Generating Natural Answers by Incorporating Copying and
Retrieving Mechanisms in Sequence-to-Sequence Learning",
author = "He, Shizhu and Liu, Cao and Liu, Kang and Zhao, Jun",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2017,
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1019",
doi = "10.18653/v1/P17-1019",
pages = "199-208",
abstract = "Generating answer with natural language sentence is very
important in real-world question answering systems, which
needs to obtain a right answer as well as a coherent natural
response. In this paper, we propose an end-to-end question
answering system called COREQA in sequence-to-sequence
learning, which incorporates copying and retrieving
mechanisms to generate natural answers within an
encoder-decoder framework. Specifically, in COREQA, the
semantic units (words, phrases and entities) in a natural
answer are dynamically predicted from the vocabulary, copied
from the given question and/or retrieved from the
corresponding knowledge base jointly. Our empirical study on
both synthetic and real-world datasets demonstrates the
efficiency of COREQA, which is able to generate correct,
coherent and natural answers for knowledge inquired
questions."
}
@inproceedings{madotto-2018-mem2seq,
title = "{M}em2{S}eq: Effectively Incorporating Knowledge Bases into
End-to-End Task-Oriented Dialog Systems",
author = "Madotto, Andrea and Wu, Chien-Sheng and Fung, Pascale",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1136",
doi = "10.18653/v1/P18-1136",
pages = "1468-1478",
abstract = "End-to-end task-oriented dialog systems usually suffer from
the challenge of incorporating knowledge bases. In this
paper, we propose a novel yet simple end-to-end
differentiable model called memory-to-sequence (Mem2Seq) to
address this issue. Mem2Seq is the first neural generative
model that combines the multi-hop attention over memories
with the idea of pointer network. We empirically show how
Mem2Seq controls each generation step, and how its multi-hop
attention mechanism helps in learning correlations between
memories. In addition, our model is quite general without
complicated task-specific designs. As a result, we show that
Mem2Seq can be trained faster and attain the state-of-the-art
performance on three different task-oriented dialog
datasets."
}
@article{cheng-2016-wide-deep,
author = "Heng{-}Tze Cheng and Levent Koc and Jeremiah Harmsen and Tal
Shaked and Tushar Chandra and Hrishi Aradhye and Glen
Anderson and Greg Corrado and Wei Chai and Mustafa Ispir and
Rohan Anil and Zakaria Haque and Lichan Hong and Vihan Jain
and Xiaobing Liu and Hemal Shah",
title = "Wide {\&} Deep Learning for Recommender Systems",
journal = "CoRR",
volume = "abs/1606.07792",
year = 2016,
url = "http://arxiv.org/abs/1606.07792",
archivePrefix= "arXiv",
eprint = "1606.07792",
timestamp = "Mon, 13 Aug 2018 16:47:53 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ChengKHSCAACCIA16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{wang-2019-multi-passage-bert,
author = "{Wang}, Zhiguo and {Ng}, Patrick and {Ma}, Xiaofei and
{Nallapati}, Ramesh and {Xiang}, Bing",
title = "{Multi-passage BERT: A Globally Normalized BERT Model for
Open-domain Question Answering}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence",
year = 2019,
month = "Aug",
eid = "arXiv:1908.08167",
pages = "arXiv:1908.08167",
archivePrefix= "arXiv",
eprint = "1908.08167",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190808167W",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{sun-2019-how-fine,
author = "Chi Sun and Xipeng Qiu and Yige Xu and Xuanjing Huang",
title = "How to Fine-Tune {BERT} for Text Classification?",
journal = "CoRR",
volume = "abs/1905.05583",
year = 2019,
url = "http://arxiv.org/abs/1905.05583",
archivePrefix= "arXiv",
eprint = "1905.05583",
timestamp = "Tue, 28 May 2019 12:48:08 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1905-05583",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{diefenbach-2018-core-techn,
author = "Diefenbach, Dennis and Lopez, Vanessa and Singh, Kamal and
Maret, Pierre",
title = "Core Techniques of Question Answering Systems over Knowledge
Bases: A Survey",
journal = "Knowl. Inf. Syst.",
issue_date = "June 2018",
volume = 55,
number = 3,
month = jun,
year = 2018,
issn = "0219-1377",
pages = "529-569",
numpages = 41,
url = "https://doi.org/10.1007/s10115-017-1100-y",
doi = "10.1007/s10115-017-1100-y",
acmid = 3210959,
publisher = "Springer-Verlag",
address = "Berlin, Heidelberg",
keywords = "Knowledge base, QALD, Question answering, Semantic Web,
SimpleQuestions, Survey, WebQuestions"
}
@inproceedings{rakthanmanon-2012-search,
title = "Searching and mining trillions of time series subsequences
under dynamic time warping",
author = "Rakthanmanon, Thanawin and Campana, Bilson and Mueen,
Abdullah and Batista, Gustavo and Westover, Brandon and Zhu,
Qiang and Zakaria, Jesin and Keogh, Eamonn",
booktitle = "Proceedings of the 18th ACM SIGKDD international conference
on Knowledge discovery and data mining",
pages = "262-270",
year = 2012,
organization = "ACM"
}
@inproceedings{palatucci-2009-zero-shot,
title = "Zero-shot learning with semantic output codes",
author = "Palatucci, Mark and Pomerleau, Dean and Hinton, Geoffrey E
and Mitchell, Tom M",
booktitle = "Advances in neural information processing systems",
pages = "1410-1418",
year = 2009
}
@article{fei-fei-2006-one-shot,
title = "One-shot learning of object categories",
author = "Fei-Fei, Li and Fergus, Rob and Perona, Pietro",
journal = "IEEE transactions on pattern analysis and machine
intelligence",
volume = 28,
number = 4,
pages = "594-611",
year = 2006,
publisher = "IEEE"
}
@inproceedings{ganin-2015-unsup-domain-adapt-backp,
author = "Ganin, Yaroslav and Lempitsky, Victor",
title = "Unsupervised Domain Adaptation by Backpropagation",
booktitle = "Proceedings of the 32Nd International Conference on
International Conference on Machine Learning - Volume 37",
series = "ICML'15",
year = 2015,
location = "Lille, France",
pages = "1180-1189",
numpages = 10,
url = "http://dl.acm.org/citation.cfm?id=3045118.3045244",
acmid = 3045244,
publisher = "JMLR.org"
}
@Article{liu-2018-multi-relations,
author = "Liu, Jin and Ren, Haoliang and Wu, Menglong and Wang, Jin and
Kim, Hye-jin",
title = "Multiple relations extraction among multiple entities in
unstructured text",
journal = "Soft Computing",
year = 2018,
month = "Jul",
day = 01,
volume = 22,
number = 13,
pages = "4295-4305",
abstract = "Relations extraction is a widely researched topic in nature
language processing. However, most of the work in the
literature concentrate on the methods that are dealing with
single relation between two named entities. In the task of
multiple relations extraction, traditional statistic-based
methods have difficulties in selecting features and improving
the performance of extraction model. In this paper, we
presented formal definitions of multiple entities and
multiple relations and put forward three labeling methods
which were used to label entity categories, relation
categories and relation conditions. We also proposed a novel
relation extraction model which is based on dynamic long
short-term memory network. To train our model, entity
feature, entity position feature and part of speech feature
are used together. These features are used to describe
complex relations and improve the performance of relation
extraction model. In the experiments, we classified the
corpus into three sets composed of sentences of 0--20 words,
20--35 words, and 35+ words. On conll04.corp, the
final precision, recall rate and F-measure reached 72.9, 70.8
and 67.9{\%} respectively.",
issn = "1433-7479",
doi = "10.1007/s00500-017-2852-8",
url = "https://doi.org/10.1007/s00500-017-2852-8"
}
@article{bekoulis-2018-joint-entity,
author = "Giannis Bekoulis and Johannes Deleu and Thomas Demeester and
Chris Develder",
title = "Joint entity recognition and relation extraction as a
multi-head selection problem",
journal = "CoRR",
volume = "abs/1804.07847",
year = 2018,
url = "http://arxiv.org/abs/1804.07847",
archivePrefix= "arXiv",
eprint = "1804.07847",
timestamp = "Mon, 13 Aug 2018 16:49:03 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-07847",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{wang-2019-one-pass,
author = "Haoyu Wang and Ming Tan and Mo Yu and Shiyu Chang and Dakuo
Wang and Kun Xu and Xiaoxiao Guo and Saloni Potdar",
title = "Extracting Multiple-Relations in One-Pass with Pre-Trained
Transformers",
journal = "CoRR",
volume = "abs/1902.01030",
year = 2019,
url = "http://arxiv.org/abs/1902.01030",
archivePrefix= "arXiv",
eprint = "1902.01030",
timestamp = "Tue, 21 May 2019 18:03:37 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1902-01030",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{soares-2019-match-blank,
author = "Livio Baldini Soares and Nicholas FitzGerald and Jeffrey Ling
and Tom Kwiatkowski",
title = "Matching the Blanks: Distributional Similarity for Relation
Learning",
journal = "CoRR",
volume = "abs/1906.03158",
year = 2019,
url = "http://arxiv.org/abs/1906.03158",
archivePrefix= "arXiv",
eprint = "1906.03158",
timestamp = "Fri, 14 Jun 2019 09:38:24 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1906-03158",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{alt-2019-improv-relat,
author = "Christoph Alt and Marc H{\"{u}}bner and Leonhard Hennig",
title = "Improving Relation Extraction by Pre-trained Language
Representations",
journal = "CoRR",
volume = "abs/1906.03088",
year = 2019,
url = "http://arxiv.org/abs/1906.03088",
archivePrefix= "arXiv",
eprint = "1906.03088",
timestamp = "Fri, 14 Jun 2019 09:38:24 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1906-03088",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{shi-2019-simple-bert,
author = "Peng Shi and Jimmy Lin",
title = "Simple {BERT} Models for Relation Extraction and Semantic
Role Labeling",
journal = "CoRR",
volume = "abs/1904.05255",
year = 2019,
url = "http://arxiv.org/abs/1904.05255",
archivePrefix= "arXiv",
eprint = "1904.05255",
timestamp = "Thu, 25 Apr 2019 13:55:01 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1904-05255",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{yao-2019-docred,
author = "Yuan Yao and Deming Ye and Peng Li and Xu Han and Yankai Lin
and Zhenghao Liu and Zhiyuan Liu and Lixin Huang and Jie Zhou
and Maosong Sun",
title = "DocRED: {A} Large-Scale Document-Level Relation Extraction
Dataset",
journal = "CoRR",
volume = "abs/1906.06127",
year = 2019,
url = "http://arxiv.org/abs/1906.06127",
archivePrefix= "arXiv",
eprint = "1906.06127",
timestamp = "Tue, 23 Jul 2019 15:49:40 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1906-06127",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{fu-2019-graphrel,
title = "{G}raph{R}el: Modeling Text as Relational Graphs for Joint
Entity and Relation Extraction",
author = "Fu, Tsu-Jui and Li, Peng-Hsuan and Ma, Wei-Yun",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1136",
doi = "10.18653/v1/P19-1136",
pages = "1409-1418",
abstract = "In this paper, we present GraphRel, an end-to-end relation
extraction model which uses graph convolutional networks
(GCNs) to jointly learn named entities and relations. In
contrast to previous baselines, we consider the interaction
between named entities and relations via a 2nd-phase
relation-weighted GCN to better extract relations. Linear and
dependency structures are both used to extract both
sequential and regional features of the text, and a complete
word graph is further utilized to extract implicit features
among all word pairs of the text. With the graph-based
approach, the prediction for overlapping relations is
substantially improved over previous sequential
approaches. We evaluate GraphRel on two public datasets: NYT
and WebNLG. Results show that GraphRel maintains high
precision while increasing recall substantially. Also,
GraphRel outperforms previous work by 3.2{\%} and 5.8{\%} (F1
score), achieving a new state-of-the-art for relation
extraction."
}
@article{quirk-2016-distan-super,
author = "Chris Quirk and Hoifung Poon",
title = "Distant Supervision for Relation Extraction beyond the
Sentence Boundary",
journal = "CoRR",
volume = "abs/1609.04873",
year = 2016,
url = "http://arxiv.org/abs/1609.04873",
archivePrefix= "arXiv",
eprint = "1609.04873",
timestamp = "Mon, 13 Aug 2018 16:49:11 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/QuirkP16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{peng-2017-cross-sentence,
author = "Nanyun Peng and Hoifung Poon and Chris Quirk and Kristina
Toutanova and Wen{-}tau Yih",
title = "Cross-Sentence N-ary Relation Extraction with Graph LSTMs",
journal = "CoRR",
volume = "abs/1708.03743",
year = 2017,
url = "http://arxiv.org/abs/1708.03743",
archivePrefix= "arXiv",
eprint = "1708.03743",
timestamp = "Mon, 13 Aug 2018 16:48:58 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1708-03743",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{song-2018-n-ary,
author = "Linfeng Song and Yue Zhang and Zhiguo Wang and Daniel Gildea",
title = "N-ary Relation Extraction using Graph State {LSTM}",
journal = "CoRR",
volume = "abs/1808.09101",
year = 2018,
url = "http://arxiv.org/abs/1808.09101",
archivePrefix= "arXiv",
eprint = "1808.09101",
timestamp = "Mon, 03 Sep 2018 13:36:40 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1808-09101",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{zhang-2019-drug-drug,
author = "Zhang, Tianlin and Leng, Jiaxu and Liu, Ying",
title = "{Deep learning for drug–drug interaction extraction from the
literature: a review}",
journal = "Briefings in Bioinformatics",
year = 2019,
month = 11,
abstract = "{Drug–drug interactions (DDIs) are crucial for drug research
and pharmacovigilance. These interactions may cause adverse
drug effects that threaten public health and patient
safety. Therefore, the DDIs extraction from biomedical
literature has been widely studied and emphasized in modern
biomedical research. The previous rules-based and machine
learning approaches rely on tedious feature engineering,
which is laborious, time-consuming and unsatisfactory. With
the development of deep learning technologies, this problem
is alleviated by learning feature representations
automatically. Here, we review the recent deep learning
methods that have been applied to the extraction of DDIs from
biomedical literature. We describe each method briefly and
compare its performance in the DDI corpus
systematically. Next, we summarize the advantages and
disadvantages of these deep learning models for this
task. Furthermore, we discuss some challenges and future
perspectives of DDI extraction via deep learning
methods. This review aims to serve as a useful guide for
interested researchers to further advance bioinformatics
algorithms for DDIs extraction from the literature.}",
issn = "1477-4054",
doi = "10.1093/bib/bbz087",
url = "https://doi.org/10.1093/bib/bbz087",
note = "bbz087",
eprint =
"http://oup.prod.sis.lan/bib/advance-article-pdf/doi/10.1093/bib/bbz087/30342664/bbz087.pdf"
}
@article{zheng-2017-joint-entity,
title = "Joint entity and relation extraction based on a hybrid neural
network",
author = "Zheng, Suncong and Hao, Yuexing and Lu, Dongyuan and Bao,
Hongyun and Xu, Jiaming and Hao, Hongwei and Xu, Bo",
journal = "Neurocomputing",
volume = 257,
pages = "59-66",
year = 2017,
publisher = "Elsevier"
}
@article{li-2017-neural-joint,
title = "A neural joint model for entity and relation extraction from
biomedical text",
author = "Li, Fei and Zhang, Meishan and Fu, Guohong and Ji, Donghong",
journal = "BMC bioinformatics",
volume = 18,
number = 1,
pages = 198,
year = 2017,
publisher = "BioMed Central"
}
@inproceedings{bekoulis-2018-adver,
title = "Adversarial training for multi-context joint entity and
relation extraction",
author = "Bekoulis, Giannis and Deleu, Johannes and Demeester, Thomas
and Develder, Chris",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1307",
doi = "10.18653/v1/D18-1307",
pages = "2830-2836",
abstract = "Adversarial training (AT) is a regularization method that can
be used to improve the robustness of neural network methods
by adding small perturbations in the training data. We show
how to use AT for the tasks of entity recognition and
relation extraction. In particular, we demonstrate that
applying AT to a general purpose baseline model for jointly
extracting entities and relations, allows improving the
state-of-the-art effectiveness on several datasets in
different contexts (i.e., news, biomedical, and real estate
data) and for different languages (English and Dutch)."
}
@inproceedings{verga-2018-simul-self,
title = "Simultaneously Self-Attending to All Mentions for
Full-Abstract Biological Relation Extraction",
author = "Verga, Patrick and Strubell, Emma and McCallum, Andrew",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long Papers)",
month = jun,
year = 2018,
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N18-1080",
doi = "10.18653/v1/N18-1080",
pages = "872-884",
abstract = "Most work in relation extraction forms a prediction by
looking at a short span of text within a single sentence
containing a single entity pair mention. This approach often
does not consider interactions across mentions, requires
redundant computation for each mention pair, and ignores
relationships expressed across sentence boundaries. These
problems are exacerbated by the document- (rather than
sentence-) level annotation common in biological text. In
response, we propose a model which simultaneously predicts
relationships between all mention pairs in a document. We
form pairwise predictions over entire paper abstracts using
an efficient self-attention encoder. All-pairs mention scores
allow us to perform multi-instance learning by aggregating
over mentions to form entity pair representations. We further
adapt to settings without mention-level annotation by jointly
training to predict named entities and adding a corpus of
weakly labeled data. In experiments on two Biocreative
benchmark datasets, we achieve state of the art performance
on the Biocreative V Chemical Disease Relation dataset for
models without external KB resources. We also introduce a new
dataset an order of magnitude larger than existing
human-annotated biological information extraction datasets
and more accurate than distantly supervised alternatives."
}
@article{nguyen-2018-end-to-end,
author = "Dat Quoc Nguyen and Karin Verspoor",
title = "End-to-end neural relation extraction using deep biaffine
attention",
journal = "CoRR",
volume = "abs/1812.11275",
year = 2018,
url = "http://arxiv.org/abs/1812.11275",
archivePrefix= "arXiv",
eprint = "1812.11275",
timestamp = "Wed, 02 Jan 2019 14:40:18 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1812-11275",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{yan-2019-unified-model,
author = "Hang Yan and Xipeng Qiu and Xuanjing Huang",
title = "A Unified Model for Joint Chinese Word Segmentation and
Dependency Parsing",
journal = "CoRR",
volume = "abs/1904.04697",
year = 2019,
url = "http://arxiv.org/abs/1904.04697",
archivePrefix= "arXiv",
eprint = "1904.04697",
timestamp = "Thu, 25 Apr 2019 13:55:01 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1904-04697",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{zhang-2017-slot-filling,
title = "Position-aware Attention and Supervised Data Improve Slot
Filling",
author = "Zhang, Yuhao and Zhong, Victor and Chen, Danqi and Angeli,
Gabor and Manning, Christopher D.",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1004",
doi = "10.18653/v1/D17-1004",
pages = "35-45",
abstract = "Organized relational knowledge in the form of {``}knowledge
graphs{''} is important for many applications. However, the
ability to populate knowledge bases with facts automatically
extracted from documents has improved frustratingly
slowly. This paper simultaneously addresses two issues that
have held back prior work. We first propose an effective new
model, which combines an LSTM sequence model with a form of
entity position-aware attention that is better suited to
relation extraction. Then we build TACRED, a large (119,474
examples) supervised relation extraction dataset obtained via
crowdsourcing and targeted towards TAC KBP relations. The
combination of better supervised data and a more appropriate
high-capacity model enables much better relation extraction
performance. When the model trained on this new dataset
replaces the previous relation extraction component of the
best TAC KBP 2015 slot filling system, its F1 score increases
markedly from 22.2{\%} to 26.7{\%}."
}
@inproceedings{han-2018-fewrel,
title = "{F}ew{R}el: A Large-Scale Supervised Few-Shot Relation
Classification Dataset with State-of-the-Art Evaluation",
author = "Han, Xu and Zhu, Hao and Yu, Pengfei and Wang, Ziyun and Yao,
Yuan and Liu, Zhiyuan and Sun, Maosong",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1514",
doi = "10.18653/v1/D18-1514",
pages = "4803-4809",
abstract = "We present a Few-Shot Relation Classification Dataset
(FewRel), consisting of 70,000 sentences on 100 relations
derived from Wikipedia and annotated by crowdworkers. The
relation of each sentence is first recognized by distant
supervision methods, and then filtered by crowdworkers. We
adapt the most recent state-of-the-art few-shot learning
methods for relation classification and conduct thorough
evaluation of these methods. Empirical results show that even
the most competitive few-shot learning models struggle on
this task, especially as compared with humans. We also show
that a range of different reasoning skills are needed to
solve our task. These results indicate that few-shot relation
classification remains an open problem and still requires
further research. Our detailed analysis points out multiple
directions for future research."
}
@article{levy-2017-zero-shot,
author = "Omer Levy and Minjoon Seo and Eunsol Choi and Luke
Zettlemoyer",
title = "Zero-Shot Relation Extraction via Reading Comprehension",
journal = "CoRR",
volume = "abs/1706.04115",
year = 2017,
url = "http://arxiv.org/abs/1706.04115",
archivePrefix= "arXiv",
eprint = "1706.04115",
timestamp = "Mon, 13 Aug 2018 16:46:48 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/LevySCZ17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{gao-2019-fewrel-2,
title = "{F}ew{R}el 2.0: Towards More Challenging Few-Shot Relation
Classification",
author = "Gao, Tianyu and Han, Xu and Zhu, Hao and Liu, Zhiyuan and Li,
Peng and Sun, Maosong and Zhou, Jie",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1649",
doi = "10.18653/v1/D19-1649",
pages = "6251-6256",
abstract = "We present FewRel 2.0, a more challenging task to investigate
two aspects of few-shot relation classification models: (1)
Can they adapt to a new domain with only a handful of
instances? (2) Can they detect none-of-the-above (NOTA)
relations? To construct FewRel 2.0, we build upon the FewRel
dataset by adding a new test set in a quite different domain,
and a NOTA relation choice. With the new dataset and
extensive experimental analysis, we found (1) that the
state-of-the-art few-shot relation classification models
struggle on these two aspects, and (2) that the commonly-used
techniques for domain adaptation and NOTA detection still
cannot handle the two challenges well. Our research calls for
more attention and further efforts to these two real-world
issues. All details and resources about the dataset and
baselines are released at https://github.com/thunlp/fewrel."
}
@article{snell-2017-prototypical-networks,
author = "Jake Snell and Kevin Swersky and Richard S. Zemel",
title = "Prototypical Networks for Few-shot Learning",
journal = "CoRR",
volume = "abs/1703.05175",
year = 2017,
url = "http://arxiv.org/abs/1703.05175",
archivePrefix= "arXiv",
eprint = "1703.05175",
timestamp = "Mon, 13 Aug 2018 16:46:05 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/SnellSZ17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{cui-2019-bilstm-lan,
author = "{Cui}, Leyang and {Zhang}, Yue",
title = "{Hierarchically-Refined Label Attention Network for Sequence
Labeling}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2019,
month = "Aug",
eid = "arXiv:1908.08676",
pages = "arXiv:1908.08676",
archivePrefix= "arXiv",
eprint = "1908.08676",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190808676C",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{xu-2019-scalin-open,
title = "Scaling up Open Tagging from Tens to Thousands: Comprehension
Empowered Attribute Value Extraction from Product Title",
author = "Xu, Huimin and Wang, Wenting and Mao, Xin and Jiang, Xinyu
and Lan, Man",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1514",
doi = "10.18653/v1/P19-1514",
pages = "5214-5223",
abstract = "Supplementing product information by extracting attribute
values from title is a crucial task in e-Commerce
domain. Previous studies treat each attribute only as an
entity type and build one set of NER tags (e.g., BIO) for
each of them, leading to a scalability issue that makes them
unfit for the large-sized attribute systems of real-world e-Commerce. In
this work, we propose a novel approach to support value
extraction scaling up to thousands of attributes without
losing performance: (1) We propose to regard attribute as a
query and adopt only one global set of BIO tags for any
attributes to reduce the burden of attribute tag or model
explosion; (2) We explicitly model the semantic
representations for attribute and title, and develop an
attention mechanism to capture the interactive semantic
relations in-between to enforce our framework to be attribute
comprehensive. We conduct extensive experiments in real-life
datasets. The results show that our model not only
outperforms existing state-of-the-art NER tagging models, but
also is robust and generates promising results for up to
8,906 attributes."
}
@article{zheng-2018-opentag,
author = "Guineng Zheng and Subhabrata Mukherjee and Xin Luna Dong and
Feifei Li",
title = "OpenTag: Open Attribute Value Extraction from Product
Profiles",
journal = "CoRR",
volume = "abs/1806.01264",
year = 2018,
url = "http://arxiv.org/abs/1806.01264",
archivePrefix= "arXiv",
eprint = "1806.01264",
timestamp = "Mon, 13 Aug 2018 16:46:56 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1806-01264",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{yahya-2014-renoun,
title = "{R}e{N}oun: Fact Extraction for Nominal Attributes",
author = "Yahya, Mohamed and Whang, Steven and Gupta, Rahul and Halevy,
Alon",
booktitle = "Proceedings of the 2014 Conference on Empirical Methods in
Natural Language Processing ({EMNLP})",
month = oct,
year = 2014,
address = "Doha, Qatar",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D14-1038",
doi = "10.3115/v1/D14-1038",
pages = "325-335"
}
@article{jiang-2017-metapad,
author = "Meng Jiang and Jingbo Shang and Taylor Cassidy and Xiang Ren
and Lance M. Kaplan and Timothy P. Hanratty and Jiawei Han",
title = "MetaPAD: Meta Pattern Discovery from Massive Text Corpora",
journal = "CoRR",
volume = "abs/1703.04213",
year = 2017,
url = "http://arxiv.org/abs/1703.04213",
archivePrefix= "arXiv",
eprint = "1703.04213",
timestamp = "Mon, 13 Aug 2018 16:48:27 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/JiangSCRKHH17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{zhang-2019-unsup-annot,
author = "{Zhang}, Jingqing and {Zhang}, Xiaoyu and {Sun}, Kai and
{Yang}, Xian and {Dai}, Chengliang and {Guo}, Yike",
title = "{Unsupervised Annotation of Phenotypic Abnormalities via
Semantic Latent Representations on Electronic Health
Records}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2019,
month = "Nov",
eid = "arXiv:1911.03862",
pages = "arXiv:1911.03862",
archivePrefix= "arXiv",
eprint = "1911.03862",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv191103862Z",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{liu-2015-segphrase,
title = "Mining quality phrases from massive text corpora",
author = "Liu, Jialu and Shang, Jingbo and Wang, Chi and Ren, Xiang and
Han, Jiawei",
booktitle = "Proceedings of the 2015 ACM SIGMOD International Conference
on Management of Data",
pages = "1729-1744",
year = 2015,
organization = "ACM"
}
@article{shang-2017-autophrase,
author = "Jingbo Shang and Jialu Liu and Meng Jiang and Xiang Ren and
Clare R. Voss and Jiawei Han",
title = "Automated Phrase Mining from Massive Text Corpora",
journal = "CoRR",
volume = "abs/1702.04457",
year = 2017,
url = "http://arxiv.org/abs/1702.04457",
archivePrefix= "arXiv",
eprint = "1702.04457",
timestamp = "Mon, 13 Aug 2018 16:46:43 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ShangLJRVH17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{el-kishky-2014-topmining,
title = "Scalable topical phrase mining from text corpora",
author = "El-Kishky, Ahmed and Song, Yanglei and Wang, Chi and Voss,
Clare R and Han, Jiawei",
journal = "Proceedings of the VLDB Endowment",
volume = 8,
number = 3,
pages = "305-316",
year = 2014,
publisher = "VLDB Endowment"
}
@inproceedings{wang-2019-autobioner,
title = "Distantly Supervised Biomedical Named Entity Recognition with
Dictionary Expansion",
author = "Wang, Xuan and Zhang, Yu and Li, Qi and Ren, Xiang and Shang,
Jingbo and Han, Jiawei",
booktitle = "Proc. 2019 IEEE Int. Conf. on Bioinformatics and Biomedicine
(IEEE-BIBM’19), San Diego, CA",
year = 2019
}
@inproceedings{shen-2017-setexpan,
title        = "{S}et{E}xpan: Corpus-based set expansion via context feature
selection and rank ensemble",
author = "Shen, Jiaming and Wu, Zeqiu and Lei, Dongming and Shang,
Jingbo and Ren, Xiang and Han, Jiawei",
booktitle = "Joint European Conference on Machine Learning and Knowledge
Discovery in Databases",
pages = "288-304",
year = 2017,
organization = "Springer"
}
@inproceedings{rong-2016-egoset,
title        = "{E}go{S}et: Exploiting word ego-networks and user-generated
ontology for multifaceted set expansion",
author = "Rong, Xin and Chen, Zhe and Mei, Qiaozhu and Adar, Eytan",
booktitle = "Proceedings of the Ninth ACM international conference on Web
search and data mining",
pages = "645-654",
year = 2016,
organization = "ACM"
}
@inproceedings{lin-2008-textcube,
author = "Lin, Cindy Xide and Ding, Bolin and Han, Jiawei and Zhu,
Feida and Zhao, Bo",
title        = "Text Cube: Computing {IR} measures for multidimensional text
database analysis",
booktitle = "2008 Eighth IEEE International Conference on Data Mining",
year = 2008,
pages = "905-910",
organization = "IEEE"
}
@article{sun-2011-pathsim,
author = "Sun, Yizhou and Han, Jiawei and Yan, Xifeng and Yu, Philip S
and Wu, Tianyi",
title        = "{P}ath{S}im: Meta Path-Based Top-K Similarity Search in
Heterogeneous Information Networks",
journal = "Proceedings of the VLDB Endowment",
volume = 4,
number = 11,
pages = "992-1003",
year = 2011,
publisher = "Citeseer"
}
@inproceedings{ren-2015-clustype,
author = "Ren, Xiang and El-Kishky, Ahmed and Wang, Chi and Tao, Fangbo
and Voss, Clare R and Han, Jiawei",
title        = "{C}lus{T}ype: Effective entity recognition and typing by relation
phrase-based clustering",
booktitle = "Proceedings of the 21th ACM SIGKDD International Conference
on Knowledge Discovery and Data Mining",
year = 2015,
pages = "995-1004",
organization = "ACM"
}
@article{ren-2016-cotype,
author = "Xiang Ren and Zeqiu Wu and Wenqi He and Meng Qu and Clare
R. Voss and Heng Ji and Tarek F. Abdelzaher and Jiawei Han",
title = "CoType: Joint Extraction of Typed Entities and Relations with
Knowledge Bases",
journal = "CoRR",
volume = "abs/1610.08763",
year = 2016,
url = "http://arxiv.org/abs/1610.08763",
archivePrefix= "arXiv",
eprint = "1610.08763",
timestamp = "Mon, 13 Aug 2018 16:46:29 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/RenWHQVJAH16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{ren-2016-afet,
title = "{AFET}: Automatic Fine-Grained Entity Typing by Hierarchical
Partial-Label Embedding",
author = "Ren, Xiang and He, Wenqi and Qu, Meng and Huang, Lifu and Ji,
Heng and Han, Jiawei",
booktitle = "Proceedings of the 2016 Conference on Empirical Methods in
Natural Language Processing",
month = nov,
year = 2016,
address = "Austin, Texas",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D16-1144",
doi = "10.18653/v1/D16-1144",
pages = "1369-1378"
}
@article{liu-2017-rehession,
author = "Liu, Liyuan and Ren, Xiang and Zhu, Qi and Zhi, Shi and Gui,
Huan and Ji, Heng and Han, Jiawei",
title = "Heterogeneous Supervision for Relation Extraction: a
Representation Learning Approach",
journal = "arXiv preprint arXiv:1707.00166",
year = 2017
}
@inproceedings{ren-2016-ple,
author = "Ren, Xiang and He, Wenqi and Qu, Meng and Voss, Clare R and
Ji, Heng and Han, Jiawei",
title = "Label noise reduction in entity typing by heterogeneous
partial-label embedding",
booktitle = "Proceedings of the 22nd ACM SIGKDD international conference
on Knowledge discovery and data mining",
year = 2016,
pages = "1825-1834",
organization = "ACM"
}
@inproceedings{qu-2017-auto-synonym,
title = "Automatic synonym discovery with knowledge bases",
author = "Qu, Meng and Ren, Xiang and Han, Jiawei",
booktitle = "Proceedings of the 23rd ACM SIGKDD International Conference
on Knowledge Discovery and Data Mining",
pages = "997-1005",
year = 2017,
organization = "ACM"
}
@article{tao-2016-textcube-summarization,
title = "Multi-Dimensional, Phrase-Based Summarization in Text Cubes",
author = "Fangbo Tao and Honglei Zhuang and Chi Wang Yu and Qi Wang and
Taylor Cassidy and Lance M. Kaplan and Clare R. Voss and
Jiawei Han",
journal = "IEEE Data Eng. Bull.",
year = 2016,
volume = 39,
pages = "74-84"
}
@inproceedings{liu-2016-laki,
title = "Representing documents via latent keyphrase inference",
author = "Liu, Jialu and Ren, Xiang and Shang, Jingbo and Cassidy,
Taylor and Voss, Clare R and Han, Jiawei",
booktitle = "Proceedings of the 25th international conference on World
wide web",
pages = "1057-1067",
year = 2016,
organization = "International World Wide Web Conferences Steering Committee"
}
@article{hosseini-2018-heteromed,
author = "Anahita Hosseini and Ting Chen and Wenjun Wu and Yizhou Sun
and Majid Sarrafzadeh",
title = "HeteroMed: Heterogeneous Information Network for Medical
Diagnosis",
journal = "CoRR",
volume = "abs/1804.08052",
year = 2018,
url = "http://arxiv.org/abs/1804.08052",
archivePrefix= "arXiv",
eprint = "1804.08052",
timestamp = "Wed, 17 Apr 2019 16:16:59 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1804-08052",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{han-2017-mining-structs,
author = "Han, Jiawei",
title        = "Mining Structures from Massive Text Data: A Data-Driven
                Approach",
booktitle = "SIMBig",
year = 2017,
pages = "16-19"
}
@article{gui-2018-exper-findin,
author = "Huan Gui and Qi Zhu and Liyuan Liu and Aston Zhang and Jiawei
Han",
title = "Expert Finding in Heterogeneous Bibliographic Networks with
Locally-trained Embeddings",
journal = "CoRR",
volume = "abs/1803.03370",
year = 2018,
url = "http://arxiv.org/abs/1803.03370",
archivePrefix= "arXiv",
eprint = "1803.03370",
timestamp = "Mon, 13 Aug 2018 16:48:03 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1803-03370",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{sun-2013-minin-heter-infor-networ,
author = "Sun, Yizhou and Han, Jiawei",
title = "Mining Heterogeneous Information Networks: a Structural
Analysis Approach",
journal      = "ACM SIGKDD Explorations Newsletter",
volume = 14,
number = 2,
pages = "20-28",
year = 2013,
publisher = "ACM"
}
@inproceedings{sui-2019-cgn,
title = "Leverage Lexical Knowledge for {C}hinese Named Entity
Recognition via Collaborative Graph Network",
author = "Sui, Dianbo and Chen, Yubo and Liu, Kang and Zhao, Jun and
Liu, Shengping",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1396",
doi = "10.18653/v1/D19-1396",
pages = "3828-3838",
abstract     = "The lack of word boundary information has been seen as one
                of the main obstacles to developing a high-performance Chinese
                named entity recognition (NER) system. Fortunately, the
automatically constructed lexicon contains rich word
boundaries information and word semantic
information. However, integrating lexical knowledge in
Chinese NER tasks still faces challenges when it comes to
self-matched lexical words as well as the nearest contextual
lexical words. We present a Collaborative Graph Network to
solve these challenges. Experiments on various datasets show
that our model not only outperforms the state-of-the-art
(SOTA) results, but also achieves a speed that is six to
fifteen times faster than that of the SOTA model."
}
@article{kipf-2016-gcn,
author = "{Kipf}, Thomas N. and {Welling}, Max",
title = "{Semi-Supervised Classification with Graph Convolutional
Networks}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Statistics - Machine
Learning",
year = 2016,
month = "Sep",
eid = "arXiv:1609.02907",
pages = "arXiv:1609.02907",
archivePrefix= "arXiv",
eprint = "1609.02907",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160902907K",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{niepert-2016-gcn,
title = "Learning convolutional neural networks for graphs",
author = "Niepert, Mathias and Ahmed, Mohamed and Kutzkov, Konstantin",
booktitle = "International conference on machine learning",
pages = "2014-2023",
year = 2016
}
@article{velickovic-2017-gat,
author = "{Veli{\v{c}}kovi{\'c}}, Petar and {Cucurull}, Guillem and
{Casanova}, Arantxa and {Romero}, Adriana and {Li{\`o}},
Pietro and {Bengio}, Yoshua",
title = "{Graph Attention Networks}",
journal = "arXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Artificial
Intelligence, Computer Science - Machine Learning, Computer
Science - Social and Information Networks",
year = 2017,
month = "Oct",
eid = "arXiv:1710.10903",
pages = "arXiv:1710.10903",
archivePrefix= "arXiv",
eprint = "1710.10903",
primaryClass = "stat.ML",
adsurl = "https://ui.adsabs.harvard.edu/abs/2017arXiv171010903V",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{xue-2019-bert-joint,
author = "{Xue}, Kui and {Zhou}, Yangming and {Ma}, Zhiyuan and {Ruan},
Tong and {Zhang}, Huanhuan and {He}, Ping",
title = "{Fine-tuning BERT for Joint Entity and Relation Extraction in
Chinese Medical Text}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2019,
month = "Aug",
eid = "arXiv:1908.07721",
pages = "arXiv:1908.07721",
archivePrefix= "arXiv",
eprint = "1908.07721",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190807721X",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{jia-2019-doc-level,
author = "Robin Jia and Cliff Wong and Hoifung Poon",
title = "Document-Level N-ary Relation Extraction with Multiscale
Representation Learning",
journal = "CoRR",
volume = "abs/1904.02347",
year = 2019,
url = "http://arxiv.org/abs/1904.02347",
archivePrefix= "arXiv",
eprint = "1904.02347",
timestamp = "Wed, 24 Apr 2019 12:21:25 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1904-02347",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{gupta-2019-nested-ner,
title = "Linguistically Informed Relation Extraction and Neural
Architectures for Nested Named Entity Recognition in
{B}io{NLP}-{OST} 2019",
author = "Gupta, Pankaj and Yaseen, Usama and Sch{\"u}tze, Hinrich",
booktitle = "Proceedings of The 5th Workshop on BioNLP Open Shared Tasks",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-5720",
doi = "10.18653/v1/D19-5720",
pages = "132-142",
abstract = "Named Entity Recognition (NER) and Relation Extraction (RE)
are essential tools in distilling knowledge from biomedical
literature. This paper presents our findings from
participating in BioNLP Shared Tasks 2019. We addressed Named
Entity Recognition including nested entity extraction,
Entity Normalization and Relation Extraction. Our proposed
approach to Named Entity extraction can be generalized to different
languages, and we have shown its effectiveness for English
and Spanish text. We investigated linguistic features, hybrid
loss including ranking and Conditional Random Fields (CRF),
multi-task objective and token level ensembling strategy to
improve NER. We employed dictionary based fuzzy and semantic
search to perform Entity Normalization. Finally, our RE
system employed Support Vector Machine (SVM) with linguistic
features. Our NER submission (team:MIC-CIS) ranked first in
BB-2019 norm+NER task with standard error rate (SER) of
0.7159 and showed competitive performance on PharmaCo NER
task with F1-score of 0.8662. Our RE system ranked first in
the SeeDev-binary Relation Extraction Task with F1-score of
0.3738."
}
@inproceedings{guo-2019-aggcn,
title = "Attention Guided Graph Convolutional Networks for Relation
Extraction",
author = "Guo, Zhijiang and Zhang, Yan and Lu, Wei",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1024",
doi = "10.18653/v1/P19-1024",
pages = "241-251",
abstract = "Dependency trees convey rich structural information that is
proven useful for extracting relations among entities in
text. However, how to effectively make use of relevant
information while ignoring irrelevant information from the
dependency trees remains a challenging research
question. Existing approaches employing rule based
hard-pruning strategies for selecting relevant partial
dependency structures may not always yield optimal
results. In this work, we propose Attention Guided Graph
Convolutional Networks (AGGCNs), a novel model which directly
takes full dependency trees as inputs. Our model can be
understood as a soft-pruning approach that automatically
learns how to selectively attend to the relevant
sub-structures useful for the relation extraction
task. Extensive results on various tasks including
cross-sentence n-ary relation extraction and large-scale
sentence-level relation extraction show that our model is
able to better leverage the structural information of the
full dependency trees, giving significantly better results
than previous approaches."
}
@article{he-2019-nre-pul,
author = "{He}, Zhengqiu and {Chen}, Wenliang and {Wang}, Yuyi and
{zhang}, Wei and {Wang}, Guanchun and {Zhang}, Min",
title = "{Improving Neural Relation Extraction with Positive and
Unlabeled Learning}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2019,
month = "Nov",
eid = "arXiv:1911.12556",
pages = "arXiv:1911.12556",
archivePrefix= "arXiv",
eprint = "1911.12556",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv191112556H",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{nayak-2019-nre-am,
title = "Effective Attention Modeling for Neural Relation Extraction",
author = "Nayak, Tapas and Ng, Hwee Tou",
booktitle = "Proceedings of the 23rd Conference on Computational Natural
Language Learning (CoNLL)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/K19-1056",
doi = "10.18653/v1/K19-1056",
pages = "603-612",
abstract = "Relation extraction is the task of determining the relation
between two entities in a sentence. Distantly-supervised
models are popular for this task. However, sentences can be
long and two entities can be located far from each other in a
sentence. The pieces of evidence supporting the presence of a
relation between two entities may not be very direct, since
the entities may be connected via some indirect links such as
a third entity or via co-reference. Relation extraction in
such scenarios becomes more challenging as we need to capture
the long-distance interactions among the entities and other
words in the sentence. Also, the words in a sentence do not
contribute equally in identifying the relation between the
two entities. To address this issue, we propose a novel and
effective attention model which incorporates syntactic
information of the sentence and a multi-factor attention
mechanism. Experiments on the New York Times corpus show that
our proposed model outperforms prior state-of-the-art
models."
}
@article{shang-2019-noisy-dsre,
author = "{Shang}, Yuming",
title = "{Are Noisy Sentences Useless for Distant Supervised Relation
Extraction?}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Machine Learning",
year = 2019,
month = "Nov",
eid = "arXiv:1911.09788",
pages = "arXiv:1911.09788",
archivePrefix= "arXiv",
eprint = "1911.09788",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv191109788S",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{tran-2019-nml,
author = "Tung Tran and Ramakanth Kavuluru",
title = "Neural Metric Learning for Fast End-to-End Relation
Extraction",
journal = "CoRR",
volume = "abs/1905.07458",
year = 2019,
url = "http://arxiv.org/abs/1905.07458",
archivePrefix= "arXiv",
eprint = "1905.07458",
timestamp = "Wed, 28 Aug 2019 07:29:35 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1905-07458",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{amos-2017-optnet,
author = "Brandon Amos and J. Zico Kolter",
title = "OptNet: Differentiable Optimization as a Layer in Neural
Networks",
journal = "CoRR",
volume = "abs/1703.00443",
year = 2017,
url = "http://arxiv.org/abs/1703.00443",
archivePrefix= "arXiv",
eprint = "1703.00443",
timestamp = "Mon, 13 Aug 2018 16:48:26 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/AmosK17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{yang-2016-chinese-emr-corpus,
  title        = "Corpus Construction for Named Entities and Entity Relations
                  on {C}hinese Electronic Medical Records",
  author       = "Yang, Jinfeng and Guan, Yi and He, Bin and Qu, Chunyan and
                  Yu, Qiubin and Liu, Yaxin and Zhao, Yongjie",
  journal      = "Journal of Software (软件学报)",
  number       = 11,
  pages        = "2725-2746",
  year         = 2016,
  note         = "In Chinese"
}
@inproceedings{kuru-2016-charner,
title = "{C}har{NER}: Character-Level Named Entity Recognition",
author = "Kuru, Onur and Can, Ozan Arkan and Yuret, Deniz",
booktitle = "Proceedings of {COLING} 2016, the 26th International
Conference on Computational Linguistics: Technical Papers",
month = dec,
year = 2016,
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://www.aclweb.org/anthology/C16-1087",
pages = "911-921",
abstract = "We describe and evaluate a character-level tagger for
language-independent Named Entity Recognition (NER). Instead
of words, a sentence is represented as a sequence of
characters. The model consists of stacked bidirectional LSTMs
which take characters as input and output tag probabilities for
each character. These probabilities are then converted to
consistent word level named entity tags using a Viterbi
decoder. We are able to achieve close to state-of-the-art NER
performance in seven languages with the same basic model
using only labeled NER data and no hand-engineered features
or other external resources like syntactic taggers or
Gazetteers."
}
@article{ma-2016-lstm-cnn-crf,
author = "{Ma}, Xuezhe and {Hovy}, Eduard",
title = "{End-to-end Sequence Labeling via Bi-directional
LSTM-CNNs-CRF}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Computation and Language, Statistics - Machine Learning",
year = 2016,
month = "Mar",
eid = "arXiv:1603.01354",
pages = "arXiv:1603.01354",
archivePrefix= "arXiv",
eprint = "1603.01354",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160301354M",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{yang-2017-neural-reranking,
author = "Jie Yang and Yue Zhang and Fei Dong",
title = "Neural Reranking for Named Entity Recognition",
journal = "CoRR",
volume = "abs/1707.05127",
year = 2017,
url = "http://arxiv.org/abs/1707.05127",
archivePrefix= "arXiv",
eprint = "1707.05127",
timestamp = "Wed, 20 Nov 2019 08:54:08 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/YangZD17aa",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{li-2017-ner-recursive-nn,
title = "Leveraging Linguistic Structures for Named Entity Recognition
with Bidirectional Recursive Neural Networks",
author = "Li, Peng-Hsuan and Dong, Ruo-Ping and Wang, Yu-Siang and
Chou, Ju-Chieh and Ma, Wei-Yun",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1282",
doi = "10.18653/v1/D17-1282",
pages = "2664-2669",
abstract = "In this paper, we utilize the linguistic structures of texts
to improve named entity recognition by BRNN-CNN, a special
bidirectional recursive network attached with a convolutional
network. Motivated by the observation that named entities are
highly related to linguistic constituents, we propose a
constituent-based BRNN-CNN for named entity recognition. In
contrast to classical sequential labeling methods, the system
first identifies which text chunks are possible named
entities by whether they are linguistic constituents. Then it
classifies these chunks with a constituency tree structure by
recursively propagating syntactic and semantic information to
each constituent node. This method surpasses current
state-of-the-art on OntoNotes 5.0 with automatically
generated parses."
}
@inproceedings{tran-2017-stack-residual-lstm,
title = "Named Entity Recognition with Stack Residual {LSTM} and
Trainable Bias Decoding",
author = "Tran, Quan and MacKinlay, Andrew and Jimeno Yepes, Antonio",
booktitle = "Proceedings of the Eighth International Joint Conference on
Natural Language Processing (Volume 1: Long Papers)",
month = nov,
year = 2017,
address = "Taipei, Taiwan",
publisher = "Asian Federation of Natural Language Processing",
url = "https://www.aclweb.org/anthology/I17-1057",
pages = "566-575",
abstract = "Recurrent Neural Network models are the state-of-the-art for
Named Entity Recognition (NER). We present two innovations to
improve the performance of these models. The first innovation
is the introduction of residual connections between the
Stacked Recurrent Neural Network model to address the
degradation problem of deep neural networks. The second
innovation is a bias decoding mechanism that allows the
trained system to adapt to non-differentiable and externally
computed objectives, such as the entity-based F-measure. Our
work improves the state-of-the-art results for both Spanish
and English languages on the standard train/development/test
split of the CoNLL 2003 Shared Task NER dataset."
}
@article{wei-2016-disease-ner,
title = "Disease named entity recognition by combining conditional
random fields and bidirectional recurrent neural networks",
author = "Wei, Qikang and Chen, Tao and Xu, Ruifeng and He, Yulan and
Gui, Lin",
journal = "Database",
volume = 2016,
year = 2016,
publisher = "Oxford University Press"
}
@inproceedings{strubell-2017-id-cnn,
title = "Fast and Accurate Entity Recognition with Iterated Dilated
Convolutions",
author = "Strubell, Emma and Verga, Patrick and Belanger, David and
McCallum, Andrew",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1283",
doi = "10.18653/v1/D17-1283",
pages = "2670-2680",
abstract = "Today when many practitioners run basic NLP on the entire web
and large-volume traffic, faster methods are paramount to
saving time and energy costs. Recent advances in GPU hardware
have led to the emergence of bi-directional LSTMs as a
standard method for obtaining per-token vector
representations serving as input to labeling tasks such as
NER (often followed by prediction in a linear-chain
CRF). Though expressive and accurate, these models fail to
fully exploit GPU parallelism, limiting their computational
efficiency. This paper proposes a faster alternative to
Bi-LSTMs for NER: Iterated Dilated Convolutional Neural
Networks (ID-CNNs), which have better capacity than
traditional CNNs for large context and structured
prediction. Unlike LSTMs whose sequential processing on
sentences of length N requires O(N) time even in the face of
parallelism, ID-CNNs permit fixed-depth convolutions to run
in parallel across entire documents. We describe a distinct
combination of network structure, parameter sharing and
training procedures that enable dramatic 14-20x test-time
speedups while retaining accuracy comparable to the
Bi-LSTM-CRF. Moreover, ID-CNNs trained to aggregate context
from the entire document are more accurate than Bi-LSTM-CRFs
while attaining 8x faster test time speeds."
}
@inproceedings{lin-2017-multi-channel-bi-lstm-crf,
title = "Multi-channel {B}i{LSTM}-{CRF} Model for Emerging Named
Entity Recognition in Social Media",
author = "Lin, Bill Y. and Xu, Frank and Luo, Zhiyi and Zhu, Kenny",
booktitle = "Proceedings of the 3rd Workshop on Noisy User-generated Text",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-4421",
doi = "10.18653/v1/W17-4421",
pages = "160-165",
abstract = "In this paper, we present our multi-channel neural
architecture for recognizing emerging named entities in social
media messages, which we applied in the Novel and Emerging
Named Entity Recognition shared task at the EMNLP 2017
Workshop on Noisy User-generated Text (W-NUT). We propose a
novel approach, which incorporates comprehensive word
representations with multi-channel information and
Conditional Random Fields (CRF) into a traditional
Bidirectional Long Short-Term Memory (BiLSTM) neural network
without using any additional hand-crafted features such as
gazetteers. In comparison with other systems participating in
the shared task, our system won the 2nd place."
}
@inproceedings{ghaddar-2018-robust-lexical-features,
title = "Robust Lexical Features for Improved Neural Network
Named-Entity Recognition",
author = "Ghaddar, Abbas and Langlais, Phillippe",
booktitle = "Proceedings of the 27th International Conference on
Computational Linguistics",
month = aug,
year = 2018,
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1161",
pages = "1896-1907",
abstract = "Neural network approaches to Named-Entity Recognition reduce
the need for carefully hand-crafted features. While some
features do remain in state-of-the-art systems, lexical
features have been mostly discarded, with the exception of
gazetteers. In this work, we show that this is unfair:
lexical features are actually quite useful. We propose to
embed words and entity types into a low-dimensional vector
space we train from annotated data produced by distant
supervision thanks to Wikipedia. From this, we compute {---}
offline {---} a feature vector representing each word. When
used with a vanilla recurrent neural network model, this
representation yields substantial improvements. We establish
a new state-of-the-art F1 score of 87.95 on ONTONOTES 5.0,
while matching state-of-the-art performance with a F1 score
of 91.73 on the over-studied CONLL-2003 dataset."
}
@article{wu-2015-clinical-text-ner,
title = "Named entity recognition in Chinese clinical text using deep
neural network",
author = "Wu, Yonghui and Jiang, Min and Lei, Jianbo and Xu, Hua",
journal      = "Studies in Health Technology and Informatics",
volume = 216,
pages = 624,
year = 2015,
publisher = "NIH Public Access"
}
@incollection{zhou-2017-joint-extraction,
title = "Joint extraction of multiple relations and entities by using
a hybrid neural network",
author = "Zhou, Peng and Zheng, Suncong and Xu, Jiaming and Qi, Zhenyu
and Bao, Hongyun and Xu, Bo",
booktitle = "Chinese Computational Linguistics and Natural Language
Processing Based on Naturally Annotated Big Data",
pages = "135-146",
year = 2017,
publisher = "Springer"
}
@article{nguyen-2016-mention-detection-rnn,
author = "Thien Huu Nguyen and Avirup Sil and Georgiana Dinu and Radu
Florian",
title = "Toward Mention Detection Robustness with Recurrent Neural
Networks",
journal = "CoRR",
volume = "abs/1602.07749",
year = 2016,
url = "http://arxiv.org/abs/1602.07749",
archivePrefix= "arXiv",
eprint = "1602.07749",
timestamp = "Mon, 13 Aug 2018 16:48:51 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/NguyenSDF16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{zhai-2017-sequence-chunking,
author = "Feifei Zhai and Saloni Potdar and Bing Xiang and Bowen Zhou",
title = "Neural Models for Sequence Chunking",
journal = "CoRR",
volume = "abs/1701.04027",
year = 2017,
url = "http://arxiv.org/abs/1701.04027",
archivePrefix= "arXiv",
eprint = "1701.04027",
timestamp = "Mon, 13 Aug 2018 16:48:01 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ZhaiPXZ17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{zukov-gregoric-2018-ner-parallel-rnn,
title = "Named Entity Recognition With Parallel Recurrent Neural
Networks",
author = "{\v{Z}}ukov-Gregori{\v{c}}, Andrej and Bachrach, Yoram and
Coope, Sam",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-2012",
doi = "10.18653/v1/P18-2012",
pages = "69-74",
abstract = "We present a new architecture for named entity
recognition. Our model employs multiple independent
bidirectional LSTM units across the same input and promotes
diversity among them by employing an inter-model
regularization term. By distributing computation across
multiple smaller LSTMs we find a significant reduction in the
total number of parameters. We find our architecture achieves
state-of-the-art performance on the CoNLL 2003 NER dataset."
}
@inproceedings{rei-2017-semi-supervised-multitask,
title = "Semi-supervised Multitask Learning for Sequence Labeling",
author = "Rei, Marek",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2017,
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1194",
doi = "10.18653/v1/P17-1194",
pages = "2121-2130",
abstract = "We propose a sequence labeling framework with a secondary
training objective, learning to predict surrounding words for
every word in the dataset. This language modeling objective
incentivises the system to learn general-purpose patterns of
semantic and syntactic composition, which are also useful for
improving accuracy on different sequence labeling tasks. The
architecture was evaluated on a range of datasets, covering
the tasks of error detection in learner texts, named entity
recognition, chunking and POS-tagging. The novel language
modeling objective provided consistent performance
improvements on every benchmark, without requiring any
additional annotated or unannotated data."
}
@inproceedings{zhuo-2016-gated-recursive-semi-markov-crf,
title = "Segment-Level Sequence Modeling using Gated Recursive
Semi-{M}arkov Conditional Random Fields",
author = "Zhuo, Jingwei and Cao, Yong and Zhu, Jun and Zhang, Bo and
Nie, Zaiqing",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = 2016,
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P16-1134",
doi = "10.18653/v1/P16-1134",
pages = "1413-1423"
}
@inproceedings{ye-2018-hybrid-markov-crf,
title = "Hybrid semi-{M}arkov {CRF} for Neural Sequence Labeling",
author = "Ye, Zhixiu and Ling, Zhen-Hua",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-2038",
doi = "10.18653/v1/P18-2038",
pages = "235-240",
abstract = "This paper proposes hybrid semi-Markov conditional random
fields (SCRFs) for neural sequence labeling in natural
language processing. Based on conventional conditional random
fields (CRFs), SCRFs have been designed for the tasks of
assigning labels to segments by extracting features from and
describing transitions between segments instead of words. In
this paper, we improve the existing SCRF methods by employing
word-level and segment-level information
simultaneously. First, word-level labels are utilized to
derive the segment scores in SCRFs. Second, a CRF output
layer and an SCRF output layer are integrated into a unified
neural network and trained jointly. Experimental results on
CoNLL 2003 named entity recognition (NER) shared task show
that our model achieves state-of-the-art performance when no
external knowledge is used."
}
@inproceedings{aguilar-2017-multi-task-ner,
title = "A Multi-task Approach for Named Entity Recognition in Social
Media Data",
author = "Aguilar, Gustavo and Maharjan, Suraj and L{\'o}pez-Monroy,
Adrian Pastor and Solorio, Thamar",
booktitle = "Proceedings of the 3rd Workshop on Noisy User-generated Text",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-4419",
doi = "10.18653/v1/W17-4419",
pages = "148-153",
abstract = "Named Entity Recognition for social media data is challenging
because of its inherent noisiness. In addition to improper
grammatical structures, it contains spelling inconsistencies
and numerous informal abbreviations. We propose a novel
multi-task approach by employing a more general secondary
task of Named Entity (NE) segmentation together with the
primary task of fine-grained NE categorization. The
multi-task neural network architecture learns higher order
feature representations from word and character sequences
along with basic Part-of-Speech tags and gazetteer
information. This neural network acts as a feature extractor
to feed a Conditional Random Fields classifier. We were able
to obtain the first position in the 3rd Workshop on Noisy
User-generated Text (WNUT-2017) with a 41.86{\%} entity
F1-score and a 40.24{\%} surface F1-score."
}
@inproceedings{peng-2017-multi-task-sequence-tagging,
title = "Multi-task Domain Adaptation for Sequence Tagging",
author = "Peng, Nanyun and Dredze, Mark",
booktitle = "Proceedings of the 2nd Workshop on Representation Learning
for {NLP}",
month = aug,
year = 2017,
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-2612",
doi = "10.18653/v1/W17-2612",
pages = "91-100",
abstract = "Many domain adaptation approaches rely on learning cross
domain shared representations to transfer the knowledge
learned in one domain to other domains. Traditional domain
adaptation only considers adapting for one task. In this
paper, we explore multi-task representation learning under
the domain adaptation scenario. We propose a neural network
framework that supports domain adaptation for multiple tasks
simultaneously, and learns shared representations that better
generalize for domain adaptation. We apply the proposed
framework to domain adaptation for sequence tagging problems
considering two tasks: Chinese word segmentation and named
entity recognition. Experiments show that multi-task domain
adaptation works better than disjoint domain adaptation for
each task, and achieves the state-of-the-art results for both
tasks in the social media domain."
}
@article{pan-2013-transfer-joint-embedding,
title = "Transfer joint embedding for cross-domain named entity
recognition",
author = "Pan, Sinno Jialin and Toh, Zhiqiang and Su, Jian",
journal = "ACM Transactions on Information Systems (TOIS)",
volume = 31,
number = 2,
pages = 7,
year = 2013,
publisher = "ACM"
}
@inproceedings{qu-2016-ner-transfer-learning,
title = "Named Entity Recognition for Novel Types by Transfer
Learning",
author = "Qu, Lizhen and Ferraro, Gabriela and Zhou, Liyuan and Hou,
Weiwei and Baldwin, Timothy",
booktitle = "Proceedings of the 2016 Conference on Empirical Methods in
Natural Language Processing",
month = nov,
year = 2016,
address = "Austin, Texas",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D16-1087",
doi = "10.18653/v1/D16-1087",
pages = "899-905"
}
@article{yang-2017-transfer-learning-hierachical-rnn,
author = "Zhilin Yang and Ruslan Salakhutdinov and William W. Cohen",
title = "Transfer Learning for Sequence Tagging with Hierarchical
Recurrent Networks",
journal = "CoRR",
volume = "abs/1703.06345",
year = 2017,
url = "http://arxiv.org/abs/1703.06345",
archivePrefix= "arXiv",
eprint = "1703.06345",
timestamp = "Mon, 13 Aug 2018 16:48:14 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/YangSC17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{daeniken-2017-transfer-learning-ner,
title = "Transfer Learning and Sentence Level Features for Named
Entity Recognition on Tweets",
author = "von D{\"a}niken, Pius and Cieliebak, Mark",
booktitle = "Proceedings of the 3rd Workshop on Noisy User-generated Text",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-4422",
doi = "10.18653/v1/W17-4422",
pages = "166-171",
abstract = "We present our system for the WNUT 2017 Named Entity
Recognition challenge on Twitter data. We describe two
modifications of a basic neural network architecture for
sequence tagging. First, we show how we exploit additional
labeled data, where the Named Entity tags differ from the
target task. Then, we propose a way to incorporate sentence
level features. Our system uses both methods and ranked
second for entity level annotations, achieving an F1-score of
40.78, and second for surface form annotations, achieving an
F1-score of 39.33."
}
@inproceedings{zhao-2018-multi-task-data-selection,
title = "Improve Neural Entity Recognition via Multi-Task Data
Selection and Constrained Decoding",
author = "Zhao, Huasha and Yang, Yi and Zhang, Qiong and Si, Luo",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 2 (Short Papers)",
month = jun,
year = 2018,
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N18-2056",
doi = "10.18653/v1/N18-2056",
pages = "346-351",
abstract = "Entity recognition is a widely benchmarked task in natural
language processing due to its massive applications. The
state-of-the-art solution applies a neural architecture named
BiLSTM-CRF to model the language sequences. In this paper, we
propose an entity recognition system that improves this
neural architecture with two novel techniques. The first
technique is Multi-Task Data Selection, which ensures the
consistency of data distribution and labeling guidelines
between source and target datasets. The other one is
constrained decoding using knowledge base. The decoder of the
model operates at the document level, and leverages global
and external information sources to further improve
performance. Extensive experiments have been conducted to
show the advantages of each technique. Our system achieves
state-of-the-art results on the English entity recognition
task in KBP 2017 official evaluation, and it also yields very
strong results in other languages."
}
@inproceedings{lin-2018-neural-adaptation-layers,
title = "Neural Adaptation Layers for Cross-domain Named Entity
Recognition",
author = "Lin, Bill Yuchen and Lu, Wei",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1226",
doi = "10.18653/v1/D18-1226",
pages = "2012-2022",
abstract = "Recent research efforts have shown that neural architectures
can be effective in conventional information extraction tasks
such as named entity recognition, yielding state-of-the-art
results on standard newswire datasets. However, despite
significant resources required for training such models, the
performance of a model trained on one domain typically
degrades dramatically when applied to a different domain, yet
extracting entities from new emerging domains such as social
media can be of significant interest. In this paper, we
empirically investigate effective methods for conveniently
adapting an existing, well-trained neural NER model for a new
domain. Unlike existing approaches, we propose lightweight
yet effective methods for performing domain adaptation for
neural models. Specifically, we introduce adaptation layers
on top of existing neural architectures, where no re-training
using the source domain data is required. We conduct
extensive empirical studies and show that our approach
significantly outperforms state-of-the-art methods."
}
@article{shen-2017-deep-active-learning,
author = "Yanyao Shen and Hyokun Yun and Zachary C. Lipton and Yakov
Kronrod and Animashree Anandkumar",
title = "Deep Active Learning for Named Entity Recognition",
journal = "CoRR",
volume = "abs/1707.05928",
year = 2017,
url = "http://arxiv.org/abs/1707.05928",
archivePrefix= "arXiv",
eprint = "1707.05928",
timestamp = "Mon, 13 Aug 2018 16:47:29 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/ShenYLKA17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{narasimhan-2016-ie-reinforcement-learning,
author = "Karthik Narasimhan and Adam Yala and Regina Barzilay",
title = "Improving Information Extraction by Acquiring External
Evidence with Reinforcement Learning",
journal = "CoRR",
volume = "abs/1603.07954",
year = 2016,
url = "http://arxiv.org/abs/1603.07954",
archivePrefix= "arXiv",
eprint = "1603.07954",
timestamp = "Mon, 13 Aug 2018 16:48:30 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/NarasimhanYB16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{zhou-2019-datnet,
title = "Dual Adversarial Neural Transfer for Low-Resource Named
Entity Recognition",
author = "Zhou, Joey Tianyi and Zhang, Hao and Jin, Di and Zhu,
Hongyuan and Fang, Meng and Goh, Rick Siow Mong and Kwok,
Kenneth",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1336",
doi = "10.18653/v1/P19-1336",
pages = "3461-3471",
abstract = "We propose a new neural transfer method termed Dual
Adversarial Transfer Network (DATNet) for addressing
low-resource Named Entity Recognition (NER). Specifically,
two variants of DATNet, i.e., DATNet-F and DATNet-P, are
investigated to explore effective feature fusion between high
and low resource. To address the noisy and imbalanced
training data, we propose a novel Generalized
Resource-Adversarial Discriminator (GRAD). Additionally,
adversarial training is adopted to boost model
generalization. In experiments, we examine the effects of
different components in DATNet across domains and languages
and show that significant improvement can be obtained
especially for low-resource data, without augmenting any
additional hand-crafted features and pre-trained language
model."
}
@inproceedings{zukov-gregoric-2017-ner-self-attention,
title = "Neural named entity recognition using a self-attention
mechanism",
author = "Zukov-Gregoric, Andrej and Bachrach, Yoram and Minkovsky,
Pasha and Coope, Sam and Maksak, Bogdan",
booktitle = "2017 IEEE 29th International Conference on Tools with
Artificial Intelligence (ICTAI)",
pages = "652-656",
year = 2017,
organization = "IEEE"
}
@inproceedings{xu-2018-ner-global-attention,
title = "Improving clinical named entity recognition with global
neural attention",
author = "Xu, Guohai and Wang, Chengyu and He, Xiaofeng",
booktitle = "Asia-Pacific Web (APWeb) and Web-Age Information Management
(WAIM) Joint International Conference on Web and Big Data",
pages = "264-279",
year = 2018,
organization = "Springer"
}
@article{li-2018-survey-nlp,
author = "Jing Li and Aixin Sun and Jianglei Han and Chenliang Li",
title = "A Survey on Deep Learning for Named Entity Recognition",
journal = "CoRR",
volume = "abs/1812.09449",
year = 2018,
url = "http://arxiv.org/abs/1812.09449",
archivePrefix= "arXiv",
eprint = "1812.09449",
timestamp = "Mon, 28 Jan 2019 16:41:27 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1812-09449",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{zhu-2019-can-ner,
title = "{CAN}-{NER}: {C}onvolutional {A}ttention {N}etwork for
{C}hinese {N}amed {E}ntity {R}ecognition",
author = "Zhu, Yuying and Wang, Guoxin",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long and Short
Papers)",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N19-1342",
doi = "10.18653/v1/N19-1342",
pages = "3384-3393",
abstract = "Named entity recognition (NER) in Chinese is essential but
difficult because of the lack of natural
delimiters. Therefore, Chinese Word Segmentation (CWS) is
usually considered as the first step for Chinese
NER. However, models based on word-level embeddings and
lexicon features often suffer from segmentation errors and
out-of-vocabulary (OOV) words. In this paper, we investigate
a Convolutional Attention Network called CAN for Chinese NER,
which consists of a character-based convolutional neural
network (CNN) with local-attention layer and a gated
recurrent unit (GRU) with global self-attention layer to
capture the information from adjacent characters and sentence
contexts. Also, compared to other models, not depending on
any external resources like lexicons and employing small size
of char embeddings make our model more practical. Extensive
experimental results show that our approach outperforms
state-of-the-art methods without word embedding and external
lexicon resources on different domain datasets including
Weibo, MSRA and Chinese Resume NER dataset."
}
@inproceedings{guan-2019-bert-lstm-crf,
title = "New Research on Transfer Learning Model of Named Entity
Recognition",
author = "Guan, Guoliang and Zhu, Min",
booktitle = "Journal of Physics: Conference Series",
volume = 1267,
number = 1,
pages = "012017",
year = 2019,
organization = "IOP Publishing"
}
@inproceedings{arkhipov-2019-multilingual-transforms,
title = "Tuning multilingual transformers for language-specific named
entity recognition",
author = "Arkhipov, Mikhail and Trofimova, Maria and Kuratov, Yurii and
Sorokin, Alexey",
booktitle = "Proceedings of the 7th Workshop on Balto-Slavic Natural
Language Processing",
pages = "89-93",
year = 2019
}
@ARTICLE{zadeh-2019-fmt,
author = "{Zadeh}, Amir and {Mao}, Chengfeng and {Shi}, Kelly and
{Zhang}, Yiwei and {Liang}, Paul Pu and {Poria}, Soujanya and
{Morency}, Louis-Philippe",
title = "{Factorized Multimodal Transformer for Multimodal Sequential
Learning}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Computation and Language, Statistics - Machine Learning",
year = 2019,
month = "Nov",
eid = "arXiv:1911.09826",
pages = "arXiv:1911.09826",
archivePrefix= "arXiv",
eprint = "1911.09826",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv191109826Z",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{liu-2017-lm-lstm-crf,
author = "Liyuan Liu and Jingbo Shang and Frank F. Xu and Xiang Ren and
Huan Gui and Jian Peng and Jiawei Han",
title = "Empower Sequence Labeling with Task-Aware Neural Language
Model",
journal = "CoRR",
volume = "abs/1709.04109",
year = 2017,
url = "http://arxiv.org/abs/1709.04109",
archivePrefix= "arXiv",
eprint = "1709.04109",
timestamp = "Mon, 13 Aug 2018 16:47:53 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1709-04109",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{liu-2018-lm-pruning,
title = "Efficient Contextualized Representation: Language Model
Pruning for Sequence Labeling",
author = "Liu, Liyuan and Ren, Xiang and Shang, Jingbo and Gu, Xiaotao
and Peng, Jian and Han, Jiawei",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1153",
doi = "10.18653/v1/D18-1153",
pages = "1215-1225",
abstract = "Many efforts have been made to facilitate natural language
processing tasks with pre-trained language models (LMs), and
brought significant improvements to various applications. To
fully leverage the nearly unlimited corpora and capture
linguistic information of multifarious levels, large-size LMs
are required; but for a specific task, only parts of these
information are useful. Such large-sized LMs, even in the
inference stage, may cause heavy computation workloads,
making them too time-consuming for large-scale
applications. Here we propose to compress bulky LMs while
preserving useful information with regard to a specific
task. As different layers of the model keep different
information, we develop a layer selection method for model
pruning using sparsity-inducing regularization. By
introducing the dense connectivity, we can detach any layer
without affecting others, and stretch shallow and wide LMs to
be deep and narrow. In model training, LMs are learned with
layer-wise dropouts for better robustness. Experiments on two
benchmark datasets demonstrate the effectiveness of our
method."
}
@article{liu-2018-non-local-nn,
author = "Pengfei Liu and Shuaichen Chang and Xuanjing Huang and Jian
Tang and Jackie Chi Kit Cheung",
title = "Contextualized Non-local Neural Networks for Sequence
Learning",
journal = "CoRR",
volume = "abs/1811.08600",
year = 2018,
url = "http://arxiv.org/abs/1811.08600",
archivePrefix= "arXiv",
eprint = "1811.08600",
timestamp = "Mon, 26 Nov 2018 12:52:45 +0100",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1811-08600",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{chen-2019-grn,
author = "Hui Chen and Zijia Lin and Guiguang Ding and Jianguang Lou
and Yusen Zhang and B{\"{o}}rje Karlsson",
title = "{GRN:} Gated Relation Network to Enhance Convolutional Neural
Network for Named Entity Recognition",
journal = "CoRR",
volume = "abs/1907.05611",
year = 2019,
url = "http://arxiv.org/abs/1907.05611",
archivePrefix= "arXiv",
eprint = "1907.05611",
timestamp = "Thu, 10 Oct 2019 11:51:45 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1907-05611",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{guo-2019-star-transformer,
author = "Qipeng Guo and Xipeng Qiu and Pengfei Liu and Yunfan Shao and
Xiangyang Xue and Zheng Zhang",
title = "Star-Transformer",
journal = "CoRR",
volume = "abs/1902.09113",
year = 2019,
url = "http://arxiv.org/abs/1902.09113",
archivePrefix= "arXiv",
eprint = "1902.09113",
timestamp = "Tue, 21 May 2019 18:03:39 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1902-09113",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{yan-2019-tener,
author = "{Yan}, Hang and {Deng}, Bocao and {Li}, Xiaonan and {Qiu},
Xipeng",
title = "{TENER: Adapting Transformer Encoder for Named Entity
Recognition}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Machine Learning",
year = 2019,
month = "Nov",
eid = "arXiv:1911.04474",
pages = "arXiv:1911.04474",
archivePrefix= "arXiv",
eprint = "1911.04474",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv191104474Y",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{xu-2020-cluener,
author = "{Xu}, Liang and {tong}, Yu and {Dong}, Qianqian and {Liao},
Yixuan and {Yu}, Cong and {Tian}, Yin and {Liu}, Weitang and
{Li}, Lu and {Liu}, Caiquan and {Zhang}, Xuanwei",
title = "{CLUENER2020: Fine-grained Named Entity Recognition Dataset
and Benchmark for Chinese}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Information Retrieval, Computer Science - Machine Learning",
year = 2020,
month = "Jan",
eid = "arXiv:2001.04351",
pages = "arXiv:2001.04351",
archivePrefix= "arXiv",
eprint = "2001.04351",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2020arXiv200104351X",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{crichton-2017-multi-task-bio-ner,
title = "A neural network multi-task learning approach to biomedical
named entity recognition",
author = "Crichton, Gamal and Pyysalo, Sampo and Chiu, Billy and
Korhonen, Anna",
journal = "BMC bioinformatics",
volume = 18,
number = 1,
pages = 368,
year = 2017,
publisher = "BioMed Central"
}
@ARTICLE{li-2015-ggs-nn,
author = "{Li}, Yujia and {Tarlow}, Daniel and {Brockschmidt}, Marc and
{Zemel}, Richard",
title = "{Gated Graph Sequence Neural Networks}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Artificial Intelligence, Computer Science - Neural and
Evolutionary Computing, Statistics - Machine Learning",
year = 2015,
month = "Nov",
eid = "arXiv:1511.05493",
pages = "arXiv:1511.05493",
archivePrefix= "arXiv",
eprint = "1511.05493",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2015arXiv151105493L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{battaglia-2018-gcn,
author = "{Battaglia}, Peter W. and {Hamrick}, Jessica B. and {Bapst},
Victor and {Sanchez-Gonzalez}, Alvaro and {Zambaldi},
Vinicius and {Malinowski}, Mateusz and {Tacchetti}, Andrea
and {Raposo}, David and {Santoro}, Adam and {Faulkner}, Ryan
and {Gulcehre}, Caglar and {Song}, Francis and {Ballard},
Andrew and {Gilmer}, Justin and {Dahl}, George and {Vaswani},
Ashish and {Allen}, Kelsey and {Nash}, Charles and
{Langston}, Victoria and {Dyer}, Chris and {Heess}, Nicolas
and {Wierstra}, Daan and {Kohli}, Pushmeet and {Botvinick},
Matt and {Vinyals}, Oriol and {Li}, Yujia and {Pascanu},
Razvan",
title = "{Relational inductive biases, deep learning, and graph
networks}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Artificial Intelligence, Statistics - Machine Learning",
year = 2018,
month = "Jun",
eid = "arXiv:1806.01261",
pages = "arXiv:1806.01261",
archivePrefix= "arXiv",
eprint = "1806.01261",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2018arXiv180601261B",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{limsopatham-2016-bi-lstm-twitter,
title = "Bidirectional {LSTM} for Named Entity Recognition in Twitter
Messages",
author = "Limsopatham, Nut and Collier, Nigel",
booktitle = "Proceedings of the 2nd Workshop on Noisy User-generated Text
({WNUT})",
month = dec,
year = 2016,
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://www.aclweb.org/anthology/W16-3920",
pages = "145-152",
abstract = "In this paper, we present our approach for named entity
recognition in Twitter messages that we used in our
participation in the Named Entity Recognition in Twitter
shared task at the COLING 2016 Workshop on Noisy
User-generated text (WNUT). The main challenge that we aim to
tackle in our participation is the short, noisy and
colloquial nature of tweets, which makes named entity
recognition in Twitter message a challenging task. In
particular, we investigate an approach for dealing with this
problem by enabling bidirectional long short-term memory
(LSTM) to automatically learn orthographic features without
requiring feature engineering. In comparison with other
systems participating in the shared task, our system achieved
the most effective performance on both the {`}segmentation
and categorisation{'} and the {`}segmentation only{'}
sub-tasks."
}
@incollection{sarawagi-2005-scrf,
title = "Semi-Markov Conditional Random Fields for Information
Extraction",
author = "Sunita Sarawagi and Cohen, William W",
booktitle = "Advances in Neural Information Processing Systems 17",
editor = "L. K. Saul and Y. Weiss and L. Bottou",
pages = "1185-1192",
year = 2005,
publisher = "MIT Press",
url =
"http://papers.nips.cc/paper/2648-semi-markov-conditional-random-fields-for-information-extraction.pdf"
}
@article{nadeau-2007-survey-ner,
title = "A survey of named entity recognition and classification",
author = "Nadeau, David and Sekine, Satoshi",
journal = "Lingvisticae Investigationes",
volume = 30,
number = 1,
pages = "3-26",
year = 2007,
publisher = "John Benjamins"
}
@article{夏光辉-2015-基于实体词典与机器学习的基因命名实体识别,
title = "基于实体词典与机器学习的基因命名实体识别",
author = "夏光辉 and 李军莲 and 阮学平",
journal = "医学信息学杂志",
number = 12,
pages = "54-60",
year = 2015
}
@inproceedings{wu-2018-eval-sl-features,
title = "Evaluating the Utility of Hand-crafted Features in Sequence
Labelling",
author = "Wu, Minghao and Liu, Fei and Cohn, Trevor",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1310",
doi = "10.18653/v1/D18-1310",
pages = "2850-2856",
abstract = "Conventional wisdom is that hand-crafted features are
redundant for deep learning models, as they already learn
adequate representations of text automatically from
corpora. In this work, we test this claim by proposing a new
method for exploiting handcrafted features as part of a novel
hybrid learning approach, incorporating a feature
auto-encoder loss component. We evaluate on the task of named
entity recognition (NER), where we show that including manual
features for part-of-speech, word shapes and gazetteers can
improve the performance of a neural CRF model. We obtain an
F1 of 91.89 for the CoNLL-2003 English shared task, which
significantly outperforms a collection of highly competitive
baseline models. We also present an ablation study showing
the importance of auto-encoding, over using features as
either inputs or outputs alone, and moreover, show including
the autoencoder components reduces training requirements to
60{\%}, while retaining the same predictive accuracy."
}
@inproceedings{zhang-2018-adapt-co-attention-ner,
title = "Adaptive co-attention network for named entity recognition in
tweets",
author = "Zhang, Qi and Fu, Jinlan and Liu, Xiaoyu and Huang, Xuanjing",
booktitle = "Thirty-Second AAAI Conference on Artificial Intelligence",
year = 2018
}
@inproceedings{greenberg-2018-disjoint-label-sets-ner,
title = "Marginal Likelihood Training of {B}i{LSTM}-{CRF} for
Biomedical Named Entity Recognition from Disjoint Label Sets",
author = "Greenberg, Nathan and Bansal, Trapit and Verga, Patrick and
McCallum, Andrew",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1306",
doi = "10.18653/v1/D18-1306",
pages = "2824-2829",
abstract = "Extracting typed entity mentions from text is a fundamental
component to language understanding and reasoning. While
there exist substantial labeled text datasets for multiple
subsets of biomedical entity types{---}such as genes and
proteins, or chemicals and diseases{---}it is rare to find
large labeled datasets containing labels for all desired
entity types together. This paper presents a method for
training a single CRF extractor from multiple datasets with
disjoint or partially overlapping sets of entity types. Our
approach employs marginal likelihood training to insist on
labels that are present in the data, while filling in
{``}missing labels{''}. This allows us to leverage all the
available data within a single model. In experimental results
on the Biocreative V CDR (chemicals/diseases), Biocreative VI
ChemProt (chemicals/proteins) and MedMentions (19 entity
types) datasets, we show that joint training on multiple
datasets improves NER F1 over training in isolation, and our
methods achieve state-of-the-art results."
}
@inproceedings{cao-2018-adv-ner,
title = "Adversarial Transfer Learning for {C}hinese Named Entity
Recognition with Self-Attention Mechanism",
author = "Cao, Pengfei and Chen, Yubo and Liu, Kang and Zhao, Jun and
Liu, Shengping",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1017",
doi = "10.18653/v1/D18-1017",
pages = "182-192",
abstract = "Named entity recognition (NER) is an important task in
natural language processing area, which needs to determine
entities boundaries and classify them into pre-defined
categories. For Chinese NER task, there is only a very small
amount of annotated data available. Chinese NER task and
Chinese word segmentation (CWS) task have many similar word
boundaries. There are also specificities in each
task. However, existing methods for Chinese NER either do not
exploit word boundary information from CWS or cannot filter
the specific information of CWS. In this paper, we propose a
novel adversarial transfer learning framework to make full
use of task-shared boundaries information and prevent the
task-specific features of CWS. Besides, since arbitrary
character can provide important cues when predicting entity
type, we exploit self-attention to explicitly capture long
range dependencies between two tokens. Experimental results
on two different widely used datasets show that our proposed
model significantly and consistently outperforms other
state-of-the-art methods."
}
@inproceedings{yu-2018-char-lm-ner,
title = "On the Strength of Character Language Models for Multilingual
Named Entity Recognition",
author = "Yu, Xiaodong and Mayhew, Stephen and Sammons, Mark and Roth,
Dan",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1345",
doi = "10.18653/v1/D18-1345",
pages = "3073-3077",
abstract = "Character-level patterns have been widely used as features in
English Named Entity Recognition (NER) systems. However, to
date there has been no direct investigation of the inherent
differences between name and non-name tokens in text, nor
whether this property holds across multiple languages. This
paper analyzes the capabilities of corpus-agnostic
Character-level Language Models (CLMs) in the binary task of
distinguishing name tokens from non-name tokens. We
demonstrate that CLMs provide a simple and powerful model for
capturing these differences, identifying named entity tokens
in a diverse set of languages at close to the performance of
full NER systems. Moreover, by adding very simple CLM-based
features we can significantly improve the performance of an
off-the-shelf NER system for multiple languages."
}
@article{savarese-2016-residual-gates,
author = "Pedro H. P. Savarese",
title = "Learning Identity Mappings with Residual Gates",
journal = "CoRR",
volume = "abs/1611.01260",
year = 2016,
url = "http://arxiv.org/abs/1611.01260",
archivePrefix= "arXiv",
eprint = "1611.01260",
timestamp = "Mon, 13 Aug 2018 16:48:22 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/Savarese16",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{weiss-2016-survey-transfer-learning,
title = "A survey of transfer learning",
author = "Weiss, Karl and Khoshgoftaar, Taghi M and Wang, DingDing",
journal = "Journal of Big data",
volume = 3,
number = 1,
pages = 9,
year = 2016,
publisher = "SpringerOpen"
}
@ARTICLE{thulasidasan-2019-dac-loss,
author = "{Thulasidasan}, Sunil and {Bhattacharya}, Tanmoy and
{Bilmes}, Jeff and {Chennupati}, Gopinath and {Mohd-Yusof},
Jamal",
title = "{Combating Label Noise in Deep Learning Using Abstention}",
journal = "arXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Machine
Learning",
year = 2019,
month = may,
eid = "arXiv:1905.10964",
pages = "arXiv:1905.10964",
archivePrefix= "arXiv",
eprint = "1905.10964",
primaryClass = "stat.ML",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190510964T",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{lin-2017-focal-loss,
author = "Tsung{-}Yi Lin and Priya Goyal and Ross B. Girshick and
Kaiming He and Piotr Doll{\'{a}}r",
title = "Focal Loss for Dense Object Detection",
journal = "CoRR",
volume = "abs/1708.02002",
year = 2017,
url = "http://arxiv.org/abs/1708.02002",
archivePrefix= "arXiv",
eprint = "1708.02002",
timestamp = "Mon, 13 Aug 2018 16:46:12 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1708-02002.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{shrivastava-2016-ohem,
author = "{Shrivastava}, Abhinav and {Gupta}, Abhinav and {Girshick},
Ross",
title = "{Training Region-based Object Detectors with Online Hard
Example Mining}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computer Vision and Pattern Recognition,
Computer Science - Machine Learning",
year = 2016,
month = apr,
eid = "arXiv:1604.03540",
pages = "arXiv:1604.03540",
archivePrefix= "arXiv",
eprint = "1604.03540",
primaryClass = "cs.CV",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160403540S",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{zhou-2017-east,
author = "Xinyu Zhou and Cong Yao and He Wen and Yuzhi Wang and
Shuchang Zhou and Weiran He and Jiajun Liang",
title = "{EAST:} An Efficient and Accurate Scene Text Detector",
journal = "CoRR",
volume = "abs/1704.03155",
year = 2017,
url = "http://arxiv.org/abs/1704.03155",
archivePrefix= "arXiv",
eprint = "1704.03155",
timestamp = "Mon, 13 Aug 2018 16:48:38 +0200",
biburl = "https://dblp.org/rec/journals/corr/ZhouYWWZHL17.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{xie-2015-bce-loss,
author = "Saining Xie and Zhuowen Tu",
title = "Holistically-Nested Edge Detection",
journal = "CoRR",
volume = "abs/1504.06375",
year = 2015,
url = "http://arxiv.org/abs/1504.06375",
archivePrefix= "arXiv",
eprint = "1504.06375",
timestamp = "Mon, 13 Aug 2018 16:46:00 +0200",
biburl = "https://dblp.org/rec/journals/corr/XieT15.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{sokolova-2009-measure-analysis,
title = "A systematic analysis of performance measures for
classification tasks",
journal = "Information Processing \& Management",
volume = 45,
number = 4,
pages = "427-437",
year = 2009,
issn = "0306-4573",
doi = "https://doi.org/10.1016/j.ipm.2009.03.002",
url =
"http://www.sciencedirect.com/science/article/pii/S0306457309000259",
author = "Marina Sokolova and Guy Lapalme",
keywords = "Performance evaluation, Machine Learning, Text
classification",
abstract = "This paper presents a systematic analysis of twenty four
performance measures used in the complete spectrum of Machine
Learning classification tasks, i.e., binary, multi-class,
multi-labelled, and hierarchical. For each classification
task, the study relates a set of changes in a confusion
matrix to specific characteristics of data. Then the analysis
concentrates on the type of changes to a confusion matrix
that do not change a measure, therefore, preserve a
classifier’s evaluation (measure invariance). The result is
the measure invariance taxonomy with respect to all relevant
label distribution changes in a classification problem. This
formal analysis is supported by examples of applications
where invariance properties of measures lead to a more
reliable evaluation of classifiers. Text classification
supplements the discussion with several case studies."
}
@inproceedings{yang-2018-sgm,
title = "{SGM}: Sequence Generation Model for Multi-label
Classification",
author = "Yang, Pengcheng and Sun, Xu and Li, Wei and Ma, Shuming and
Wu, Wei and Wang, Houfeng",
booktitle = "Proceedings of the 27th International Conference on
Computational Linguistics",
month = aug,
year = 2018,
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1330",
pages = "3915-3926",
abstract = "Multi-label classification is an important yet challenging
task in natural language processing. It is more complex than
single-label classification in that the labels tend to be
correlated. Existing methods tend to ignore the correlations
between labels. Besides, different parts of the text can
contribute differently for predicting different labels, which
is not considered by existing models. In this paper, we
propose to view the multi-label classification task as a
sequence generation problem, and apply a sequence generation
model with a novel decoder structure to solve it. Extensive
experimental results show that our proposed methods
outperform previous work by a substantial margin. Further
analysis of experimental results demonstrates that the
proposed methods not only capture the correlations between
labels, but also select the most informative words
automatically when predicting different labels."
}
@article{tay-2018-cafe,
author = "Yi Tay and Luu Anh Tuan and Siu Cheung Hui",
title = "A Compare-Propagate Architecture with Alignment Factorization
for Natural Language Inference",
journal = "CoRR",
volume = "abs/1801.00102",
year = 2018,
url = "http://arxiv.org/abs/1801.00102",
archivePrefix= "arXiv",
eprint = "1801.00102",
timestamp = "Mon, 13 Aug 2018 16:47:31 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1801-00102.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{lan-2019-albert,
author = "{Lan}, Zhenzhong and {Chen}, Mingda and {Goodman}, Sebastian
and {Gimpel}, Kevin and {Sharma}, Piyush and {Soricut}, Radu",
title = "{ALBERT: A Lite BERT for Self-supervised Learning of Language
Representations}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence",
year = 2019,
month = sep,
eid = "arXiv:1909.11942",
pages = "arXiv:1909.11942",
archivePrefix= "arXiv",
eprint = "1909.11942",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190911942L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{jiao-2019-tinybert,
author = "{Jiao}, Xiaoqi and {Yin}, Yichun and {Shang}, Lifeng and
{Jiang}, Xin and {Chen}, Xiao and {Li}, Linlin and {Wang},
Fang and {Liu}, Qun",
title = "{TinyBERT: Distilling BERT for Natural Language
Understanding}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Machine
Learning",
year = 2019,
month = sep,
eid = "arXiv:1909.10351",
pages = "arXiv:1909.10351",
archivePrefix= "arXiv",
eprint = "1909.10351",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190910351J",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{joshi-2019-spanbert,
author = "Mandar Joshi and Danqi Chen and Yinhan Liu and Daniel S. Weld
and Luke Zettlemoyer and Omer Levy",
title = "SpanBERT: Improving Pre-training by Representing and
Predicting Spans",
journal = "CoRR",
volume = "abs/1907.10529",
year = 2019,
url = "http://arxiv.org/abs/1907.10529",
archivePrefix= "arXiv",
eprint = "1907.10529",
timestamp = "Thu, 01 Aug 2019 08:59:33 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1907-10529.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@misc{radford-2019-gpt2,
title = {Language Models are Unsupervised Multitask Learners},
author = {Alec Radford and Jeffrey Wu and Rewon Child and David Luan
and Dario Amodei and Ilya Sutskever},
note = {OpenAI Technical Report},
year = 2019
}
@article{boutell-2004-binary-relevance,
title = "Learning multi-label scene classification",
journal = "Pattern Recognition",
volume = 37,
number = 9,
pages = "1757-1771",
year = 2004,
issn = "0031-3203",
doi = "https://doi.org/10.1016/j.patcog.2004.03.009",
url =
"http://www.sciencedirect.com/science/article/pii/S0031320304001074",
author = "Matthew R. Boutell and Jiebo Luo and Xipeng Shen and
Christopher M. Brown",
keywords = "Image understanding, Semantic scene classification,
Multi-label classification, Multi-label training, Multi-label
evaluation, Image organization, Cross-training, Jaccard
similarity",
abstract = "In classic pattern recognition problems, classes are mutually
exclusive by definition. Classification errors occur when the
classes overlap in the feature space. We examine a different
situation, occurring when the classes are, by definition, not
mutually exclusive. Such problems arise in semantic scene and
document classification and in medical diagnosis. We present
a framework to handle such problems and apply it to the
problem of semantic scene classification, where a natural
scene may contain multiple objects such that the scene can be
described by multiple class labels (e.g., a field scene with
a mountain in the background). Such a problem poses
challenges to the classic pattern recognition paradigm and
demands a different treatment. We discuss approaches for
training and testing in this scenario and introduce new
metrics for evaluating individual examples, class recall and
precision, and overall accuracy. Experiments show that our
methods are suitable for scene classification; furthermore,
our work appears to generalize to other classification
problems of the same nature."
}
@inproceedings{read-2009-classifier-chains,
title = "Classifier chains for multi-label classification",
author = "Read, Jesse and Pfahringer, Bernhard and Holmes, Geoff and
Frank, Eibe",
booktitle = "Joint European Conference on Machine Learning and Knowledge
Discovery in Databases",
pages = "254-269",
year = 2009,
organization = "Springer"
}
@inproceedings{zhang-2019-ernie,
title = "{ERNIE}: Enhanced Language Representation with Informative
Entities",
author = "Zhang, Zhengyan and Han, Xu and Liu, Zhiyuan and Jiang, Xin
and Sun, Maosong and Liu, Qun",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1139",
doi = "10.18653/v1/P19-1139",
pages = "1441-1451",
abstract = "Neural language representation models such as BERT
pre-trained on large-scale corpora can well capture rich
semantic patterns from plain text, and be fine-tuned to
consistently improve the performance of various NLP
tasks. However, the existing pre-trained language models
rarely consider incorporating knowledge graphs (KGs), which
can provide rich structured knowledge facts for better
language understanding. We argue that informative entities in
KGs can enhance language representation with external
knowledge. In this paper, we utilize both large-scale textual
corpora and KGs to train an enhanced language representation
model (ERNIE), which can take full advantage of lexical,
syntactic, and knowledge information simultaneously. The
experimental results have demonstrated that ERNIE achieves
significant improvements on various knowledge-driven tasks,
and meanwhile is comparable with the state-of-the-art model
BERT on other common NLP tasks. The code and datasets will be
available in the future."
}
@article{sun-2019-ernie2,
author = "Yu Sun and Shuohuan Wang and Yu{-}Kun Li and Shikun Feng and
Hao Tian and Hua Wu and Haifeng Wang",
title = "{ERNIE} 2.0: {A} Continual Pre-training Framework for
Language Understanding",
journal = "CoRR",
volume = "abs/1907.12412",
year = 2019,
url = "http://arxiv.org/abs/1907.12412",
archivePrefix= "arXiv",
eprint = "1907.12412",
timestamp = "Tue, 21 Jan 2020 07:56:31 +0100",
biburl = "https://dblp.org/rec/journals/corr/abs-1907-12412.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{dong-2019-unilm,
author = "Li Dong and Nan Yang and Wenhui Wang and Furu Wei and
Xiaodong Liu and Yu Wang and Jianfeng Gao and Ming Zhou and
Hsiao{-}Wuen Hon",
title = "Unified Language Model Pre-training for Natural Language
Understanding and Generation",
journal = "CoRR",
volume = "abs/1905.03197",
year = 2019,
url = "http://arxiv.org/abs/1905.03197",
archivePrefix= "arXiv",
eprint = "1905.03197",
timestamp = "Wed, 19 Feb 2020 17:11:34 +0100",
biburl = "https://dblp.org/rec/journals/corr/abs-1905-03197.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{clark-2019-electra,
title = "ELECTRA: Pre-training Text Encoders as Discriminators Rather
Than Generators",
author = "Clark, Kevin and Luong, Minh-Thang and Le, Quoc V and
Manning, Christopher D",
booktitle = "International Conference on Learning Representations",
year = 2019
}
@article{liu-2019-mt-dnn,
author = "Xiaodong Liu and Pengcheng He and Weizhu Chen and Jianfeng
Gao",
title = "Multi-Task Deep Neural Networks for Natural Language
Understanding",
journal = "CoRR",
volume = "abs/1901.11504",
year = 2019,
url = "http://arxiv.org/abs/1901.11504",
archivePrefix= "arXiv",
eprint = "1901.11504",
timestamp = "Mon, 04 Feb 2019 08:11:03 +0100",
biburl = "https://dblp.org/rec/journals/corr/abs-1901-11504.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{yang-2019-xlnet,
author = "Zhilin Yang and Zihang Dai and Yiming Yang and Jaime
G. Carbonell and Ruslan Salakhutdinov and Quoc V. Le",
title = "XLNet: Generalized Autoregressive Pretraining for Language
Understanding",
journal = "CoRR",
volume = "abs/1906.08237",
year = 2019,
url = "http://arxiv.org/abs/1906.08237",
archivePrefix= "arXiv",
eprint = "1906.08237",
timestamp = "Mon, 24 Jun 2019 17:28:45 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1906-08237.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{cui-2019-bert-wwm,
author = "{Cui}, Yiming and {Che}, Wanxiang and {Liu}, Ting and {Qin},
Bing and {Yang}, Ziqing and {Wang}, Shijin and {Hu}, Guoping",
title = "{Pre-Training with Whole Word Masking for Chinese BERT}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Machine Learning",
year = 2019,
month = jun,
eid = "arXiv:1906.08101",
pages = "arXiv:1906.08101",
archivePrefix= "arXiv",
eprint = "1906.08101",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190608101C",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{socher-2011-rnn,
title = "Parsing natural scenes and natural language with recursive
neural networks",
author = "Socher, Richard and Lin, Cliff C and Manning, Chris and Ng,
Andrew Y",
booktitle = "Proceedings of the 28th international conference on machine
learning (ICML-11)",
pages = "129-136",
year = 2011
}
@inproceedings{socher-2013-sentiment-treebank,
title = "Recursive Deep Models for Semantic Compositionality Over a
Sentiment Treebank",
author = "Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang,
Jason and Manning, Christopher D. and Ng, Andrew and Potts,
Christopher",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in
Natural Language Processing",
month = oct,
year = 2013,
address = "Seattle, Washington, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D13-1170",
pages = "1631-1642"
}
@article{pollack-1990-raam,
title = "Recursive distributed representations",
journal = "Artificial Intelligence",
volume = 46,
number = 1,
pages = "77-105",
year = 1990,
issn = "0004-3702",
doi = "https://doi.org/10.1016/0004-3702(90)90005-K",
url =
"http://www.sciencedirect.com/science/article/pii/000437029090005K",
author = "Jordan B. Pollack",
abstract = "A longstanding difficulty for connectionist modeling has been
how to represent variable-sized recursive data structures,
such as trees and lists, in fixed-width patterns. This paper
presents a connectionist architecture which automatically
develops compact distributed representations for such
compositional structures, as well as efficient accessing
mechanisms for them. Patterns which stand for the internal
nodes of fixed-valence trees are devised through the
recursive use of backpropagation on three-layer
auto-associative encoder networks. The resulting
representations are novel, in that they combine apparently
immiscible aspects of features, pointers, and symbol
structures. They form a bridge between the data structures
necessary for high-level cognitive tasks and the associative,
pattern recognition machinery provided by neural networks."
}
@inproceedings{shen-2018-straight-tree,
title = "Straight to the Tree: Constituency Parsing with Neural
Syntactic Distance",
author = "Shen, Yikang and Lin, Zhouhan and Jacob, Athul Paul and
Sordoni, Alessandro and Courville, Aaron and Bengio, Yoshua",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1108",
doi = "10.18653/v1/P18-1108",
pages = "1171-1180",
abstract = "In this work, we propose a novel constituency parsing
scheme. The model first predicts a real-valued scalar, named
syntactic distance, for each split position in the
sentence. The topology of grammar tree is then determined by
the values of syntactic distances. Compared to traditional
shift-reduce parsing schemes, our approach is free from the
potentially disastrous compounding error. It is also easier
to parallelize and much faster. Our model achieves the
state-of-the-art single model F1 score of 92.1 on PTB and
86.4 on CTB dataset, which surpasses the previous single
model results by a large margin."
}
@inproceedings{socher-2012-mv-rnn,
title = "Semantic Compositionality through Recursive Matrix-Vector
Spaces",
author = "Socher, Richard and Huval, Brody and Manning, Christopher D.
and Ng, Andrew Y.",
booktitle = "Proceedings of the 2012 Joint Conference on Empirical Methods
in Natural Language Processing and Computational Natural
Language Learning",
month = jul,
year = 2012,
address = "Jeju Island, Korea",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D12-1110",
pages = "1201-1211"
}
@article{tai-2015-tree-lstm,
author = "Kai Sheng Tai and Richard Socher and Christopher D. Manning",
title = "Improved Semantic Representations From Tree-Structured Long
Short-Term Memory Networks",
journal = "CoRR",
volume = "abs/1503.00075",
year = 2015,
url = "http://arxiv.org/abs/1503.00075",
archivePrefix= "arXiv",
eprint = "1503.00075",
timestamp = "Mon, 13 Aug 2018 16:48:20 +0200",
biburl = "https://dblp.org/rec/journals/corr/TaiSM15.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{zhu-2020-crosswoz,
author = "{Zhu}, Qi and {Huang}, Kaili and {Zhang}, Zheng and {Zhu},
Xiaoyan and {Huang}, Minlie",
title = "{CrossWOZ: A Large-Scale Chinese Cross-Domain Task-Oriented
Dialogue Dataset}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2020,
month = feb,
eid = "arXiv:2002.11893",
pages = "arXiv:2002.11893",
archivePrefix= "arXiv",
eprint = "2002.11893",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2020arXiv200211893Z",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{nie-2019-k-multiple-means,
author = "Nie, Feiping and Wang, Cheng-Long and Li, Xuelong",
title = "K-Multiple-Means: A Multiple-Means Clustering Method with
Specified K Clusters",
year = 2019,
isbn = 9781450362016,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/3292500.3330846",
doi = "10.1145/3292500.3330846",
booktitle = "Proceedings of the 25th ACM SIGKDD International Conference
on Knowledge Discovery \& Data Mining",
pages = "959–967",
numpages = 9,
keywords = "graph laplacian, clustering, K-means, multiple means",
location = "Anchorage, AK, USA",
series = "KDD ’19"
}
@article{lee-2019-biobert,
author = "Jinhyuk Lee and Wonjin Yoon and Sungdong Kim and Donghyeon
Kim and Sunkyu Kim and Chan Ho So and Jaewoo Kang",
title = "BioBERT: a pre-trained biomedical language representation
model for biomedical text mining",
journal = "CoRR",
volume = "abs/1901.08746",
year = 2019,
url = "http://arxiv.org/abs/1901.08746",
archivePrefix= "arXiv",
eprint = "1901.08746",
timestamp = "Sat, 02 Feb 2019 16:56:00 +0100",
biburl = "https://dblp.org/rec/journals/corr/abs-1901-08746.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{alsentzer-2019-clinical-bert,
title = "Publicly Available Clinical {BERT} Embeddings",
author = "Alsentzer, Emily and Murphy, John and Boag, William and Weng,
Wei-Hung and Jindi, Di and Naumann, Tristan and McDermott,
Matthew",
booktitle = "Proceedings of the 2nd Clinical Natural Language Processing
Workshop",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W19-1909",
doi = "10.18653/v1/W19-1909",
pages = "72-78",
abstract = "Contextual word embedding models such as ELMo and BERT have
dramatically improved performance for many natural language
processing (NLP) tasks in recent months. However, these
models have been minimally explored on specialty corpora,
such as clinical text; moreover, in the clinical domain, no
publicly-available pre-trained BERT models yet exist. In this
work, we address this need by exploring and releasing BERT
models for clinical text: one for generic clinical text and
another for discharge summaries specifically. We demonstrate
that using a domain-specific model yields performance
improvements on 3/5 clinical NLP tasks, establishing a new
state-of-the-art on the MedNLI dataset. We find that these
domain-specific models are not as performant on 2 clinical
de-identification tasks, and argue that this is a natural
consequence of the differences between de-identified source
text and synthetically non de-identified task text."
}
@ARTICLE{shang-2019-g-bert,
author = "{Shang}, Junyuan and {Ma}, Tengfei and {Xiao}, Cao and {Sun},
Jimeng",
title = "{Pre-training of Graph Augmented Transformers for Medication
Recommendation}",
journal = "arXiv e-prints",
keywords = "Computer Science - Artificial Intelligence, Computer Science
- Computation and Language, Computer Science - Machine
Learning",
year = 2019,
month = jun,
eid = "arXiv:1906.00346",
pages = "arXiv:1906.00346",
archivePrefix= "arXiv",
eprint = "1906.00346",
primaryClass = "cs.AI",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190600346S",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{chevalier-boisvert-2019-babyai,
title = "Baby{AI}: First Steps Towards Grounded Language Learning With
a Human In the Loop",
author = "Maxime Chevalier-Boisvert and Dzmitry Bahdanau and Salem
Lahlou and Lucas Willems and Chitwan Saharia and Thien Huu
Nguyen and Yoshua Bengio",
booktitle = "International Conference on Learning Representations",
year = 2019,
url = "https://openreview.net/forum?id=rJeXCo0cYX"
}
@article{beltagy-2019-scibert,
author = "Iz Beltagy and Arman Cohan and Kyle Lo",
title = "SciBERT: Pretrained Contextualized Embeddings for Scientific
Text",
journal = "CoRR",
volume = "abs/1903.10676",
year = 2019,
url = "http://arxiv.org/abs/1903.10676",
archivePrefix= "arXiv",
eprint = "1903.10676",
timestamp = "Mon, 01 Apr 2019 14:07:37 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1903-10676.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{pires-2019-m-bert,
author = "{Pires}, Telmo and {Schlinger}, Eva and {Garrette}, Dan",
title = "{How multilingual is Multilingual BERT?}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Artificial Intelligence, Computer Science - Machine
Learning",
year = 2019,
month = jun,
eid = "arXiv:1906.01502",
pages = "arXiv:1906.01502",
archivePrefix= "arXiv",
eprint = "1906.01502",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190601502P",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{lee-2019-patent-bert,
author = "{Lee}, Jieh-Sheng and {Hsiang}, Jieh",
title = "{PatentBERT: Patent Classification with Fine-Tuning a
pre-trained BERT Model}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language, Computer Science
- Machine Learning, Statistics - Machine Learning",
year = 2019,
month = may,
eid = "arXiv:1906.02124",
pages = "arXiv:1906.02124",
archivePrefix= "arXiv",
eprint = "1906.02124",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190602124L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{adhikari-2019-docbert,
author = "Ashutosh Adhikari and Achyudh Ram and Raphael Tang and Jimmy
Lin",
title = "DocBERT: {BERT} for Document Classification",
journal = "CoRR",
volume = "abs/1904.08398",
year = 2019,
url = "http://arxiv.org/abs/1904.08398",
archivePrefix= "arXiv",
eprint = "1904.08398",
timestamp = "Fri, 26 Apr 2019 13:18:53 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1904-08398.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{coenen-2019-bert-geometry,
author = "Andy Coenen and Emily Reif and Ann Yuan and Been Kim and Adam
Pearce and Fernanda B. Vi{\'{e}}gas and Martin Wattenberg",
title = "Visualizing and Measuring the Geometry of {BERT}",
journal = "CoRR",
volume = "abs/1906.02715",
year = 2019,
url = "http://arxiv.org/abs/1906.02715",
archivePrefix= "arXiv",
eprint = "1906.02715",
timestamp = "Thu, 13 Jun 2019 13:36:00 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1906-02715.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{michel-2019-sixteen-heads,
author = "Paul Michel and Omer Levy and Graham Neubig",
title = "Are Sixteen Heads Really Better than One?",
journal = "CoRR",
volume = "abs/1905.10650",
year = 2019,
url = "http://arxiv.org/abs/1905.10650",
archivePrefix= "arXiv",
eprint = "1905.10650",
timestamp = "Mon, 03 Jun 2019 13:42:33 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1905-10650.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{zellers-2019-hellaswag,
author = "Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali
Farhadi and Yejin Choi",
title = "HellaSwag: Can a Machine Really Finish Your Sentence?",
journal = "CoRR",
volume = "abs/1905.07830",
year = 2019,
url = "http://arxiv.org/abs/1905.07830",
archivePrefix= "arXiv",
eprint = "1905.07830",
timestamp = "Tue, 28 May 2019 12:48:08 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1905-07830.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{tenney-2019-bert-rediscover,
author = "Ian Tenney and Dipanjan Das and Ellie Pavlick",
title = "{BERT} Rediscovers the Classical {NLP} Pipeline",
journal = "CoRR",
volume = "abs/1905.05950",
year = 2019,
url = "http://arxiv.org/abs/1905.05950",
archivePrefix= "arXiv",
eprint = "1905.05950",
timestamp = "Tue, 28 May 2019 12:48:08 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1905-05950.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{clark-2019-bert-attention,
author = "{Clark}, Kevin and {Khandelwal}, Urvashi and {Levy}, Omer and
{Manning}, Christopher D.",
title = "{What Does BERT Look At? An Analysis of BERT's Attention}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2019,
month = jun,
eid = "arXiv:1906.04341",
pages = "arXiv:1906.04341",
archivePrefix= "arXiv",
eprint = "1906.04341",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2019arXiv190604341C",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{loshchilov-2016-sgdr,
author = "{Loshchilov}, Ilya and {Hutter}, Frank",
title = "{SGDR: Stochastic Gradient Descent with Warm Restarts}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Neural and Evolutionary Computing, Mathematics - Optimization
and Control",
year = 2016,
month = aug,
eid = "arXiv:1608.03983",
pages = "arXiv:1608.03983",
archivePrefix= "arXiv",
eprint = "1608.03983",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160803983L",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{yin-2014-gsdmm,
author = "Yin, Jianhua and Wang, Jianyong",
title = "A Dirichlet Multinomial Mixture Model-Based Approach for
Short Text Clustering",
year = 2014,
isbn = 9781450329569,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2623330.2623715",
doi = "10.1145/2623330.2623715",
booktitle = "Proceedings of the 20th ACM SIGKDD International Conference
on Knowledge Discovery and Data Mining",
pages = "233-242",
numpages = 10,
keywords = "gibbs sampling, short text clustering, dirichlet multinomial
mixture",
location = "New York, New York, USA",
series = "KDD ’14"
}
@inproceedings{yin-2016-fgsdmm-plus,
author = "Yin, Jianhua and Wang, Jianyong",
title = "A Text Clustering Algorithm Using an Online Clustering Scheme
for Initialization",
year = 2016,
isbn = 9781450342322,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2939672.2939841",
doi = "10.1145/2939672.2939841",
booktitle = "Proceedings of the 22nd ACM SIGKDD International Conference
on Knowledge Discovery and Data Mining",
pages = "1995-2004",
numpages = 10,
keywords = "dirichlet multinomial mixture, gibbs sampling, text
clustering",
location = "San Francisco, California, USA",
series = "KDD ’16"
}
@article{nigam-2000-text-em,
title = "Text classification from labeled and unlabeled documents
using EM",
author = "Nigam, Kamal and McCallum, Andrew Kachites and Thrun,
Sebastian and Mitchell, Tom",
journal = "Machine learning",
volume = 39,
number = "2-3",
pages = "103-134",
year = 2000,
publisher = "Springer"
}
@article{holmes-2012-dmm,
title = "Dirichlet multinomial mixtures: generative models for
microbial metagenomics",
author = "Holmes, Ian and Harris, Keith and Quince, Christopher",
journal = "PloS one",
volume = 7,
number = 2,
year = 2012,
publisher = "Public Library of Science"
}
@inproceedings{li-2016-gpu-dmm,
title = "Topic modeling for short texts with auxiliary word
embeddings",
author = "Li, Chenliang and Wang, Haoran and Zhang, Zhiqian and Sun,
Aixin and Ma, Zongyang",
booktitle = "Proceedings of the 39th International ACM SIGIR conference on
Research and Development in Information Retrieval",
pages = "165-174",
year = 2016
}
@inproceedings{rangrej-2011-short-text-clustering-comparison,
title = "Comparative study of clustering techniques for short text
documents",
author = "Rangrej, Aniket and Kulkarni, Sayali and Tendulkar, Ashish V",
booktitle = "Proceedings of the 20th international conference companion on
World wide web",
pages = "111-112",
year = 2011
}
@article{pan-2009-transfer-survey,
title = "A survey on transfer learning",
author = "Pan, Sinno Jialin and Yang, Qiang",
journal = "IEEE Transactions on knowledge and data engineering",
volume = 22,
number = 10,
pages = "1345-1359",
year = 2009,
publisher = "IEEE"
}
@article{li-2012-tl-nlp-survey,
title = "Literature survey: domain adaptation algorithms for natural
language processing",
author = "Li, Qi",
journal = "Department of Computer Science The Graduate Center, The City
University of New York",
pages = "8-10",
year = 2012
}
@article{mao-2019-medgcn,
author = "Chengsheng Mao and Liang Yao and Yuan Luo",
title = "MedGCN: Graph Convolutional Networks for Multiple Medical
Tasks",
journal = "CoRR",
volume = "abs/1904.00326",
year = 2019,
url = "http://arxiv.org/abs/1904.00326",
archivePrefix= "arXiv",
eprint = "1904.00326",
timestamp = "Fri, 28 Jun 2019 09:35:46 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1904-00326.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{edwards-2016-neural-statistician,
author = "{Edwards}, Harrison and {Storkey}, Amos",
title = "{Towards a Neural Statistician}",
journal = "arXiv e-prints",
keywords = "Statistics - Machine Learning, Computer Science - Machine
Learning",
year = 2016,
month = jun,
eid = "arXiv:1606.02185",
pages = "arXiv:1606.02185",
archivePrefix= "arXiv",
eprint = "1606.02185",
primaryClass = "stat.ML",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160602185E",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{finn-2017-maml,
author = "{Finn}, Chelsea and {Abbeel}, Pieter and {Levine}, Sergey",
title = "{Model-Agnostic Meta-Learning for Fast Adaptation of Deep
Networks}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Artificial Intelligence, Computer Science - Computer Vision
and Pattern Recognition, Computer Science - Neural and
Evolutionary Computing",
year = 2017,
month = mar,
eid = "arXiv:1703.03400",
pages = "arXiv:1703.03400",
archivePrefix= "arXiv",
eprint = "1703.03400",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2017arXiv170303400F",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{vinyals-2016-matching-networks,
author = "Oriol Vinyals and Charles Blundell and Timothy P. Lillicrap
and Koray Kavukcuoglu and Daan Wierstra",
title = "Matching Networks for One Shot Learning",
journal = "CoRR",
volume = "abs/1606.04080",
year = 2016,
url = "http://arxiv.org/abs/1606.04080",
archivePrefix= "arXiv",
eprint = "1606.04080",
timestamp = "Mon, 13 Aug 2018 16:46:48 +0200",
biburl = "https://dblp.org/rec/journals/corr/VinyalsBLKW16.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{bhojanapalli-2020-low-rank-bottleneck,
author = "{Bhojanapalli}, Srinadh and {Yun}, Chulhee and {Singh Rawat},
Ankit and {Reddi}, Sashank J. and {Kumar}, Sanjiv",
title = "{Low-Rank Bottleneck in Multi-head Attention Models}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Statistics - Machine
Learning",
year = 2020,
month = feb,
eid = "arXiv:2002.07028",
pages = "arXiv:2002.07028",
archivePrefix= "arXiv",
eprint = "2002.07028",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2020arXiv200207028B",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@ARTICLE{shazeer-2020-talking-head,
author = "{Shazeer}, Noam and {Lan}, Zhenzhong and {Cheng}, Youlong and
{Ding}, Nan and {Hou}, Le",
title = "{Talking-Heads Attention}",
journal = "arXiv e-prints",
keywords = "Computer Science - Machine Learning, Computer Science -
Neural and Evolutionary Computing, Computer Science - Sound,
Electrical Engineering and Systems Science - Audio and Speech
Processing, Statistics - Machine Learning",
year = 2020,
month = mar,
eid = "arXiv:2003.02436",
pages = "arXiv:2003.02436",
archivePrefix= "arXiv",
eprint = "2003.02436",
primaryClass = "cs.LG",
adsurl = "https://ui.adsabs.harvard.edu/abs/2020arXiv200302436S",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@article{banerjee-2005-cluster-bregman,
title = "Clustering with Bregman divergences",
author = "Banerjee, Arindam and Merugu, Srujana and Dhillon, Inderjit S
and Ghosh, Joydeep",
journal = "Journal of machine learning research",
volume = 6,
number = "Oct",
pages = "1705-1749",
year = 2005
}
@article{he-2017-dureader,
author = "Wei He and Kai Liu and Yajuan Lyu and Shiqi Zhao and Xinyan
Xiao and Yuan Liu and Yizhong Wang and Hua Wu and Qiaoqiao
She and Xuan Liu and Tian Wu and Haifeng Wang",
title = "DuReader: a Chinese Machine Reading Comprehension Dataset
from Real-world Applications",
journal = "CoRR",
volume = "abs/1711.05073",
year = 2017,
url = "http://arxiv.org/abs/1711.05073",
archivePrefix= "arXiv",
eprint = "1711.05073",
timestamp = "Thu, 17 Oct 2019 16:06:13 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1711-05073.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@ARTICLE{rajpurkar-2018-squad-2,
author = "{Rajpurkar}, Pranav and {Jia}, Robin and {Liang}, Percy",
title = "{Know What You Don't Know: Unanswerable Questions for SQuAD}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2018,
month = jun,
eid = "arXiv:1806.03822",
pages = "arXiv:1806.03822",
archivePrefix= "arXiv",
eprint = "1806.03822",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2018arXiv180603822R",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{wang-2017-gated-self,
title = "Gated Self-Matching Networks for Reading Comprehension and
Question Answering",
author = "Wang, Wenhui and Yang, Nan and Wei, Furu and Chang, Baobao
and Zhou, Ming",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2017,
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P17-1018",
doi = "10.18653/v1/P17-1018",
pages = "189-198",
abstract = "In this paper, we present the gated self-matching networks
for reading comprehension style question answering, which
aims to answer questions from a given passage. We first match
the question and passage with gated attention-based recurrent
networks to obtain the question-aware passage
representation. Then we propose a self-matching attention
mechanism to refine the representation by matching the
passage against itself, which effectively encodes information
from the whole passage. We finally employ the pointer
networks to locate the positions of answers from the
passages. We conduct extensive experiments on the SQuAD
dataset. The single model achieves 71.3{\%} on the evaluation
metrics of exact match on the hidden test set, while the
ensemble model further boosts the results to 75.9{\%}. At the
time of submission of the paper, our model holds the first
place on the SQuAD leaderboard for both single and ensemble
model."
}
@article{santos-2016-attentive-pooling,
author = "Santos, Cicero dos and Tan, Ming and Xiang, Bing and Zhou,
Bowen",
title = "Attentive Pooling Networks",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1602.03609v1",
abstract = "In this work, we propose Attentive Pooling (AP), a two-way
attention mechanism for discriminative model training. In the
context of pair-wise ranking or classification with neural
networks, AP enables the pooling layer to be aware of the
current input pair, in a way that information from the two
input items can directly influence the computation of each
other's representations. Along with such representations of
the paired inputs, AP jointly learns a similarity measure
over projected segments (e.g. trigrams) of the pair, and
subsequently, derives the corresponding attention vector for
each input to guide the pooling. Our two-way attention
mechanism is a general framework independent of the
underlying representation learning, and it has been applied
to both convolutional neural networks (CNNs) and recurrent
neural networks (RNNs) in our studies. The empirical results,
from three very different benchmark tasks of question
answering/answer selection, demonstrate that our proposed
models outperform a variety of strong baselines and achieve
state-of-the-art performance in all the benchmarks.",
archivePrefix= "arXiv",
eprint = "1602.03609",
primaryClass = "cs.CL"
}
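Note on santos-2016-attentive-pooling: the two-way attention described in the abstract computes a soft alignment between projected segments of the two inputs and max-pools it along each axis to get each input's attention weights. A minimal NumPy sketch of that idea; the shapes, names, and bilinear form here are illustrative assumptions, not the authors' code.

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def attentive_pooling(Q, A, U):
    # Q: (m, d) projected segments of input 1; A: (n, d) of input 2;
    # U: (d, d) learned bilinear parameter.
    G = np.tanh(Q.dot(U).dot(A.T))    # (m, n) soft alignment scores
    att_q = softmax(G.max(axis=1))    # attention over input 1's segments
    att_a = softmax(G.max(axis=0))    # attention over input 2's segments
    return Q.T.dot(att_q), A.T.dot(att_a)   # pooled (d,) representations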
@inproceedings{girdhar-2017-attentional-pooling,
title = "Attentional pooling for action recognition",
author = "Girdhar, Rohit and Ramanan, Deva",
booktitle = "Advances in Neural Information Processing Systems",
pages = "34-45",
year = 2017
}
@inproceedings{iyyer-2015-word-dropout,
title = "Deep Unordered Composition Rivals Syntactic Methods for Text
Classification",
author = "Iyyer, Mohit and Manjunatha, Varun and Boyd-Graber, Jordan
and Daum{\'e} III, Hal",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for
Computational Linguistics and the 7th International Joint
Conference on Natural Language Processing (Volume 1: Long
Papers)",
month = jul,
year = 2015,
address = "Beijing, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P15-1162",
doi = "10.3115/v1/P15-1162",
pages = "1681-1691"
}
@article{gal-2015-rnn-dropout,
author = "Gal, Yarin and Ghahramani, Zoubin",
title = "A Theoretically Grounded Application of Dropout in Recurrent
Neural Networks",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1512.05287v5",
abstract = "Recurrent neural networks (RNNs) stand at the forefront of
many recent developments in deep learning. Yet a major
difficulty with these models is their tendency to overfit,
with dropout shown to fail when applied to recurrent
layers. Recent results at the intersection of Bayesian
modelling and deep learning offer a Bayesian interpretation
of common deep learning techniques such as dropout. This
grounding of dropout in approximate Bayesian inference
suggests an extension of the theoretical results, offering
insights into the use of dropout with RNN models. We apply
this new variational inference based dropout technique in
LSTM and GRU models, assessing it on language modelling and
sentiment analysis tasks. The new approach outperforms
existing techniques, and to the best of our knowledge
improves on the single model state-of-the-art in language
modelling with the Penn Treebank (73.4 test perplexity). This
extends our arsenal of variational tools in deep learning.",
archivePrefix= "arXiv",
eprint = "1512.05287",
primaryClass = "stat.ML"
}
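Note on gal-2015-rnn-dropout: in practice the variational view amounts to sampling one dropout mask per sequence and reusing it at every timestep, instead of resampling per step. A minimal sketch under that reading; the function name and shapes are illustrative, not from the paper's code.

import numpy as np

def locked_dropout(x, p, rng):
    # x: (seq_len, batch, features). One Bernoulli mask per
    # (batch, feature), broadcast over the time axis, so every
    # timestep of a sequence sees the same dropped units.
    mask = rng.binomial(1, 1.0 - p, size=(1,) + x.shape[1:])
    return x * mask / (1.0 - p)

rng = np.random.default_rng(0)
h = rng.standard_normal((10, 4, 8))        # (time, batch, hidden)
h_dropped = locked_dropout(h, 0.3, rng)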
@article{krueger-2016-zoneout,
author = "Krueger, David and Maharaj, Tegan and Kram{\'a}r, J{\'a}nos
and Pezeshki, Mohammad and Ballas, Nicolas and Ke, Nan
Rosemary and Goyal, Anirudh and Bengio, Yoshua and Courville,
Aaron and Pal, Chris",
title = "Zoneout: Regularizing Rnns By Randomly Preserving Hidden
Activations",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1606.01305v4",
abstract = "We propose zoneout, a novel method for regularizing RNNs. At
each timestep, zoneout stochastically forces some hidden
units to maintain their previous values. Like dropout,
zoneout uses random noise to train a pseudo-ensemble,
improving generalization. But by preserving instead of
dropping hidden units, gradient information and state
information are more readily propagated through time, as in
feedforward stochastic depth networks. We perform an
empirical investigation of various RNN regularizers, and find
that zoneout gives significant performance improvements
across tasks. We achieve competitive results with relatively
simple models in character- and word-level language modelling
on the Penn Treebank and Text8 datasets, and combining with
recurrent batch normalization yields state-of-the-art results
on permuted sequential MNIST.",
archivePrefix= "arXiv",
eprint = "1606.01305",
primaryClass = "cs.NE"
}
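Note on krueger-2016-zoneout: the per-timestep update described above is easy to state directly. An illustrative sketch, not the authors' code; at test time the expected update is used instead of a sampled mask.

import numpy as np

def zoneout_step(h_prev, h_new, z, rng, training=True):
    # With probability z, a unit keeps its previous value instead of
    # taking the freshly computed one.
    if training:
        keep_prev = rng.binomial(1, z, size=h_new.shape)
        return keep_prev * h_prev + (1 - keep_prev) * h_new
    # Test time: expected update rather than sampling.
    return z * h_prev + (1 - z) * h_new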
@article{merity-2017-drop-connect,
author = "Merity, Stephen and Keskar, Nitish Shirish and Socher,
Richard",
title = "Regularizing and Optimizing Lstm Language Models",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1708.02182v1",
abstract = "Recurrent neural networks (RNNs), such as long short-term
memory networks (LSTMs), serve as a fundamental building
block for many sequence learning tasks, including machine
translation, language modeling, and question answering. In
this paper, we consider the specific problem of word-level
language modeling and investigate strategies for regularizing
and optimizing LSTM-based models. We propose the
weight-dropped LSTM which uses DropConnect on
hidden-to-hidden weights as a form of recurrent
regularization. Further, we introduce NT-ASGD, a variant of
the averaged stochastic gradient method, wherein the
averaging trigger is determined using a non-monotonic
condition as opposed to being tuned by the user. Using these
and other regularization strategies, we achieve
state-of-the-art word level perplexities on two data sets:
57.3 on Penn Treebank and 65.8 on WikiText-2. In exploring
the effectiveness of a neural cache in conjunction with our
proposed model, we achieve an even lower state-of-the-art
perplexity of 52.8 on Penn Treebank and 52.0 on WikiText-2.",
archivePrefix= "arXiv",
eprint = "1708.02182",
primaryClass = "cs.CL"
}
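Note on merity-2017-drop-connect: the weight-dropped LSTM applies DropConnect to the hidden-to-hidden matrices rather than dropout to activations. A hedged sketch of that single idea (the full AWD-LSTM also includes NT-ASGD and several other regularizers):

import numpy as np

def weight_drop(w_hh, p, rng, training=True):
    # DropConnect on the recurrent (hidden-to-hidden) matrix: individual
    # weights are zeroed once per forward pass, so the same masked matrix
    # is reused across all timesteps and fused LSTM kernels still apply.
    if not training:
        return w_hh
    mask = rng.binomial(1, 1.0 - p, size=w_hh.shape)
    return w_hh * mask / (1.0 - p)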
@article{melis-2017-state-art,
author = "Melis, G{\'a}bor and Dyer, Chris and Blunsom, Phil",
title = "On the State of the Art of Evaluation in Neural Language
Models",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1707.05589v2",
abstract = "Ongoing innovations in recurrent neural network architectures
have provided a steady influx of apparently state-of-the-art
results on language modelling benchmarks. However, these have
been evaluated using differing code bases and limited
computational resources, which represent uncontrolled sources
of experimental variation. We reevaluate several popular
architectures and regularisation methods with large-scale
automatic black-box hyperparameter tuning and arrive at the
somewhat surprising conclusion that standard LSTM
architectures, when properly regularised, outperform more
recent models. We establish a new state of the art on the
Penn Treebank and Wikitext-2 corpora, as well as strong
baselines on the Hutter Prize dataset.",
archivePrefix= "arXiv",
eprint = "1707.05589",
primaryClass = "cs.CL"
}
@article{merity-2017-activation-regularization,
author = "Merity, Stephen and McCann, Bryan and Socher, Richard",
title = "Revisiting Activation Regularization for Language Rnns",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1708.01009v1",
abstract = "Recurrent neural networks (RNNs) serve as a fundamental
building block for many sequence tasks across natural
language processing. Recent research has focused on recurrent
dropout techniques or custom RNN cells in order to improve
performance. Both of these can require substantial
modifications to the machine learning model or to the
underlying RNN configurations. We revisit traditional
regularization techniques, specifically L2 regularization on
RNN activations and slowness regularization over successive
hidden states, to improve the performance of RNNs on the task
of language modeling. Both of these techniques require
minimal modification to existing RNN architectures and result
in performance improvements comparable or superior to more
complicated regularization techniques or custom cell
architectures. These regularization techniques can be used
without any modification on optimized LSTM implementations
such as the NVIDIA cuDNN LSTM.",
archivePrefix= "arXiv",
eprint = "1708.01009",
primaryClass = "cs.CL"
}
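Note on merity-2017-activation-regularization: the two penalties named in the abstract are one-liners added to the language-modeling loss. The coefficients below are illustrative, not the paper's tuned values.

import numpy as np

def ar_tar_penalty(h, alpha=2.0, beta=1.0):
    # h: (seq_len, batch, hidden) RNN outputs.
    ar = alpha * np.mean(h ** 2)                   # L2 on activations (AR)
    tar = beta * np.mean((h[1:] - h[:-1]) ** 2)    # slowness across steps (TAR)
    return ar + tar   # added to the usual language-modeling loss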
@article{ma-2016-expectation-linear-dropout,
author = "Ma, Xuezhe and Gao, Yingkai and Hu, Zhiting and Yu, Yaoliang
and Deng, Yuntian and Hovy, Eduard",
title = "Dropout With Expectation-Linear Regularization",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1609.08017v3",
abstract = "Dropout, a simple and effective way to train deep neural
networks, has led to a number of impressive empirical
successes and spawned many recent theoretical
investigations. However, the gap between dropout's training
and inference phases, introduced due to tractability
considerations, has largely remained under-appreciated. In
this work, we first formulate dropout as a tractable
approximation of some latent variable model, leading to a
clean view of parameter sharing and enabling further
theoretical analysis. Then, we introduce (approximate)
expectation-linear dropout neural networks, whose inference
gap we are able to formally characterize. Algorithmically, we
show that our proposed measure of the inference gap can be
used to regularize the standard dropout training objective,
resulting in an \emph{explicit} control of the gap. Our
method is as simple and efficient as standard dropout. We
further prove the upper bounds on the loss in accuracy due to
expectation-linearization, describe classes of input
distributions that expectation-linearize easily. Experiments
on three image classification benchmark datasets demonstrate
that reducing the inference gap can indeed improve the
performance consistently.",
archivePrefix= "arXiv",
eprint = "1609.08017",
primaryClass = "cs.LG"
}
@inproceedings{clare-2001-ml-dt,
author = "Clare, Amanda and King, Ross D.",
title = "Knowledge Discovery in Multi-Label Phenotype Data",
year = 2001,
isbn = 3540425349,
publisher = "Springer-Verlag",
address = "Berlin, Heidelberg",
booktitle = "Proceedings of the 5th European Conference on Principles of
Data Mining and Knowledge Discovery",
pages = "42-53",
numpages = 12,
series = "PKDD ’01"
}
@inproceedings{elisseeff-2001-rank-svm,
author = "Elisseeff, Andr\'{e} and Weston, Jason",
title = "A Kernel Method for Multi-Labelled Classification",
year = 2001,
publisher = "MIT Press",
address = "Cambridge, MA, USA",
booktitle = "Proceedings of the 14th International Conference on Neural
Information Processing Systems: Natural and Synthetic",
pages = "681-687",
numpages = 7,
location = "Vancouver, British Columbia, Canada",
series = "NIPS’01"
}
@article{zhang-2007-ml-knn,
author = "Zhang, Min-Ling and Zhou, Zhi-Hua",
title = "ML-KNN: A Lazy Learning Approach to Multi-Label Learning",
year = 2007,
issue_date = "July 2007",
publisher = "Elsevier Science Inc.",
address = "USA",
volume = 40,
number = 7,
issn = "0031-3203",
url = "https://doi.org/10.1016/j.patcog.2006.12.019",
doi = "10.1016/j.patcog.2006.12.019",
journal = "Pattern Recogn.",
month = jul,
pages = "2038-2048",
numpages = 11,
keywords = "Lazy learning, maximum a posteriori, Text categorization,
KNN, PMM, K-nearest neighbor, Multi-label learning, Natural
scene classification, Machine learning, Functional genomics,
ML-KNN, parametric mixture model, MAP, multi-label K-nearest
neighbor"
}
@inproceedings{papineni-2002-bleu,
author = "Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu,
Wei-Jing",
title = "{B}leu: a Method for Automatic Evaluation of Machine
Translation",
booktitle = "Proceedings of the 40th Annual Meeting of the Association for
Computational Linguistics",
year = 2002,
pages = "311-318",
doi = "10.3115/1073083.1073135",
url = "https://doi.org/10.3115/1073083.1073135",
address = "Philadelphia, Pennsylvania, USA",
month = jul,
publisher = "Association for Computational Linguistics"
}
@article{vijayakumar-2016-diverse-beam-search,
author = "Vijayakumar, Ashwin K and Cogswell, Michael and Selvaraju,
Ramprasath R. and Sun, Qing and Lee, Stefan and Crandall,
David and Batra, Dhruv",
title = "Diverse Beam Search: Decoding Diverse Solutions From Neural
Sequence Models",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1610.02424v2",
abstract = "Neural sequence models are widely used to model time-series
data. Equally ubiquitous is the usage of beam search (BS) as
an approximate inference algorithm to decode output sequences
from these models. BS explores the search space in a greedy
left-right fashion retaining only the top-B candidates -
resulting in sequences that differ only slightly from each
other. Producing lists of nearly identical sequences is not
only computationally wasteful but also typically fails to
capture the inherent ambiguity of complex AI tasks. To
overcome this problem, we propose Diverse Beam Search (DBS),
an alternative to BS that decodes a list of diverse outputs
by optimizing for a diversity-augmented objective. We observe
that our method finds better top-1 solutions by controlling
for the exploration and exploitation of the search space -
implying that DBS is a better search algorithm. Moreover,
these gains are achieved with minimal computational or memory
overhead as compared to beam search. To demonstrate the
broad applicability of our method, we present results on
image captioning, machine translation and visual question
generation using both standard quantitative metrics and
qualitative human studies. Further, we study the role of
diversity for image-grounded language generation tasks as the
complexity of the image changes. We observe that our method
consistently outperforms BS and previously proposed
techniques for diverse decoding from neural sequence models.",
archivePrefix= "arXiv",
eprint = "1610.02424",
primaryClass = "cs.AI"
}
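Note on vijayakumar-2016-diverse-beam-search: the diversity-augmented objective can be illustrated with the Hamming-diversity variant, which penalizes tokens that earlier beam groups already chose at the current step. A sketch of just that scoring adjustment; the paper evaluates several diversity functions, and the names here are assumptions.

import numpy as np

def diversity_adjusted_scores(logprobs, tokens_used_by_prior_groups, lam=0.5):
    # Hamming diversity: subtract lam for each time a token was already
    # emitted at this decoding step by an earlier beam group.
    penalty = np.zeros_like(logprobs)
    for tok in tokens_used_by_prior_groups:
        penalty[tok] += 1.0
    return logprobs - lam * penalty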
@article{huszar-2015-schedule-sampling-problem,
journal = "CoRR",
title = "How (not) to Train your Generative Model: Scheduled Sampling,
Likelihood, Adversary?",
author = "Husz{\'a}r, Ferenc",
archivePrefix= "arXiv",
year = 2015,
eprint = "1511.05101",
primaryClass = "stat.ML",
abstract = "Modern applications and progress in deep learning research
have created renewed interest for generative models of text
and of images. However, even today it is unclear what
objective functions one should use to train and evaluate
these models. In this paper we present two contributions.
Firstly, we present a critique of scheduled sampling, a
state-of-the-art training method that contributed to the
winning entry to the MSCOCO image captioning benchmark in
2015. Here we show that despite this impressive empirical
performance, the objective function underlying scheduled
sampling is improper and leads to an inconsistent learning
algorithm. Secondly, we revisit the problems that scheduled
sampling was meant to address, and present an alternative
interpretation. We argue that maximum likelihood is an
inappropriate training objective when the end-goal is to
generate natural-looking samples. We go on to derive an ideal
objective function to use in this situation instead. We
introduce a generalisation of adversarial training, and show
how such method can interpolate between maximum likelihood
training and our ideal training objective. To our knowledge
this is the first theoretical analysis that explains why
adversarial training tends to produce samples with higher
perceived quality.",
url = "http://arxiv.org/abs/1511.05101v1"
}
@article{lamb-2016-professor-forcing,
author = "Lamb, Alex and Goyal, Anirudh and Zhang, Ying and Zhang,
Saizheng and Courville, Aaron and Bengio, Yoshua",
title = "Professor Forcing: a New Algorithm for Training Recurrent
Networks",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1610.09038v1",
abstract = "The Teacher Forcing algorithm trains recurrent networks by
supplying observed sequence values as inputs during training
and using the network's own one-step-ahead predictions to do
multi-step sampling. We introduce the Professor Forcing
algorithm, which uses adversarial domain adaptation to
encourage the dynamics of the recurrent network to be the
same when training the network and when sampling from the
network over multiple time steps. We apply Professor Forcing
to language modeling, vocal synthesis on raw waveforms,
handwriting generation, and image generation. Empirically we
find that Professor Forcing acts as a regularizer, improving
test likelihood on character level Penn Treebank and
sequential MNIST. We also find that the model qualitatively
improves samples, especially when sampling for a large number
of time steps. This is supported by human evaluation of
sample quality. Trade-offs between Professor Forcing and
Scheduled Sampling are discussed. We produce T-SNEs showing
that Professor Forcing successfully makes the dynamics of the
network during training and sampling more similar.",
archivePrefix= "arXiv",
eprint = "1610.09038",
primaryClass = "stat.ML"
}
@inproceedings{zhang-2019-train-infer-gap,
title = "Bridging the Gap between Training and Inference for Neural
Machine Translation",
author = "Zhang, Wen and Feng, Yang and Meng, Fandong and You, Di and
Liu, Qun",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1426",
doi = "10.18653/v1/P19-1426",
pages = "4334-4343",
abstract = "Neural Machine Translation (NMT) generates target words
sequentially in the way of predicting the next word
conditioned on the context words. At training time, it
predicts with the ground truth words as context while at
inference it has to generate the entire sequence from
scratch. This discrepancy of the fed context leads to error
accumulation among the way. Furthermore, word-level training
requires strict matching between the generated sequence and
the ground truth sequence which leads to overcorrection over
different but reasonable translations. In this paper, we
address these issues by sampling context words not only from
the ground truth sequence but also from the predicted
sequence by the model during training, where the predicted
sequence is selected with a sentence-level
optimum. Experiment results on Chinese-{\textgreater}English
and WMT{'}14 English-{\textgreater}German translation tasks
demonstrate that our approach can achieve significant
improvements on multiple datasets."
}
@article{hinton-2015-soft-target,
author = "Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff",
title = "Distilling the Knowledge in a Neural Network",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1503.02531v1",
abstract = "A very simple way to improve the performance of almost any
machine learning algorithm is to train many different models
on the same data and then to average their
predictions. Unfortunately, making predictions using a whole
ensemble of models is cumbersome and may be too
computationally expensive to allow deployment to a large
number of users, especially if the individual models are
large neural nets. Caruana and his collaborators have shown
that it is possible to compress the knowledge in an ensemble
into a single model which is much easier to deploy and we
develop this approach further using a different compression
technique. We achieve some surprising results on MNIST and we
show that we can significantly improve the acoustic model of
a heavily used commercial system by distilling the knowledge
in an ensemble of models into a single model. We also
introduce a new type of ensemble composed of one or more full
models and many specialist models which learn to distinguish
fine-grained classes that the full models confuse. Unlike a
mixture of experts, these specialist models can be trained
rapidly and in parallel.",
archivePrefix= "arXiv",
eprint = "1503.02531",
primaryClass = "stat.ML"
}
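Note on hinton-2015-soft-target: the "dark knowledge" is carried by a temperature-softened softmax of the teacher's logits, which the student is trained to match. A minimal sketch; T=3.0 is an illustrative temperature, and in practice this term is mixed with the ordinary hard-label loss.

import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def distillation_loss(student_logits, teacher_logits, T=3.0):
    # Cross-entropy against the teacher's temperature-softened
    # distribution; the T**2 factor keeps gradient magnitudes comparable
    # across temperatures.
    p_teacher = softmax(teacher_logits / T)
    log_p_student = np.log(softmax(student_logits / T))
    return -(T ** 2) * np.mean(np.sum(p_teacher * log_p_student, axis=-1))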
@article{tang-2015-soft-target,
author = "Tang, Zhiyuan and Wang, Dong and Zhang, Zhiyong",
title = "Recurrent Neural Network Training With Dark Knowledge
Transfer",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1505.04630v5",
abstract = "Recurrent neural networks (RNNs), particularly long
short-term memory (LSTM), have gained much attention in
automatic speech recognition (ASR). Although some successful
stories have been reported, training RNNs remains highly
challenging, especially with limited training data. Recent
research found that a well-trained model can be used as a
teacher to train other child models, by using the predictions
generated by the teacher model as supervision. This knowledge
transfer learning has been employed to train simple neural
nets with a complex one, so that the final performance can
reach a level that is infeasible to obtain by regular
training. In this paper, we employ the knowledge transfer
learning approach to train RNNs (precisely LSTM) using a deep
neural network (DNN) model as the teacher. This is different
from most of the existing research on knowledge transfer
learning, since the teacher (DNN) is assumed to be weaker
than the child (RNN); however, our experiments on an ASR task
showed that it works fairly well: without applying any tricks
on the learning scheme, this approach can train RNNs
successfully even with limited training data.",
archivePrefix= "arXiv",
eprint = "1505.04630",
primaryClass = "stat.ML"
}
@article{you-2017-lars,
author = "You, Yang and Gitman, Igor and Ginsburg, Boris",
title = "Large Batch Training of Convolutional Networks",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1708.03888v3",
abstract = "A common way to speed up training of large convolutional
networks is to add computational units. Training is then
performed using data-parallel synchronous Stochastic Gradient
Descent (SGD) with mini-batch divided between computational
units. With an increase in the number of nodes, the batch
size grows. But training with large batch size often results
in the lower model accuracy. We argue that the current recipe
for large batch training (linear learning rate scaling with
warm-up) is not general enough and training may diverge. To
overcome this optimization difficulties we propose a new
training algorithm based on Layer-wise Adaptive Rate Scaling
(LARS). Using LARS, we scaled Alexnet up to a batch size of
8K, and Resnet-50 to a batch size of 32K without loss in
accuracy.",
archivePrefix= "arXiv",
eprint = "1708.03888",
primaryClass = "cs.CV"
}
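Note on you-2017-lars: the heart of LARS is a per-layer trust ratio that rescales the step by the layer's weight-to-gradient norm ratio. A sketch of that computation only; eta and weight_decay values are illustrative, and the full optimizer also handles momentum and warm-up.

import numpy as np

def lars_trust_ratio(w, grad, eta=0.001, weight_decay=1e-4):
    # Layers whose gradients are large relative to their weights take
    # proportionally smaller steps.
    w_norm = np.linalg.norm(w)
    g_norm = np.linalg.norm(grad) + weight_decay * w_norm
    return eta * w_norm / (g_norm + 1e-12)

# One SGD step for a single layer using the local rate (sketch):
# w -= global_lr * lars_trust_ratio(w, g) * (g + weight_decay * w)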
@article{le-2015-identity-rnn,
author = "Le, Quoc V. and Jaitly, Navdeep and Hinton, Geoffrey E.",
title = "A Simple Way To Initialize Recurrent Networks of Rectified
Linear Units",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1504.00941v2",
abstract = "Learning long term dependencies in recurrent networks is
difficult due to vanishing and exploding gradients. To
overcome this difficulty, researchers have developed
sophisticated optimization techniques and network
architectures. In this paper, we propose a simpler solution
that use recurrent neural networks composed of rectified
linear units. Key to our solution is the use of the identity
matrix or its scaled version to initialize the recurrent
weight matrix. We find that our solution is comparable to
LSTM on our four benchmarks: two toy problems involving
long-range temporal structures, a large language modeling
problem and a benchmark speech recognition problem.",
archivePrefix= "arXiv",
eprint = "1504.00941",
primaryClass = "cs.NE"
}
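Note on le-2015-identity-rnn: the proposal is purely an initialization. A sketch:

import numpy as np

def irnn_init(hidden_size, scale=1.0):
    # Recurrent weights start as the (scaled) identity and biases at
    # zero, so a ReLU RNN initially just copies its hidden state forward.
    w_hh = scale * np.eye(hidden_size)
    b_h = np.zeros(hidden_size)
    return w_hh, b_h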
@inproceedings{bi-2013-efficient-multi-label,
author = "Bi, Wei and Kwok, James T.",
title = "Efficient Multi-Label Classification with Many Labels",
year = 2013,
publisher = "JMLR.org",
booktitle = "Proceedings of the 30th International Conference on
International Conference on Machine Learning - Volume 28",
pages = "III–405–III–413",
numpages = 9,
location = "Atlanta, GA, USA",
series = "ICML’13"
}
@article{raffel-2019-t5,
author = "Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee,
Katherine and Narang, Sharan and Matena, Michael and Zhou,
Yanqi and Li, Wei and Liu, Peter J.",
title = "Exploring the Limits of Transfer Learning With a Unified
Text-To-Text Transformer",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1910.10683v2",
abstract = "Transfer learning, where a model is first pre-trained on a
data-rich task before being fine-tuned on a downstream task,
has emerged as a powerful technique in natural language
processing (NLP). The effectiveness of transfer learning has
given rise to a diversity of approaches, methodology, and
practice. In this paper, we explore the landscape of transfer
learning techniques for NLP by introducing a unified
framework that converts every language problem into a
text-to-text format. Our systematic study compares
pre-training objectives, architectures, unlabeled datasets,
transfer approaches, and other factors on dozens of language
understanding tasks. By combining the insights from our
exploration with scale and our new ``Colossal Clean Crawled
Corpus'', we achieve state-of-the-art results on many
benchmarks covering summarization, question answering, text
classification, and more. To facilitate future work on
transfer learning for NLP, we release our dataset,
pre-trained models, and code.",
archivePrefix= "arXiv",
eprint = "1910.10683",
primaryClass = "cs.LG"
}
@inproceedings{kolitsas-2018-end-to-end-el,
author = "Kolitsas, Nikolaos and Ganea, Octavian-Eugen and Hofmann,
Thomas",
title = "End-to-End Neural Entity Linking",
booktitle = "Proceedings of the 22nd Conference on Computational Natural
Language Learning",
year = 2018,
pages = "519-529",
doi = "10.18653/v1/K18-1050",
url = "https://doi.org/10.18653/v1/K18-1050",
abstract = "Entity Linking (EL) is an essential task for semantic text
understanding and information extraction. Popular methods
separately address the Mention Detection (MD) and Entity
Disambiguation (ED) stages of EL, without leveraging their
mutual dependency. We here propose the first neural
end-to-end EL system that jointly discovers and links
entities in a text document. The main idea is to consider all
possible spans as potential mentions and learn contextual
similarity scores over their entity candidates that are
useful for both MD and ED decisions. Key components are
context-aware mention embeddings, entity embeddings and a
probabilistic mention - entity map, without demanding other
engineered features. Empirically, we show that our end-to-end
method significantly outperforms popular systems on the
Gerbil platform when enough training data is
available. Conversely, if testing datasets follow different
annotation conventions compared to the training set
(e.g. queries/ tweets vs news documents), our ED model
coupled with a traditional NER system offers the best or
second best EL accuracy.",
address = "Brussels, Belgium",
month = oct,
publisher = "Association for Computational Linguistics"
}
@article{raiman-2018-deeptype,
author = "Raiman, Jonathan and Raiman, Olivier",
title = "Deeptype: Multilingual Entity Linking By Neural Type System
Evolution",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1802.01021v1",
abstract = "The wealth of structured (e.g. Wikidata) and unstructured
data about the world available today presents an incredible
opportunity for tomorrow's Artificial Intelligence. So far,
integration of these two different modalities is a difficult
process, involving many decisions concerning how best to
represent the information so that it will be captured or
useful, and hand-labeling large amounts of data. DeepType
overcomes this challenge by explicitly integrating symbolic
information into the reasoning process of a neural network
with a type system. First we construct a type system, and
second, we use it to constrain the outputs of a neural
network to respect the symbolic structure. We achieve this by
reformulating the design problem into a mixed integer
problem: create a type system and subsequently train a neural
network with it. In this reformulation discrete variables
select which parent-child relations from an ontology are
types within the type system, while continuous variables
control a classifier fit to the type system. The original
problem cannot be solved exactly, so we propose a 2-step
algorithm: 1) heuristic search or stochastic optimization
over discrete variables that define a type system informed by
an Oracle and a Learnability heuristic, 2) gradient descent
to fit classifier parameters. We apply DeepType to the
problem of Entity Linking on three standard datasets
(i.e. WikiDisamb30, CoNLL (YAGO), TAC KBP 2010) and find that
it outperforms all existing solutions by a wide margin,
including approaches that rely on a human-designed type
system or recent deep learning-based entity embeddings, while
explicitly using symbolic information lets it integrate new
entities without retraining.",
archivePrefix= "arXiv",
eprint = "1802.01021",
primaryClass = "cs.CL"
}
@inproceedings{le-2018-el-latent-relation,
title = "Improving Entity Linking by Modeling Latent Relations between
Mentions",
author = "Le, Phong and Titov, Ivan",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1148",
doi = "10.18653/v1/P18-1148",
pages = "1595-1604",
abstract = "Entity linking involves aligning textual mentions of named
entities to their corresponding entries in a knowledge
base. Entity linking systems often exploit relations between
textual mentions in a document (e.g., coreference) to decide
if the linking decisions are compatible. Unlike previous
approaches, which relied on supervised systems or heuristics
to predict these relations, we treat relations as latent
variables in our neural entity-linking model. We induce the
relations without any supervision while optimizing the
entity-linking system in an end-to-end fashion. Our
multi-relational model achieves the best reported scores on
the standard benchmark (AIDA-CoNLL) and substantially
outperforms its relation-agnostic version. Its training also
converges much faster, suggesting that the injected
structural bias helps to explain regularities in the training
data."
}
@inproceedings{ganea-2017-deep-ed,
title = "Deep Joint Entity Disambiguation with Local Neural Attention",
author = "Ganea, Octavian-Eugen and Hofmann, Thomas",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1277",
doi = "10.18653/v1/D17-1277",
pages = "2619-2629",
abstract = "We propose a novel deep learning model for joint
document-level entity disambiguation, which leverages learned
neural representations. Key components are entity embeddings,
a neural attention mechanism over local context windows, and
a differentiable joint inference stage for
disambiguation. Our approach thereby combines benefits of
deep learning with more traditional approaches such as
graphical models and probabilistic mention-entity
maps. Extensive experiments show that we are able to obtain
competitive or state-of-the-art accuracy at moderate
computational costs."
}
@article{vashishth-2020-medtype,
author = "Vashishth, Shikhar and Joshi, Rishabh and Dutt, Ritam and
Newman-Griffis, Denis and Rose, Carolyn",
title = "Medtype: Improving Medical Entity Linking With Semantic Type
Prediction",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2005.00460v1",
abstract = "Medical entity linking is the task of identifying and
standardizing concepts referred in a scientific article or
clinical record. Existing methods adopt a two-step approach
of detecting mentions and identifying a list of candidate
concepts for them. In this paper, we probe the impact of
incorporating an entity disambiguation step in existing
entity linkers. For this, we present MedType, a novel method
that leverages the surrounding context to identify the
semantic type of a mention and uses it for filtering out
candidate concepts of the wrong types. We further present two
novel large-scale, automatically-created datasets of medical
entity mentions: WIKIMED, a Wikipedia-based dataset for
cross-domain transfer learning, and PUBMEDDS, a
distantly-supervised dataset of medical entity mentions in
biomedical abstracts. Through extensive experiments across
several datasets and methods, we demonstrate that MedType
pre-trained on our proposed datasets substantially improves
medical entity linking and gives state-of-the-art
performance. We make our source code and datasets publicly
available for medical entity linking research.",
archivePrefix= "arXiv",
eprint = "2005.00460",
primaryClass = "cs.CL"
}
@article{shi-2020-sentence-level-el,
author = "Shi, Wei and Zhang, Siyuan and Zhang, Zhiwei and Cheng, Hong
and Yu, Jeffrey Xu",
title = "Joint Embedding in Named Entity Linking on Sentence Level",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2002.04936v1",
abstract = "Named entity linking is to map an ambiguous mention in
documents to an entity in a knowledge base. The named entity
linking is challenging, given the fact that there are
multiple candidate entities for a mention in a document. It
is difficult to link a mention when it appears multiple times
in a document, since there are conflicts by the contexts
around the appearances of the mention. In addition, it is
difficult since the given training dataset is small due to
the reason that it is done manually to link a mention to its
mapping entity. In the literature, there are many reported
studies among which the recent embedding methods learn
vectors of entities from the training dataset at document
level. To address these issues, we focus on how to link
entity for mentions at a sentence level, which reduces the
noises introduced by different appearances of the same
mention in a document at the expense of insufficient
information to be used. We propose a new unified embedding
method by maximizing the relationships learned from knowledge
graphs. We confirm the effectiveness of our method in our
experimental studies.",
archivePrefix= "arXiv",
eprint = "2002.04936",
primaryClass = "cs.CL"
}
@article{broscheit-2020-bert-el,
author = "Broscheit, Samuel",
title = "Investigating Entity Knowledge in Bert With Simple Neural
End-To-End Entity Linking",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2003.05473v1",
abstract = "A typical architecture for end-to-end entity linking systems
consists of three steps: mention detection, candidate
generation and entity disambiguation. In this study we
investigate the following questions: (a) Can all those steps
be learned jointly with a model for contextualized
text-representations, i.e. BERT (Devlin et al., 2019)? (b)
How much entity knowledge is already contained in pretrained
BERT? (c) Does additional entity knowledge improve BERT's
performance in downstream tasks? To this end, we propose an
extreme simplification of the entity linking setup that works
surprisingly well: simply cast it as a per token
classification over the entire entity vocabulary (over 700K
classes in our case). We show on an entity linking benchmark
that (i) this model improves the entity representations over
plain BERT, (ii) that it outperforms entity linking
architectures that optimize the tasks separately and (iii)
that it only comes second to the current state-of-the-art
that does mention detection and entity disambiguation
jointly. Additionally, we investigate the usefulness of
entity-aware token-representations in the text-understanding
benchmark GLUE, as well as the question answering benchmarks
SQUAD V2 and SWAG and also the EN-DE WMT14 machine
translation benchmark. To our surprise, we find that most of
those benchmarks do not benefit from additional entity
knowledge, except for a task with very small training data,
the RTE task in GLUE, which improves by 2 \%.",
archivePrefix= "arXiv",
eprint = "2003.05473",
primaryClass = "cs.CL"
}
@article{chen-2020-latent-entity-type,
author = "Chen, Shuang and Wang, Jinpeng and Jiang, Feng and Lin,
Chin-Yew",
title = "Improving Entity Linking By Modeling Latent Entity Type
Information",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2001.01447v1",
abstract = "Existing state of the art neural entity linking models employ
attention-based bag-of-words context model and pre-trained
entity embeddings bootstrapped from word embeddings to assess
topic level context compatibility. However, the latent entity
type information in the immediate context of the mention is
neglected, which causes the models often link mentions to
incorrect entities with incorrect type. To tackle this
problem, we propose to inject latent entity type information
into the entity embeddings based on pre-trained BERT. In
addition, we integrate a BERT-based entity similarity score
into the local context model of a state-of-the-art model to
better capture latent entity type information. Our model
significantly outperforms the state-of-the-art entity linking
models on standard benchmark (AIDA-CoNLL). Detailed
experiment analysis demonstrates that our model corrects most
of the type errors produced by the direct baseline.",
archivePrefix= "arXiv",
eprint = "2001.01447",
primaryClass = "cs.CL"
}
@article{zhu-2019-latte,
author = "Zhu, Ming and Celikkaya, Busra and Bhatia, Parminder and
Reddy, Chandan K.",
title = "Latte: Latent Type Modeling for Biomedical Entity Linking",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1911.09787v2",
abstract = "Entity linking is the task of linking mentions of named
entities in natural language text, to entities in a curated
knowledge-base. This is of significant importance in the
biomedical domain, where it could be used to semantically
annotate a large volume of clinical records and biomedical
literature, to standardized concepts described in an ontology
such as Unified Medical Language System (UMLS). We observe
that with precise type information, entity disambiguation
becomes a straightforward task. However, fine-grained type
information is usually not available in biomedical
domain. Thus, we propose LATTE, a LATent Type Entity Linking
model, that improves entity linking by modeling the latent
fine-grained type information about mentions and entities.
Unlike previous methods that perform entity linking directly
between the mentions and the entities, LATTE jointly does
entity disambiguation, and latent fine-grained type learning,
without direct supervision. We evaluate our model on two
biomedical datasets: MedMentions, a large scale public
dataset annotated with UMLS concepts, and a de-identified
corpus of dictated doctor's notes that has been annotated
with ICD concepts. Extensive experimental evaluation shows
our model achieves significant performance improvements over
several state-of-the-art techniques.",
archivePrefix= "arXiv",
eprint = "1911.09787",
primaryClass = "cs.CL"
}
@article{chen-2019-yelm,
author = "Chen, Haotian and Wadhwa, Sahil and Li, Xi David and
Zukov-Gregoric, Andrej",
title = "Yelm: End-To-End Contextualized Entity Linking",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1911.03834v1",
abstract = "We propose yet another entity linking model (YELM) which
links words to entities instead of spans. This overcomes any
difficulties associated with the selection of good candidate
mention spans and makes the joint training of mention
detection (MD) and entity disambiguation (ED) easily
possible. Our model is based on BERT and produces
contextualized word embeddings which are trained against a
joint MD and ED objective. We achieve state-of-the-art
results on several standard entity linking (EL) datasets.",
archivePrefix= "arXiv",
eprint = "1911.03834",
primaryClass = "cs.CL"
}
@article{martins-2019-joint-ner-el,
author = "Martins, Pedro Henrique and Marinho, Zita and Martins,
Andr{\'e} F. T.",
title = "Joint Learning of Named Entity Recognition and Entity
Linking",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1907.08243v1",
abstract = "Named entity recognition (NER) and entity linking (EL) are
two fundamentally related tasks, since in order to perform
EL, first the mentions to entities have to be
detected. However, most entity linking approaches disregard
the mention detection part, assuming that the correct
mentions have been previously detected. In this paper, we
perform joint learning of NER and EL to leverage their
relatedness and obtain a more robust and generalisable
system. For that, we introduce a model inspired by the
Stack-LSTM approach (Dyer et al., 2015). We observe that, in
fact, doing multi-task learning of NER and EL improves the
performance in both tasks when comparing with models trained
with individual objectives. Furthermore, we achieve results
competitive with the state-of-the-art in both NER and EL.",
archivePrefix= "arXiv",
eprint = "1907.08243",
primaryClass = "cs.CL"
}
@inproceedings{logeswaran-2019-zero-shot-el,
title = "Zero-Shot Entity Linking by Reading Entity Descriptions",
author = "Logeswaran, Lajanugen and Chang, Ming-Wei and Lee, Kenton and
Toutanova, Kristina and Devlin, Jacob and Lee, Honglak",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1335",
doi = "10.18653/v1/P19-1335",
pages = "3449-3460",
abstract = "We present the zero-shot entity linking task, where mentions
must be linked to unseen entities without in-domain labeled
data. The goal is to enable robust transfer to highly
specialized domains, and so no metadata or alias tables are
assumed. In this setting, entities are only identified by
text descriptions, and models must rely strictly on language
understanding to resolve the new entities. First, we show
that strong reading comprehension models pre-trained on large
unlabeled data can be used to generalize to unseen
entities. Second, we propose a simple and effective adaptive
pre-training strategy, which we term domain-adaptive
pre-training (DAP), to address the domain shift problem
associated with linking unseen entities in a new domain. We
present experiments on a new dataset that we construct for
this task and show that DAP improves over strong pre-training
baselines, including BERT. The data and code are available at
https://github.com/lajanugen/zeshel."
}
@inproceedings{le-2019-distant-el,
title = "Distant Learning for Entity Linking with Automatic Noise
Detection",
author = "Le, Phong and Titov, Ivan",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1400",
doi = "10.18653/v1/P19-1400",
pages = "4081-4090",
abstract = "Accurate entity linkers have been produced for domains and
languages where annotated data (i.e., texts linked to a
knowledge base) is available. However, little progress has
been made for the settings where no or very limited amounts
of labeled data are present (e.g., legal or most scientific
domains). In this work, we show how we can learn to link
mentions without having any labeled examples, only a
knowledge base and a collection of unannotated texts from the
corresponding domain. In order to achieve this, we frame the
task as a multi-instance learning problem and rely on surface
matching to create initial noisy labels. As the learning
signal is weak and our surrogate labels are noisy, we
introduce a noise detection component in our model: it lets
the model detect and disregard examples which are likely to
be noisy. Our method, jointly learning to detect noise and
link entities, greatly outperforms the surface matching
baseline. For a subset of entity categories, it even
approaches the performance of supervised learning."
}
@inproceedings{mondal-2019-triplet-network-el,
title = "Medical Entity Linking using Triplet Network",
author = "Mondal, Ishani and Purkayastha, Sukannya and Sarkar, Sudeshna
and Goyal, Pawan and Pillai, Jitesh and Bhattacharyya,
Amitava and Gattu, Mahanandeeshwar",
booktitle = "Proceedings of the 2nd Clinical Natural Language Processing
Workshop",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W19-1912",
doi = "10.18653/v1/W19-1912",
pages = "95-100",
abstract = "Entity linking (or Normalization) is an essential task in
text mining that maps the entity mentions in the medical text
to standard entities in a given Knowledge Base (KB). This
task is of great importance in the medical domain. It can
also be used for merging different medical and clinical
ontologies. In this paper, we focus on the problem of
disease linking or normalization. This task is executed in
two phases: candidate generation and candidate scoring. In
this paper, we present an approach to rank the candidate
Knowledge Base entries based on their similarity with disease
mention. We make use of the Triplet Network for candidate
ranking. While the existing methods have used carefully
generated sieves and external resources for candidate
generation, we introduce a robust and portable candidate
generation scheme that does not make use of the hand-crafted
rules. Experimental results on the standard benchmark NCBI
disease dataset demonstrate that our system outperforms the
prior methods by a significant margin."
}
@article{yang-2019-dca,
author = "Yang, Xiyuan and Gu, Xiaotao and Lin, Sheng and Tang, Siliang
and Zhuang, Yueting and Wu, Fei and Chen, Zhigang and Hu,
Guoping and Ren, Xiang",
title = "Learning Dynamic Context Augmentation for Global Entity
Linking",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1909.02117v1",
abstract = "Despite of the recent success of collective entity linking
(EL) methods, these ``global`` inference methods may yield
sub-optimal results when the ``all-mention coherence``
assumption breaks, and often suffer from high computational
cost at the inference stage, due to the complex search
space. In this paper, we propose a simple yet effective
solution, called Dynamic Context Augmentation (DCA), for
collective EL, which requires only one pass through the
mentions in a document. DCA sequentially accumulates context
information to make efficient, collective inference, and can
cope with different local EL models as a plug-and-enhance
module. We explore both supervised and reinforcement learning
strategies for learning the DCA model. Extensive experiments
show the effectiveness of our model with different learning
settings, base models, decision orders and attention
mechanisms.",
archivePrefix= "arXiv",
eprint = "1909.02117",
primaryClass = "cs.CL"
}
@inproceedings{murty-2018-hierarchical-losses,
title = "Hierarchical Losses and New Resources for Fine-grained Entity
Typing and Linking",
author = "Murty, Shikhar and Verga, Patrick and Vilnis, Luke and
Radovanovic, Irena and McCallum, Andrew",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1010",
doi = "10.18653/v1/P18-1010",
pages = "97-109",
abstract = "Extraction from raw text to a knowledge base of entities and
fine-grained types is often cast as prediction into a flat
set of entity and type labels, neglecting the rich
hierarchies over types and entities contained in curated
ontologies. Previous attempts to incorporate hierarchical
structure have yielded little benefit and are restricted to
shallow ontologies. This paper presents new methods using
real and complex bilinear mappings for integrating
hierarchical information, yielding substantial improvement
over flat predictions in entity linking and fine-grained
entity typing, and achieving new state-of-the-art results for
end-to-end models on the benchmark FIGER dataset. We also
present two new human-annotated datasets containing wide and
deep hierarchies which we will release to the community to
encourage further research in this direction:
\textit{MedMentions}, a collection of PubMed abstracts in
which 246k mentions have been mapped to the massive UMLS
ontology; and \textit{TypeNet}, which aligns Freebase types
with the WordNet hierarchy to obtain nearly 2k entity
types. In experiments on all three datasets we show
substantial gains from hierarchy-aware training."
}
@inproceedings{zhong-2018-colink,
title = "Colink: An unsupervised framework for user identity linkage",
author = "Zhong, Zexuan and Cao, Yong and Guo, Mu and Nie, Zaiqing",
booktitle = "Thirty-Second AAAI Conference on Artificial Intelligence",
year = 2018
}
@inproceedings{du-2019-extract-symptoms,
title = "Extracting Symptoms and their Status from Clinical
Conversations",
author = "Du, Nan and Chen, Kai and Kannan, Anjuli and Tran, Linh and
Chen, Yuhui and Shafran, Izhak",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1087",
doi = "10.18653/v1/P19-1087",
pages = "915-925",
abstract = "This paper describes novel models tailored for a new
application, that of extracting the symptoms mentioned in
clinical conversations along with their status. Lack of any
publicly available corpus in this privacy-sensitive domain
led us to develop our own corpus, consisting of about 3K
conversations annotated by professional medical scribes. We
propose two novel deep learning approaches to infer the
symptom names and their status: (1) a new hierarchical
span-attribute tagging (SA-T) model, trained using curriculum
learning, and (2) a variant of sequence-to-sequence model
which decodes the symptoms and their status from a few
speaker turns within a sliding window over the
conversation. This task stems from a realistic application of
assisting medical providers in capturing symptoms mentioned
by patients from their clinical conversations. To reflect
this application, we define multiple metrics. From
inter-rater agreement, we find that the task is inherently
difficult. We conduct comprehensive evaluations on several
contrasting conditions and observe that the performance of
the models ranges from an F-score of 0.5 to 0.8 depending on
the condition. Our analysis not only reveals the inherent
challenges of the task, but also provides useful directions
to improve the models."
}
@article{sarrouti-2020-sembionlqa,
title = "SemBioNLQA: A semantic biomedical question answering system
for retrieving exact and ideal answers to natural language
questions",
journal = "Artificial Intelligence in Medicine",
volume = 102,
pages = 101767,
year = 2020,
issn = "0933-3657",
doi = "https://doi.org/10.1016/j.artmed.2019.101767",
url =
"http://www.sciencedirect.com/science/article/pii/S0933365718302756",
author = "Mourad Sarrouti and Said [Ouatik El Alaoui]",
keywords = "Biomedical question answering, Information retrieval, Passage
retrieval, Natural language processing, Machine learning,
Biomedical informatics, BioASQ",
abstract = "Background and objective Question answering (QA), the
identification of short, accurate answers to users' questions
written in natural language expressions, is a longstanding
issue widely studied over the last decades in the
open-domain. However, it still remains a real challenge in
the biomedical domain, as most of the existing systems
support a limited number of question and answer types and
still require further effort to improve their precision on
the supported questions. Here, we present a semantic
biomedical QA system
named SemBioNLQA which has the ability to handle the kinds of
yes/no, factoid, list, and summary natural language
questions. Methods This paper describes the system
architecture and an evaluation of the developed end-to-end
biomedical QA system named SemBioNLQA, which consists of
question classification, document retrieval, passage
retrieval and answer extraction modules. It takes natural
language questions as input, and outputs both short precise
answers and summaries as results. The SemBioNLQA system,
dealing with four types of questions, is based on (1)
handcrafted lexico-syntactic patterns and a machine learning
algorithm for question classification, (2) PubMed search
engine and UMLS similarity for document retrieval, (3) the
BM25 model, stemmed words and UMLS concepts for passage
retrieval, and (4) UMLS metathesaurus, BioPortal synonyms,
sentiment analysis and term frequency metric for answer
extraction. Results and conclusion Compared with the current
state-of-the-art biomedical QA systems, SemBioNLQA, a fully
automated system, has the potential to deal with a large
number of question and answer types. SemBioNLQA quickly
satisfies users' information needs by returning exact answers
(e.g., “yes”, “no”, a biomedical entity name, etc.) and ideal
answers (i.e., paragraph-sized summaries of relevant
information) for yes/no, factoid and list questions, whereas
it provides only the ideal answers for summary
questions. Moreover, experimental evaluations performed on
biomedical questions and answers provided by the BioASQ
challenge especially in 2015, 2016 and 2017 (as part of our
participation), show that SemBioNLQA achieves good
performances compared with the most current state-of-the-art
systems and allows a practical and competitive alternative to
help information seekers find exact and ideal answers to
their biomedical questions. The SemBioNLQA source code is
publicly available at
https://github.com/sarrouti/sembionlqa."
}
@article{demner-fushman-2019-health-qa,
title = "Consumer health information and question answering: helping
consumers find answers to their health-related information
needs",
author = "Dina Demner-Fushman and Yassine Mrabet and Asma Ben Abacha",
journal = "Journal of the American Medical Informatics Association :
JAMIA",
year = 2019
}
@inproceedings{lin-2019-symptom-graph,
title = "Enhancing Dialogue Symptom Diagnosis with Global Attention
and Symptom Graph",
author = "Lin, Xinzhu and He, Xiahui and Chen, Qin and Tou, Huaixiao
and Wei, Zhongyu and Chen, Ting",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1508",
doi = "10.18653/v1/D19-1508",
pages = "5033-5042",
abstract = "Symptom diagnosis is a challenging yet profound problem in
natural language processing. Most previous research focuses on
investigating the standard electronic medical records for
symptom diagnosis, while the dialogues between doctors and
patients, which contain richer information, are not well
studied. In this paper, we first construct a dialogue symptom
diagnosis dataset based on an online medical forum with a
large amount of dialogues between patients and doctors. Then,
we provide some benchmark models on this dataset to boost the
research of dialogue symptom diagnosis. In order to further
enhance the performance of symptom diagnosis over dialogues,
we propose a global attention mechanism to capture more
symptom related information, and build a symptom graph to
model the associations between symptoms rather than treating
each symptom independently. Experimental results show that
both the global attention and symptom graph are effective to
boost dialogue symptom diagnosis. In particular, our proposed
model achieves the state-of-the-art performance on the
constructed dataset."
}
@inproceedings{dusek-2016-context-aware,
title = "A Context-aware Natural Language Generator for Dialogue
Systems",
author = "Du{\v{s}}ek, Ond{\v{r}}ej and Jur{\v{c}}{\'\i}{\v{c}}ek,
Filip",
booktitle = "Proceedings of the 17th Annual Meeting of the Special
Interest Group on Discourse and Dialogue",
month = sep,
year = 2016,
address = "Los Angeles",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W16-3622",
doi = "10.18653/v1/W16-3622",
pages = "185-190"
}
@inproceedings{ghosal-2019-dialogue-gcn,
title = "{D}ialogue{GCN}: A Graph Convolutional Neural Network for
Emotion Recognition in Conversation",
author = "Ghosal, Deepanway and Majumder, Navonil and Poria, Soujanya
and Chhaya, Niyati and Gelbukh, Alexander",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1015",
doi = "10.18653/v1/D19-1015",
pages = "154-164",
abstract = "Emotion recognition in conversation (ERC) has received much
attention lately from researchers due to its potential
widespread applications in diverse areas, such as
health-care, education, and human resources. In this paper,
we present Dialogue Graph Convolutional Network
(DialogueGCN), a graph neural network based approach to
ERC. We leverage self and inter-speaker dependency of the
interlocutors to model conversational context for emotion
recognition. Through the graph network, DialogueGCN addresses
context propagation issues present in the current RNN-based
methods. We empirically show that this method alleviates such
issues, while outperforming the current state of the art on a
number of benchmark emotion classification datasets."
}
@inproceedings{chen-2019-working-memory,
title = "A Working Memory Model for Task-oriented Dialog Response
Generation",
author = "Chen, Xiuyi and Xu, Jiaming and Xu, Bo",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1258",
doi = "10.18653/v1/P19-1258",
pages = "2687-2693",
abstract = "Recently, to incorporate external Knowledge Base (KB)
information, one form of world knowledge, several end-to-end
task-oriented dialog systems have been proposed. These
models, however, tend to confound the dialog history with KB
tuples and simply store them into one memory. Inspired by the
psychological studies on working memory, we propose a working
memory model (WMM2Seq) for dialog response generation. Our
WMM2Seq adopts a working memory to interact with two
separated long-term memories, which are the episodic memory
for memorizing dialog history and the semantic memory for
storing KB tuples. The working memory consists of a central
executive to attend to the aforementioned memories, and a
short-term storage system to store the {``}activated{''}
contents from the long-term memories. Furthermore, we
introduce a context-sensitive perceptual process for the
token representations of dialog history, and then feed them
into the episodic memory. Extensive experiments on two
task-oriented dialog datasets demonstrate that our WMM2Seq
significantly outperforms the state-of-the-art results in
several evaluation metrics."
}
@inproceedings{su-2019-utterance-rewriter,
title = "Improving Multi-turn Dialogue Modelling with Utterance
{R}e{W}riter",
author = "Su, Hui and Shen, Xiaoyu and Zhang, Rongzhi and Sun, Fei and
Hu, Pengwei and Niu, Cheng and Zhou, Jie",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1003",
doi = "10.18653/v1/P19-1003",
pages = "22-31",
abstract = "Recent research has achieved impressive results in
single-turn dialogue modelling. In the multi-turn setting,
however, current models are still far from satisfactory. One
major challenge is the frequently occurring coreference and
information omission in our daily conversation, making it
hard for machines to understand the real intention. In this
paper, we propose rewriting the human utterance as a
pre-process to help multi-turn dialogue modelling. Each
utterance is first rewritten to recover all coreferred and
omitted information. The next processing steps are then
performed based on the rewritten utterance. To properly train
the utterance rewriter, we collect a new dataset with human
annotations and introduce a Transformer-based utterance
rewriting architecture using the pointer network. We show the
proposed architecture achieves remarkably good performance on
the utterance rewriting task. The trained utterance rewriter
can be easily integrated into online chatbots and brings
general improvement over different domains."
}
@inproceedings{ippolito-2019-decoding-methods,
title = "Comparison of Diverse Decoding Methods from Conditional
Language Models",
author = "Ippolito, Daphne and Kriz, Reno and Sedoc, Jo{\~a}o and
Kustikova, Maria and Callison-Burch, Chris",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1365",
doi = "10.18653/v1/P19-1365",
pages = "3752-3762",
abstract = "While conditional language models have greatly improved in
their ability to output high quality natural language, many
NLP applications benefit from being able to generate a
diverse set of candidate sequences. Diverse decoding
strategies aim to, within a given-sized candidate list, cover
as much of the space of high-quality outputs as possible,
leading to improvements for tasks that rerank and combine
candidate outputs. Standard decoding methods, such as beam
search, optimize for generating high likelihood sequences
rather than diverse ones, though recent work has focused on
increasing diversity in these methods. In this work, we
perform an extensive survey of decoding-time strategies for
generating diverse outputs from a conditional language
model. In addition, we present a novel method where we
over-sample candidates, then use clustering to remove similar
sequences, thus achieving high diversity without sacrificing
quality."
}
@inproceedings{qian-2019-daml,
title = "Domain Adaptive Dialog Generation via Meta Learning",
author = "Qian, Kun and Yu, Zhou",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1253",
doi = "10.18653/v1/P19-1253",
pages = "2639-2649",
abstract = "Domain adaptation is an essential task in dialog system
building because there are so many new dialog tasks created
for different needs every day. Collecting and annotating
training data for these new tasks is costly since it involves
real user interactions. We propose a domain adaptive dialog
generation method based on meta-learning (DAML). DAML is an
end-to-end trainable dialog system model that learns from
multiple rich-resource tasks and then adapts to new domains
with minimal training samples. We train a dialog system model
using multiple rich-resource single-domain dialog data by
applying the model-agnostic meta-learning algorithm to dialog
domain. The model is capable of learning a competitive dialog
system on a new domain with only a few training examples in
an efficient manner. The two-step gradient updates in DAML
enable the model to learn general features across multiple
tasks. We evaluate our method on a simulated dialog dataset
and achieve state-of-the-art performance, which is
generalizable to new tasks."
}
@inproceedings{sankar-2019-conversation-history,
title = "Do Neural Dialog Systems Use the Conversation History
Effectively? An Empirical Study",
author = "Sankar, Chinnadhurai and Subramanian, Sandeep and Pal, Chris
and Chandar, Sarath and Bengio, Yoshua",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1004",
doi = "10.18653/v1/P19-1004",
pages = "32-37",
abstract = "Neural generative models have been become increasingly
popular when building conversational agents. They offer
flexibility, can be easily adapted to new domains, and
require minimal domain engineering. A common criticism of
these systems is that they seldom understand or use the
available dialog history effectively. In this paper, we take
an empirical approach to understanding how these models use
the available dialog history by studying the sensitivity of
the models to artificially introduced unnatural changes or
perturbations to their context at test time. We experiment
with 10 different types of perturbations on 4 multi-turn
dialog datasets and find that commonly used neural dialog
architectures like recurrent and transformer-based seq2seq
models are rarely sensitive to most perturbations such as
missing or reordering utterances, shuffling words, etc. Also,
by open-sourcing our code, we believe that it will serve as a
useful diagnostic tool for evaluating dialog systems in the
future."
}
@inproceedings{quan-2019-gecor,
title = "{GECOR}: An End-to-End Generative Ellipsis and Co-reference
Resolution Model for Task-Oriented Dialogue",
author = "Quan, Jun and Xiong, Deyi and Webber, Bonnie and Hu,
Changjian",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1462",
doi = "10.18653/v1/D19-1462",
pages = "4547-4557",
abstract = "Ellipsis and co-reference are common and ubiquitous
especially in multi-turn dialogues. In this paper, we treat
the resolution of ellipsis and co-reference in dialogue as a
problem of generating omitted or referred expressions from
the dialogue context. We therefore propose a unified
end-to-end Generative Ellipsis and CO-reference Resolution
model (GECOR) in the context of dialogue. The model can
generate a new pragmatically complete user utterance by
alternating the generation and copy mode for each user
utterance. A multi-task learning framework is further
proposed to integrate the GECOR into an end-to-end
task-oriented dialogue. In order to train both the GECOR and
the multi-task learning framework, we manually construct a
new dataset on the basis of the public dataset CamRest676
with both ellipsis and co-reference annotation. On this
dataset, intrinsic evaluations on the resolution of ellipsis
and co-reference show that the GECOR model significantly
outperforms the sequence-to-sequence (seq2seq) baseline model
in terms of EM, BLEU and F1 while extrinsic evaluations on
the downstream dialogue task demonstrate that our multi-task
learning framework with GECOR achieves a higher success rate
of task completion than TSCP, a state-of-the-art end-to-end
task-oriented dialogue model."
}
@inproceedings{zhao-2018-zsdg,
title = "Zero-Shot Dialog Generation with Cross-Domain Latent Actions",
author = "Zhao, Tiancheng and Eskenazi, Maxine",
booktitle = "Proceedings of the 19th Annual {SIG}dial Meeting on Discourse
and Dialogue",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W18-5001",
doi = "10.18653/v1/W18-5001",
pages = "1-10",
abstract = "This paper introduces zero-shot dialog generation (ZSDG), as
a step towards neural dialog systems that can instantly
generalize to new situations with minimum data. ZSDG requires
an end-to-end generative dialog system to generalize to a new
domain for which only a domain description is provided and no
training dialogs are available. Then a novel learning
framework, Action Matching, is proposed. This algorithm can
learn a cross-domain embedding space that models the
semantics of dialog responses which in turn, enables a neural
dialog generation model to generalize to new domains. We
evaluate our methods on two datasets, a new synthetic dialog
dataset, and an existing human-human multi-domain dialog
dataset. Experimental results show that our method is able to
achieve superior performance in learning dialog models that
can rapidly adapt their behavior to new domains and suggests
promising future research."
}
@article{zhao-2018-unsupervised-dg,
author = "Tiancheng Zhao and Kyusong Lee and Maxine Esk{\'{e}}nazi",
title = "Unsupervised Discrete Sentence Representation Learning for
Interpretable Neural Dialog Generation",
journal = "CoRR",
volume = "abs/1804.08069",
year = 2018,
url = "http://arxiv.org/abs/1804.08069",
archivePrefix= "arXiv",
eprint = "1804.08069",
timestamp = "Mon, 13 Aug 2018 16:46:01 +0200",
biburl = "https://dblp.org/rec/journals/corr/abs-1804-08069.bib",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@inproceedings{shalyminov-2019-few-shot-dg,
title = "Few-Shot Dialogue Generation Without Annotated Data: A
Transfer Learning Approach",
author = "Shalyminov, Igor and Lee, Sungjin and Eshghi, Arash and
Lemon, Oliver",
booktitle = "Proceedings of the 20th Annual SIGdial Meeting on Discourse
and Dialogue",
month = sep,
year = 2019,
address = "Stockholm, Sweden",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W19-5904",
doi = "10.18653/v1/W19-5904",
pages = "32-39",
abstract = "Learning with minimal data is one of the key challenges in
the development of practical, production-ready goal-oriented
dialogue systems. In a real-world enterprise setting where
dialogue systems are developed rapidly and are expected to
work robustly for an ever-growing variety of domains,
products, and scenarios, efficient learning from a limited
number of examples becomes indispensable. In this paper, we
introduce a technique to achieve state-of-the-art dialogue
generation performance in a few-shot setup, without using any
annotated data. We do this by leveraging background knowledge
from a larger, more highly represented dialogue source {---}
namely, the MetaLWOz dataset. We evaluate our model on the
Stanford Multi-Domain Dialogue Dataset, consisting of
human-human goal-oriented dialogues in in-car navigation,
appointment scheduling, and weather information domains. We
show that our few-shot approach achieves state-of-the-art
results on that dataset by consistently outperforming the
previous best model in terms of BLEU and Entity F1 scores,
while being more data-efficient than it by not requiring any
data annotation."
}
@inproceedings{lei-2018-sequicity,
title = "{S}equicity: Simplifying Task-oriented Dialogue Systems with
Single Sequence-to-Sequence Architectures",
author = "Lei, Wenqiang and Jin, Xisen and Kan, Min-Yen and Ren,
Zhaochun and He, Xiangnan and Yin, Dawei",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1133",
doi = "10.18653/v1/P18-1133",
pages = "1437-1447",
abstract = "Existing solutions to task-oriented dialogue systems follow
pipeline designs which introduces architectural complexity
and fragility. We propose a novel, holistic, extendable
framework based on a single sequence-to-sequence (seq2seq)
model which can be optimized with supervised or reinforcement
learning. A key contribution is that we design text spans
named belief spans to track dialogue beliefs, allowing
task-oriented dialogue systems to be modeled in a seq2seq
way. Based on this, we propose a simplistic Two Stage CopyNet
instantiation which demonstrates good scalability:
significantly reducing model complexity in terms of number of
parameters and training time by an order of magnitude. It significantly
outperforms state-of-the-art pipeline-based methods on large
datasets and retains a satisfactory entity match rate on
out-of-vocabulary (OOV) cases where pipeline-designed
competitors totally fail."
}
@article{liu-2019-nmrc-methods,
author = "Liu, Shanshan and Zhang, Xin and Zhang, Sheng and Wang, Hui
and Zhang, Weiming",
title = "Neural Machine Reading Comprehension: Methods and Trends",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1907.01118v5",
abstract = "Machine reading comprehension (MRC), which requires a machine
to answer questions based on a given context, has attracted
increasing attention with the incorporation of various
deep-learning techniques over the past few years. Although
research on MRC based on deep learning is flourishing, there
remains a lack of a comprehensive survey summarizing existing
approaches and recent trends, which motivated the work
presented in this article. Specifically, we give a thorough
review of this research field, covering different aspects
including (1) typical MRC tasks: their definitions,
differences, and representative datasets; (2) the general
architecture of neural MRC: the main modules and prevalent
approaches to each; and (3) new trends: some emerging areas
in neural MRC as well as the corresponding
challenges. Finally, considering what has been achieved so
far, the survey also envisages what the future may hold by
discussing the open issues left to be addressed.",
archivePrefix= "arXiv",
eprint = "1907.01118",
primaryClass = "cs.CL"
}
@phdthesis{chen-2018-nrc-beyond,
title = "Neural reading comprehension and beyond",
author = "Chen, Danqi",
year = 2018,
school = "Stanford University"
}
@inproceedings{trotman-2014-improve-bm25,
author = "Trotman, Andrew and Puurula, Antti and Burgess, Blake",
title = "Improvements to BM25 and Language Models Examined",
year = 2014,
isbn = 9781450330008,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2682862.2682863",
doi = "10.1145/2682862.2682863",
booktitle = "Proceedings of the 2014 Australasian Document Computing
Symposium",
pages = "58–65",
numpages = 8,
keywords = "Procrastination, Document Retrieval, Relevance Ranking",
location = "Melbourne, VIC, Australia",
series = "ADCS ’14"
}
@article{nogueira-2019-bert-re-ranking,
author = "Nogueira, Rodrigo and Cho, Kyunghyun",
title = "Passage Re-Ranking With Bert",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1901.04085v5",
abstract = "Recently, neural models pretrained on a language modeling
task, such as ELMo (Peters et al., 2017), OpenAI GPT (Radford
et al., 2018), and BERT (Devlin et al., 2018), have achieved
impressive results on various natural language processing
tasks such as question-answering and natural language
inference. In this paper, we describe a simple
re-implementation of BERT for query-based passage
re-ranking. Our system is the state of the art on the
TREC-CAR dataset and the top entry in the leaderboard of the
MS MARCO passage retrieval task, outperforming the previous
state of the art by 27 \% (relative) in MRR@10. The code to
reproduce our results is available at
https://github.com/nyu-dl/dl4marco-bert",
archivePrefix= "arXiv",
eprint = "1901.04085",
primaryClass = "cs.IR"
}
@article{bajaj-2016-ms-marco,
author = "Bajaj, Payal and Campos, Daniel and Craswell, Nick and Deng,
Li and Gao, Jianfeng and Liu, Xiaodong and Majumder, Rangan
and McNamara, Andrew and Mitra, Bhaskar and Nguyen, Tri and
Rosenberg, Mir and Song, Xia and Stoica, Alina and Tiwary,
Saurabh and Wang, Tong",
title = "Ms Marco: a Human Generated Machine Reading Comprehension
Dataset",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1611.09268v3",
abstract = "We introduce a large scale MAchine Reading COmprehension
dataset, which we name MS MARCO. The dataset comprises
1,010,916 anonymized questions---sampled from Bing's search
query logs---each with a human generated answer and 182,669
completely human rewritten generated answers. In addition,
the dataset contains 8,841,823 passages---extracted from
3,563,535 web documents retrieved by Bing---that provide the
information necessary for curating the natural language
answers. A question in the MS MARCO dataset may have multiple
answers or no answers at all. Using this dataset, we propose
three different tasks with varying levels of difficulty: (i)
predict if a question is answerable given a set of context
passages, and extract and synthesize the answer as a human
would (ii) generate a well-formed answer (if possible) based
on the context passages that can be understood with the
question and passage context, and finally (iii) rank a set of
retrieved passages given a question. The size of the dataset
and the fact that the questions are derived from real user
search queries distinguish MS MARCO from other well-known
publicly available datasets for machine reading comprehension
and question-answering. We believe that the scale and the
real-world nature of this dataset makes it attractive for
benchmarking machine reading comprehension and
question-answering models.",
archivePrefix= "arXiv",
eprint = "1611.09268",
primaryClass = "cs.CL"
}
@article{qiao-2019-bert-re-ranking,
author = "Qiao, Yifan and Xiong, Chenyan and Liu, Zhenghao and Liu,
Zhiyuan",
title = "Understanding the Behaviors of Bert in Ranking",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1904.07531v4",
abstract = "This paper studies the performances and behaviors of BERT in
ranking tasks. We explore several different ways to leverage
the pre-trained BERT and fine-tune it on two ranking tasks:
MS MARCO passage reranking and TREC Web Track ad hoc document
ranking. Experimental results on MS MARCO demonstrate the
strong effectiveness of BERT in question-answering focused
passage ranking tasks, as well as the fact that BERT is a
strong interaction-based seq2seq matching model. Experimental
results on TREC show the gaps between the BERT pre-trained on
surrounding contexts and the needs of ad hoc document
ranking. Analyses illustrate how BERT allocates its
attentions between query-document tokens in its Transformer
layers, how it prefers semantic matches between paraphrase
tokens, and how that differs with the soft match patterns
learned by a click-trained neural ranker.",
archivePrefix= "arXiv",
eprint = "1904.07531",
primaryClass = "cs.IR"
}
@article{pei-2019-re-ranking-recommendation,
author = "Pei, Changhua and Zhang, Yi and Zhang, Yongfeng and Sun, Fei
and Lin, Xiao and Sun, Hanxiao and Wu, Jian and Jiang, Peng
and Ou, Wenwu",
title = "Personalized Re-Ranking for Recommendation",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1904.06813v3",
abstract = "Ranking is a core task in recommender systems, which aims at
providing an ordered list of items to users. Typically, a
ranking function is learned from the labeled dataset to
optimize the global performance, which produces a ranking
score for each individual item. However, it may be
sub-optimal because the scoring function applies to each item
individually and does not explicitly consider the mutual
influence between items, as well as the differences of users'
preferences or intents. Therefore, we propose a personalized
re-ranking model for recommender systems. The proposed
re-ranking model can be easily deployed as a follow-up
module after any ranking algorithm, by directly using the
existing ranking feature vectors. It directly optimizes the
whole recommendation list by employing a transformer
structure to efficiently encode the information of all items
in the list. Specifically, the Transformer applies a
self-attention mechanism that directly models the global
relationships between any pair of items in the whole list. We
confirm that the performance can be further improved by
introducing pre-trained embedding to learn personalized
encoding functions for different users. Experimental results
on both offline benchmarks and real-world online e-commerce
systems demonstrate the significant improvements of the
proposed re-ranking model.",
archivePrefix= "arXiv",
eprint = "1904.06813",
primaryClass = "cs.IR"
}
@inproceedings{kratzwald-2019-rankqa,
title = "{R}ank{QA}: Neural Question Answering with Answer Re-Ranking",
author = "Kratzwald, Bernhard and Eigenmann, Anna and Feuerriegel,
Stefan",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2019,
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1611",
doi = "10.18653/v1/P19-1611",
pages = "6076-6085",
abstract = "The conventional paradigm in neural question answering (QA)
for narrative content is limited to a two-stage process:
first, relevant text passages are retrieved and,
subsequently, a neural network for machine comprehension
extracts the likeliest answer. However, both stages are
largely isolated in the status quo and, hence, information
from the two phases is never properly fused. In contrast,
this work proposes RankQA: RankQA extends the conventional
two-stage process in neural QA with a third stage that
performs an additional answer re-ranking. The re-ranking
leverages different features that are directly extracted from
the QA pipeline, i.e., a combination of retrieval and
comprehension features. While our intentionally simple design
allows for an efficient, data-sparse estimation, it
nevertheless outperforms more complex QA systems by a
significant margin: in fact, RankQA achieves state-of-the-art
performance on 3 out of 4 benchmark datasets. Furthermore,
its performance is especially superior in settings where the
size of the corpus is dynamic. Here the answer re-ranking
provides an effective remedy against the underlying
noise-information trade-off due to a variable corpus size. As
a consequence, RankQA represents a novel, powerful, and thus
challenging baseline for future research in content-based
QA."
}
@article{guu-2020-realm,
author = "Guu, Kelvin and Lee, Kenton and Tung, Zora and Pasupat,
Panupong and Chang, Ming-Wei",
title = "Realm: Retrieval-Augmented Language Model Pre-Training",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2002.08909v1",
abstract = "Language model pre-training has been shown to capture a
surprising amount of world knowledge, crucial for NLP tasks
such as question answering. However, this knowledge is stored
implicitly in the parameters of a neural network, requiring
ever-larger networks to cover more facts. To capture
knowledge in a more modular and interpretable way, we augment
language model pre-training with a latent knowledge
retriever, which allows the model to retrieve and attend over
documents from a large corpus such as Wikipedia, used during
pre-training, fine-tuning and inference. For the first time,
we show how to pre-train such a knowledge retriever in an
unsupervised manner, using masked language modeling as the
learning signal and backpropagating through a retrieval step
that considers millions of documents. We demonstrate the
effectiveness of Retrieval-Augmented Language Model
pre-training (REALM) by fine-tuning on the challenging task
of Open-domain Question Answering (Open-QA). We compare
against state-of-the-art models for both explicit and
implicit knowledge storage on three popular Open-QA
benchmarks, and find that we outperform all previous methods
by a significant margin (4-16 \% absolute accuracy), while
also providing qualitative benefits such as interpretability
and modularity.",
archivePrefix= "arXiv",
eprint = "2002.08909",
primaryClass = "cs.CL"
}
@article{yang-2019-bert-ad-hoc-doc,
author = "Yang, Wei and Zhang, Haotian and Lin, Jimmy",
title = "Simple Applications of Bert for Ad Hoc Document Retrieval",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1903.10972v1",
abstract = "Following recent successes in applying BERT to question
answering, we explore simple applications to ad hoc document
retrieval. This required confronting the challenge posed by
documents that are typically longer than the length of input
BERT was designed to handle. We address this issue by
applying inference on sentences individually, and then
aggregating sentence scores to produce document
scores. Experiments on TREC microblog and newswire test
collections show that our approach is simple yet effective,
as we report the highest average precision on these datasets
by neural approaches that we are aware of.",
archivePrefix= "arXiv",
eprint = "1903.10972",
primaryClass = "cs.IR"
}
@article{kowsari-2017-hdltex,
author = "Kowsari, Kamran and Brown, Donald E. and Heidarysafa, Mojtaba
and Meimandi, Kiana Jafari and Gerber, Matthew S. and Barnes,
Laura E.",
title = "Hdltex: Hierarchical Deep Learning for Text Classification",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1709.08267v2",
abstract = "The continually increasing number of documents produced each
year necessitates ever improving information processing
methods for searching, retrieving, and organizing
text. Central to these information processing methods is
document classification, which has become an important
application for supervised learning. Recently the performance
of these traditional classifiers has degraded as the number
of documents has increased. This is because along with this
growth in the number of documents has come an increase in the
number of categories. This paper approaches this problem
differently from current document classification methods that
view the problem as multi-class classification. Instead we
perform hierarchical classification using an approach we call
Hierarchical Deep Learning for Text classification
(HDLTex). HDLTex employs stacks of deep learning
architectures to provide specialized understanding at each
level of the document hierarchy.",
archivePrefix= "arXiv",
eprint = "1709.08267",
primaryClass = "cs.LG"
}
@article{shen-2014-entity-linking-solution,
title = "Entity linking with a knowledge base: Issues, techniques, and
solutions",
author = "Shen, Wei and Wang, Jianyong and Han, Jiawei",
journal = "IEEE Transactions on Knowledge and Data Engineering",
volume = 27,
number = 2,
pages = "443-460",
year = 2014,
publisher = "IEEE"
}
@inproceedings{ehrlinger-2016-kg-definition,
title = "Towards a Definition of Knowledge Graphs",
author = "Ehrlinger, Lisa and W{\"o}{\ss}, Wolfram",
booktitle = "SEMANTiCS (Posters, Demos, SuCCESS)",
year = 2016
}
@article{shen-2005-pairwise,
title = "Ranking and reranking with perceptron",
author = "Shen, Libin and Joshi, Aravind K",
journal = "Machine Learning",
volume = 60,
number = "1-3",
pages = "73-96",
year = 2005,
publisher = "Springer"
}
@inproceedings{cao-2007-listwise,
title = "Learning to rank: from pairwise approach to listwise
approach",
author = "Cao, Zhe and Qin, Tao and Liu, Tie-Yan and Tsai, Ming-Feng
and Li, Hang",
booktitle = "Proceedings of the 24th international conference on Machine
learning",
pages = "129-136",
year = 2007
}
@inproceedings{zheng-2010-learn-link,
title = "Learning to Link Entities with Knowledge Base",
author = "Zheng, Zhicheng and Li, Fangtao and Huang, Minlie and Zhu,
Xiaoyan",
booktitle = "Human Language Technologies: The 2010 Annual Conference of
the North {A}merican Chapter of the Association for
Computational Linguistics",
month = jun,
year = 2010,
address = "Los Angeles, California",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N10-1072",
pages = "483-491"
}
@inproceedings{chen-2011-collaborative-ranking,
title = "Collaborative Ranking: A Case Study on Entity Linking",
author = "Chen, Zheng and Ji, Heng",
booktitle = "Proceedings of the 2011 Conference on Empirical Methods in
Natural Language Processing",
month = jul,
year = 2011,
address = "Edinburgh, Scotland, UK.",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D11-1071",
pages = "771-781"
}
@inproceedings{han-2011-generative-el,
title = "A Generative Entity-Mention Model for Linking Entities with
Knowledge Base",
author = "Han, Xianpei and Sun, Le",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies",
month = jun,
year = 2011,
address = "Portland, Oregon, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P11-1095",
pages = "945-954"
}
@inproceedings{ngomo-2011-limes,
title = "LIMES—a time-efficient approach for large-scale link
discovery on the web of data",
author = "Ngomo, Axel-Cyrille Ngonga and Auer, S{\"o}ren",
booktitle = "Twenty-Second International Joint Conference on Artificial
Intelligence",
year = 2011
}
@article{sil-2017-cross-lingual-el,
author = "Sil, Avirup and Kundu, Gourab and Florian, Radu and Hamza,
Wael",
title = "Neural Cross-Lingual Entity Linking",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1712.01813v1",
abstract = "A major challenge in Entity Linking (EL) is making effective
use of contextual information to disambiguate mentions to
Wikipedia that might refer to different entities in different
contexts. The problem is exacerbated in cross-lingual EL, which
involves linking mentions written in non-English documents to
entries in the English Wikipedia: to compare textual clues
across languages we need to compute similarity between
textual fragments across languages. In this paper, we propose
a neural EL model that trains fine-grained similarities and
dissimilarities between the query and candidate document from
multiple perspectives, combined with convolution and tensor
networks. Further, we show that this English-trained system
can be applied, in zero-shot learning, to other languages by
making surprisingly effective use of multi-lingual
embeddings. The proposed system yields strong empirical
results, achieving state-of-the-art performance on the
English as well as the cross-lingual Spanish and Chinese TAC
2015 datasets.",
archivePrefix= "arXiv",
eprint = "1712.01813",
primaryClass = "cs.CL"
}
@inproceedings{hoffart-2011-robust-el,
title = "Robust Disambiguation of Named Entities in Text",
author = "Hoffart, Johannes and Yosef, Mohamed Amir and Bordino, Ilaria
and F{\"u}rstenau, Hagen and Pinkal, Manfred and Spaniol,
Marc and Taneva, Bilyana and Thater, Stefan and Weikum,
Gerhard",
booktitle = "Proceedings of the 2011 Conference on Empirical Methods in
Natural Language Processing",
month = jul,
year = 2011,
address = "Edinburgh, Scotland, UK.",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D11-1072",
pages = "782-792"
}
@inproceedings{sil-2013-re-ranking-joint-ner-el,
author = "Sil, Avirup and Yates, Alexander",
title = "Re-Ranking for Joint Named-Entity Recognition and Linking",
year = 2013,
isbn = 9781450322638,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2505515.2505601",
doi = "10.1145/2505515.2505601",
booktitle = "Proceedings of the 22nd ACM International Conference on
Information \& Knowledge Management",
pages = "2369–2374",
numpages = 6,
keywords = "named entity recognition, entity linking, entity
disambiguation",
location = "San Francisco, California, USA",
series = "CIKM ’13"
}
@inproceedings{guo-2013-to-link-not-to-link,
title = "To Link or Not to Link? A Study on End-to-End Tweet Entity
Linking",
author = "Guo, Stephen and Chang, Ming-Wei and Kiciman, Emre",
booktitle = "Proceedings of the 2013 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies",
month = jun,
year = 2013,
address = "Atlanta, Georgia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N13-1122",
pages = "1020-1030"
}
@inproceedings{pu-2010-structured-entities,
author = "Pu, Ken Q. and Hassanzadeh, Oktie and Drake, Richard and
Miller, Ren\'{e}e J.",
title = "Online Annotation of Text Streams with Structured Entities",
year = 2010,
isbn = 9781450300995,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/1871437.1871446",
doi = "10.1145/1871437.1871446",
booktitle = "Proceedings of the 19th ACM International Conference on
Information and Knowledge Management",
pages = "29–38",
numpages = 10,
keywords = "annotation, text stream, online, entity",
location = "Toronto, ON, Canada",
series = "CIKM ’10"
}
@inproceedings{zhang-2011-acronym-expansion-el,
author = "Zhang, Wei and Sim, Yan Chuan and Su, Jian and Tan, Chew Lim",
title = "Entity Linking with Effective Acronym Expansion, Instance
Selection and Topic Modeling",
year = 2011,
isbn = 9781577355151,
publisher = "AAAI Press",
booktitle = "Proceedings of the Twenty-Second International Joint
Conference on Artificial Intelligence - Volume Volume Three",
pages = "1909–1914",
numpages = 6,
location = "Barcelona, Catalonia, Spain",
series = "IJCAI’11"
}
@inproceedings{milne-2008-link-with-wiki,
author = "Milne, David and Witten, Ian H.",
title = "Learning to Link with Wikipedia",
year = 2008,
isbn = 9781595939913,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/1458082.1458150",
doi = "10.1145/1458082.1458150",
booktitle = "Proceedings of the 17th ACM Conference on Information and
Knowledge Management",
pages = "509–518",
numpages = 10,
keywords = "data mining, word sense disambiguation, wikipedia, semantic
annotation",
location = "Napa Valley, California, USA",
series = "CIKM ’08"
}
@inproceedings{ratinov-2011-local-global-wiki-el,
title = "Local and Global Algorithms for Disambiguation to
{W}ikipedia",
author = "Ratinov, Lev and Roth, Dan and Downey, Doug and Anderson,
Mike",
booktitle = "Proceedings of the 49th Annual Meeting of the Association for
Computational Linguistics: Human Language Technologies",
month = jun,
year = 2011,
address = "Portland, Oregon, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P11-1138",
pages = "1375-1384"
}
@inproceedings{radhakrishnan-2018-elden,
title = "{ELDEN}: Improved Entity Linking Using Densified Knowledge
Graphs",
author = "Radhakrishnan, Priya and Talukdar, Partha and Varma,
Vasudeva",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long Papers)",
month = jun,
year = 2018,
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N18-1167",
doi = "10.18653/v1/N18-1167",
pages = "1844-1853",
abstract = "Entity Linking (EL) systems aim to automatically map mentions
of an entity in text to the corresponding entity in a
Knowledge Graph (KG). Degree of connectivity of an entity in
the KG directly affects an EL system{'}s ability to correctly
link mentions in text to the entity in KG. This causes many
EL systems to perform well for entities well connected to
other entities in KG, bringing into focus the role of KG
density in EL. In this paper, we propose Entity Linking using
Densified Knowledge Graphs (ELDEN). ELDEN is an EL system
which first densifies the KG with co-occurrence statistics
from a large text corpus, and then uses the densified KG to
train entity embeddings. Entity similarity measured using
these trained entity embeddings results in improved EL. ELDEN
outperforms state-of-the-art EL systems on benchmark
datasets. Due to such densification, ELDEN performs well for
sparsely connected entities in the KG too. ELDEN{'}s approach
is simple, yet effective. We have made ELDEN{'}s code and
data publicly available."
}
@inproceedings{piccinno-2014-tagme-to-wat,
author = "Piccinno, Francesco and Ferragina, Paolo",
title = "From TagME to WAT: A New Entity Annotator",
year = 2014,
isbn = 9781450330237,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2633211.2634350",
doi = "10.1145/2633211.2634350",
booktitle = "Proceedings of the First International Workshop on Entity
Recognition \& Disambiguation",
pages = "55–62",
numpages = 8,
keywords = "graph-based algorithms, wikipedia, entity annotation, tagme",
location = "Gold Coast, Queensland, Australia",
series = "ERD ’14"
}
@inproceedings{yamada-2016-joint-learn-embedding-el,
title = "Joint Learning of the Embedding of Words and Entities for
Named Entity Disambiguation",
author = "Yamada, Ikuya and Shindo, Hiroyuki and Takeda, Hideaki and
Takefuji, Yoshiyasu",
booktitle = "Proceedings of The 20th {SIGNLL} Conference on Computational
Natural Language Learning",
month = aug,
year = 2016,
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/K16-1025",
doi = "10.18653/v1/K16-1025",
pages = "250-259"
}
@inproceedings{henzinger-2006-duplicate-web-pages,
title = "Finding near-duplicate web pages: a large-scale evaluation of
algorithms",
author = "Henzinger, Monika",
booktitle = "Proceedings of the 29th annual international ACM SIGIR
conference on Research and development in information
retrieval",
pages = "284-291",
year = 2006
}
@inproceedings{charikar-2002-simhash,
title = "Similarity estimation techniques from rounding algorithms",
author = "Charikar, Moses S",
booktitle = "Proceedings of the thiry-fourth annual ACM symposium on
Theory of computing",
pages = "380-388",
year = 2002
}
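
For reference, a minimal Python sketch of the simhash fingerprint this paper is usually cited for: per-token hash bits vote on each fingerprint bit, and near-duplicate token sets land at small Hamming distance. Token hashing via md5 is an illustrative choice, not from the paper.

import hashlib

def simhash(tokens, bits=64):
    # Each token votes on every fingerprint bit: +1 where its hash has
    # a 1, -1 where it has a 0; the sign of the tally gives the bit.
    v = [0] * bits
    for tok in tokens:
        h = int(hashlib.md5(tok.encode("utf-8")).hexdigest(), 16)
        for i in range(bits):
            v[i] += 1 if (h >> i) & 1 else -1
    return sum(1 << i for i in range(bits) if v[i] > 0)

def hamming(a, b):
    return bin(a ^ b).count("1")

# Similar token multisets produce fingerprints at small Hamming distance.
print(hamming(simhash("the cat sat on the mat".split()),
              simhash("the cat sat on a mat".split())))
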
@article{reimers-2019-sentence-bert,
author = "Reimers, Nils and Gurevych, Iryna",
title = "Sentence-Bert: Sentence Embeddings Using Siamese
Bert-Networks",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1908.10084v1",
abstract = "BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has
set a new state-of-the-art performance on sentence-pair
regression tasks like semantic textual similarity
(STS). However, it requires that both sentences are fed into
the network, which causes a massive computational overhead:
Finding the most similar pair in a collection of 10,000
sentences requires about 50 million inference computations
(~65 hours) with BERT. The construction of BERT makes it
unsuitable for semantic similarity search as well as for
unsupervised tasks like clustering. In this publication, we
present Sentence-BERT (SBERT), a modification of the
pretrained BERT network that uses siamese and triplet network
structures to derive semantically meaningful sentence
embeddings that can be compared using cosine-similarity. This
reduces the effort for finding the most similar pair from 65
hours with BERT / RoBERTa to about 5 seconds with SBERT,
while maintaining the accuracy from BERT. We evaluate SBERT
and SRoBERTa on common STS tasks and transfer learning tasks,
where it outperforms other state-of-the-art sentence
embeddings methods.",
archivePrefix= "arXiv",
eprint = "1908.10084",
primaryClass = "cs.CL"
}
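
A minimal usage sketch of the siamese-encoder idea: embed each sentence once, then compare with cosine similarity. It assumes the external sentence-transformers package and the public all-MiniLM-L6-v2 checkpoint, neither of which is part of this entry.

from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")
emb = model.encode(["How do I reset my password?",
                    "Instructions for resetting a password"])
# Embeddings are computed independently, so a 10k-sentence collection
# needs 10k encoder passes rather than ~50M pairwise passes.
cos = emb[0].dot(emb[1]) / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]))
print(round(float(cos), 3))
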
@article{guo-2017-drmm,
author = "Guo, Jiafeng and Fan, Yixing and Ai, Qingyao and Croft,
W. Bruce",
title = "A Deep Relevance Matching Model for Ad-Hoc Retrieval",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1711.08611v1",
abstract = "In recent years, deep neural networks have led to exciting
breakthroughs in speech recognition, computer vision, and
natural language processing (NLP) tasks. However, there have
been few positive results of deep models on ad-hoc retrieval
tasks. This is partially due to the fact that many important
characteristics of the ad-hoc retrieval task have not been
well addressed in deep models yet. Typically, the ad-hoc
retrieval task is formalized as a matching problem between
two pieces of text in existing work using deep models, and
treated equivalent to many NLP tasks such as paraphrase
identification, question answering and automatic
conversation. However, we argue that the ad-hoc retrieval
task is mainly about relevance matching while most NLP
matching tasks concern semantic matching, and there are some
fundamental differences between these two matching
tasks. Successful relevance matching requires proper handling
of the exact matching signals, query term importance, and
diverse matching requirements. In this paper, we propose a
novel deep relevance matching model (DRMM) for ad-hoc
retrieval. Specifically, our model employs a joint deep
architecture at the query term level for relevance
matching. By using matching histogram mapping, a feed forward
matching network, and a term gating network, we can
effectively deal with the three relevance matching factors
mentioned above. Experimental results on two representative
benchmark collections show that our model can significantly
outperform some well-known retrieval models as well as
state-of-the-art deep matching models.",
archivePrefix= "arXiv",
eprint = "1711.08611",
primaryClass = "cs.IR"
}
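
A rough NumPy sketch of the matching histogram mapping described in the abstract, assuming a log-count (LCH-style) histogram over cosine similarities; the bin count and helper names are illustrative.

import numpy as np

def matching_histograms(q_vecs, d_vecs, bins=5):
    # One histogram per query term: cosine similarities to every
    # document term, binned over [-1, 1]; log counts give the LCH
    # variant of the matching histogram mapping.
    q = q_vecs / np.linalg.norm(q_vecs, axis=1, keepdims=True)
    d = d_vecs / np.linalg.norm(d_vecs, axis=1, keepdims=True)
    sims = q.dot(d.T)                        # (|q| x |d|) similarities
    edges = np.linspace(-1.0, 1.0, bins + 1)
    hist = np.stack([np.histogram(row, bins=edges)[0] for row in sims])
    return np.log1p(hist)

q, d = np.random.randn(3, 50), np.random.randn(40, 50)
print(matching_histograms(q, d).shape)       # (3, 5): per-term features
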
@inproceedings{hui-2017-pacrr,
title = "{PACRR}: A Position-Aware Neural {IR} Model for Relevance
Matching",
author = "Hui, Kai and Yates, Andrew and Berberich, Klaus and de Melo,
Gerard",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1110",
doi = "10.18653/v1/D17-1110",
pages = "1049-1058",
abstract = "In order to adopt deep learning for information retrieval,
models are needed that can capture all relevant information
required to assess the relevance of a document to a given
user query. While previous works have successfully captured
unigram term matches, how to fully employ position-dependent
information such as proximity and term dependencies has been
insufficiently explored. In this work, we propose a novel
neural IR model named PACRR aiming at better modeling
position-dependent interactions between a query and a
document. Extensive experiments on six years{'} TREC Web
Track data confirm that the proposed model yields better
results under multiple benchmarks."
}
@article{malkov-2016-hnsw,
author = "Malkov, Yu. A. and Yashunin, D. A.",
title = "Efficient and Robust Approximate Nearest Neighbor Search
Using Hierarchical Navigable Small World Graphs",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1603.09320v4",
abstract = "We present a new approach for the approximate K-nearest
neighbor search based on navigable small world graphs with
controllable hierarchy (Hierarchical NSW, HNSW). The proposed
solution is fully graph-based, without any need for
additional search structures, which are typically used at the
coarse search stage of the most proximity graph
techniques. Hierarchical NSW incrementally builds a
multi-layer structure consisting of a hierarchical set of
proximity graphs (layers) for nested subsets of the stored
elements. The maximum layer in which an element is present is
selected randomly with an exponentially decaying probability
distribution. This allows producing graphs similar to the
previously studied Navigable Small World (NSW) structures
while additionally having the links separated by their
characteristic distance scales. Starting search from the
upper layer together with utilizing the scale separation
boosts the performance compared to NSW and allows a
logarithmic complexity scaling. Additional employment of a
heuristic for selecting proximity graph neighbors
significantly increases performance at high recall and in
case of highly clustered data. Performance evaluation has
demonstrated that the proposed general metric space search
index is able to strongly outperform previous opensource
state-of-the-art vector-only approaches. Similarity of the
algorithm to the skip list structure allows straightforward
balanced distributed implementation.",
archivePrefix= "arXiv",
eprint = "1603.09320",
primaryClass = "cs.DS"
}
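
The exponentially decaying layer assignment mentioned in the abstract reduces to one line; a small sketch, where m_l is the paper's level-normalization constant.

import math, random
from collections import Counter

def random_level(m_l=1.0):
    # Layer assignment: level = floor(-ln(U) * mL) with U ~ Uniform(0, 1],
    # so layer membership decays exponentially with height.
    return int(-math.log(1.0 - random.random()) * m_l)

# Most elements live only in layer 0; each higher layer holds
# exponentially fewer elements.
print(Counter(random_level() for _ in range(100000)))
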
@article{liu-2009-learning-to-rank,
title = "Learning to rank for information retrieval",
author = "Liu, Tie-Yan",
journal = "Foundations and trends in information retrieval",
volume = 3,
number = 3,
pages = "225-331",
year = 2009,
publisher = "Now Publishers Inc."
}
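
As one concrete instance from the pairwise family this survey covers, a hinge-style objective over (relevant, non-relevant) score pairs; the function name and margin are illustrative.

import numpy as np

def pairwise_hinge(score_rel, score_nonrel, margin=1.0):
    # A relevant document should outscore a non-relevant one for the
    # same query by at least the margin; violations are penalized.
    return float(np.maximum(0.0, margin - (score_rel - score_nonrel)).mean())

print(pairwise_hinge(np.array([2.0, 0.3]), np.array([1.5, 0.8])))
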
@article{marrero-2013-survey-ner,
author = "Marrero, M\'{o}nica and Urbano, Juli\'{a}n and
S\'{a}nchez-Cuadrado, Sonia and Morato, Jorge and
G\'{o}mez-Berb\'{\i}s, Juan Miguel",
journal = "Computer Standards \& Interfaces",
number = 5,
pages = "482-489",
title = "{Named Entity Recognition: Fallacies, Challenges and
Opportunities}",
volume = 35,
year = 2013
}
@inproceedings{dai-2018-complex-entity,
title = "Recognizing Complex Entity Mentions: A Review and Future
Directions",
author = "Dai, Xiang",
booktitle = "Proceedings of {ACL} 2018, Student Research Workshop",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-3006",
doi = "10.18653/v1/P18-3006",
pages = "37-44",
abstract = "Standard named entity recognizers can effectively recognize
entity mentions that consist of contiguous tokens and do not
overlap with each other. However, in practice, there are many
domains, such as the biomedical domain, in which there are
nested, overlapping, and discontinuous entity mentions. These
complex mentions cannot be directly recognized by
conventional sequence tagging models because they may break
the assumptions based on which sequence tagging techniques
are built. We review the existing methods which are revised
to tackle complex entity mentions and categorize them as
tokenlevel and sentence-level approaches. We then identify
the research gap, and discuss some directions that we are
exploring."
}
@article{goyal-2018-surney-ner,
title = "Recent Named Entity Recognition and Classification
techniques: A systematic review",
journal = "Computer Science Review",
volume = 29,
pages = "21-43",
year = 2018,
issn = "1574-0137",
doi = "https://doi.org/10.1016/j.cosrev.2018.06.001",
url =
"http://www.sciencedirect.com/science/article/pii/S1574013717302782",
author = "Archana Goyal and Vishal Gupta and Manish Kumar",
abstract = "Textual information is becoming available in abundance on the
web, arising the requirement of techniques and tools to
extract the meaningful information. One of such an important
information extraction task is Named Entity Recognition and
Classification. It is the problem of finding the members of
various predetermined classes, such as person, organization,
location, date/time, quantities, numbers etc. The concept of
named entity extraction was first proposed in Sixth Message
Understanding Conference in 1996. Since then, a number of
techniques have been developed by many researchers for
extracting diversity of entities from different languages and
genres of text. Still, there is a growing interest among
research community to develop more new approaches to extract
diverse named entities which are helpful in various natural
language applications. Here we present a survey of
developments and progresses made in Named Entity Recognition
and Classification research."
}
@article{wang-2018-sv-guided-softmax,
author = "Wang, Xiaobo and Wang, Shuo and Zhang, Shifeng and Fu, Tianyu
and Shi, Hailin and Mei, Tao",
title = "Support Vector Guided Softmax Loss for Face Recognition",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1812.11317v1",
abstract = "Face recognition has witnessed significant progresses due to
the advances of deep convolutional neural networks (CNNs),
the central challenge of which, is feature discrimination. To
address it, one group tries to exploit mining-based
strategies (\textit{e.g.}, hard example mining and focal
loss) to focus on the informative examples. The other group
devotes to designing margin-based loss functions
(\textit{e.g.}, angular, additive and additive angular
margins) to increase the feature margin from the perspective
of ground truth class. Both of them have been well-verified
to learn discriminative features. However, they suffer from
either the ambiguity of hard examples or the lack of
discriminative power of other classes. In this paper, we
design a novel loss function, namely support vector guided
softmax loss (SV-Softmax), which adaptively emphasizes the
mis-classified points (support vectors) to guide the
discriminative features learning. So the developed SV-Softmax
loss is able to eliminate the ambiguity of hard examples as
well as absorb the discriminative power of other classes, and
thus results in more discrimiantive features. To the best of
our knowledge, this is the first attempt to inherit the
advantages of mining-based and margin-based losses into one
framework. Experimental results on several benchmarks have
demonstrated the effectiveness of our approach over
state-of-the-arts.",
archivePrefix= "arXiv",
eprint = "1812.11317",
primaryClass = "cs.CV"
}
@inproceedings{pan-2015-unsupervised-el,
title = "Unsupervised Entity Linking with {A}bstract {M}eaning
{R}epresentation",
author = "Pan, Xiaoman and Cassidy, Taylor and Hermjakob, Ulf and Ji,
Heng and Knight, Kevin",
booktitle = "Proceedings of the 2015 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies",
month = may # "{--}" # jun,
year = 2015,
address = "Denver, Colorado",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N15-1119",
doi = "10.3115/v1/N15-1119",
pages = "1130-1139"
}
@inproceedings{banarescu-2013-amr,
title = "{A}bstract {M}eaning {R}epresentation for Sembanking",
author = "Banarescu, Laura and Bonial, Claire and Cai, Shu and
Georgescu, Madalina and Griffitt, Kira and Hermjakob, Ulf and
Knight, Kevin and Koehn, Philipp and Palmer, Martha and
Schneider, Nathan",
booktitle = "Proceedings of the 7th Linguistic Annotation Workshop and
Interoperability with Discourse",
month = aug,
year = 2013,
address = "Sofia, Bulgaria",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W13-2322",
pages = "178-186"
}
@article{wang-2015-faq-based,
author = "Wang, Zhiguo and Ittycheriah, Abraham",
title = "Faq-Based Question Answering Via Word Alignment",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1507.02628v1",
abstract = "In this paper, we propose a novel word-alignment-based method
to solve the FAQ-based question answering task. First, we
employ a neural network model to calculate question
similarity, where the word alignment between two questions is
used for extracting features. Second, we design a
bootstrap-based feature extraction method to extract a small
set of effective lexical features. Third, we propose a
learning-to-rank algorithm to train parameters more suitable
for the ranking tasks. Experimental results, conducted on
three languages (English, Spanish and Japanese), demonstrate
that the question similarity model is more effective than
baseline systems, the sparse features bring 5 \% improvements
on top-1 accuracy, and the learning-to-rank algorithm works
significantly better than the traditional method. We further
evaluate our method on the answer sentence selection
task. Our method outperforms all the previous systems on the
standard TREC data set.",
archivePrefix= "arXiv",
eprint = "1507.02628",
primaryClass = "cs.CL"
}
@inproceedings{song-2007-question-similarity,
title = "Question similarity calculation for FAQ answering",
author = "Song, Wanpeng and Feng, Min and Gu, Naijie and Wenyin, Liu",
booktitle = "Third International Conference on Semantics, Knowledge and
Grid (SKG 2007)",
pages = "298-301",
year = 2007,
organization = "IEEE"
}
@inproceedings{bhardwaj-2016-faq,
title = "Question answering system for frequently asked questions",
author = "Bhardwaj, Divyanshu and Pakray, Partha and Bentham, Jereemi
and Saha, Saurav and Mizoram, NIT and Gelbukh, Alexander",
booktitle = "of the Final Workshop 7 December 2016, Naples",
pages = 129,
year = 2016
}
@article{minaee-2017-similarity-qa,
author = "Minaee, Shervin and Liu, Zhu",
title = "Automatic Question-Answering Using a Deep Similarity Neural
Network",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1708.01713v1",
abstract = "Automatic question-answering is a classical problem in
natural language processing, which aims at designing systems
that can automatically answer a question, in the same way as
a human does. In this work, we propose a deep learning based
model for automatic question-answering. First the questions
and answers are embedded using neural probabilistic
modeling. Then a deep similarity neural network is trained to
find the similarity score of a pair of answer and
question. Then for each question, the best answer is found as
the one with the highest similarity score. We first train
this model on a large-scale public question-answering
database, and then fine-tune it to transfer to the
customer-care chat data. We have also tested our framework on
a public question-answering database and achieved very good
performance.",
archivePrefix= "arXiv",
eprint = "1708.01713",
primaryClass = "cs.CL"
}
@article{sharma-2018-qa-system,
title = "Deep Learning Approaches for Question Answering System",
journal = "Procedia Computer Science",
volume = 132,
pages = "785-794",
year = 2018,
note = "International Conference on Computational Intelligence and
Data Science",
issn = "1877-0509",
doi = "https://doi.org/10.1016/j.procs.2018.05.090",
url =
"http://www.sciencedirect.com/science/article/pii/S1877050918308226",
author = "Yashvardhan Sharma and Sahil Gupta",
keywords = "coattention, deep learning, memory nets, neural networks,
question answering, word vectors",
abstract = "Question Answering (QA) System is very useful as most of the
deep learning related problems can be modeled as a question
answering problem. Consequently, the field is one of the most
researched fields in computer science today. The last few
years have seen considerable developments and improvement in
the state of the art, much of which can be credited to
the rise of Deep Learning. In this paper, a discussion about
various approaches starting from the basic NLP and algorithms
based approach has been done and the paper eventually builds
towards the recently proposed methods of Deep
Learning. Implementation details and various tweaks in the
algorithms that produced better results have also been
discussed. The evaluation of the proposed models was done on
twenty tasks of Facebook's bAbI dataset."
}
@inproceedings{lai-2018-answer-selection,
title = "A Review on Deep Learning Techniques Applied to Answer
Selection",
author = "Lai, Tuan Manh and Bui, Trung and Li, Sheng",
booktitle = "Proceedings of the 27th International Conference on
Computational Linguistics",
month = aug,
year = 2018,
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1181",
pages = "2132-2144",
abstract = "Given a question and a set of candidate answers, answer
selection is the task of identifying which of the candidates
answers the question correctly. It is an important problem in
natural language processing, with applications in many
areas. Recently, many deep learning based methods have been
proposed for the task. They produce impressive performance
without relying on any feature engineering or expensive
external resources. In this paper, we aim to provide a
comprehensive review on deep learning methods applied to
answer selection."
}
@article{feng-2015-answer-selection,
author = "Feng, Minwei and Xiang, Bing and Glass, Michael R. and Wang,
Lidan and Zhou, Bowen",
title = "Applying Deep Learning To Answer Selection: a Study and an
Open Task",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1508.01585v2",
abstract = "We apply a general deep learning framework to address the
non-factoid question answering task. Our approach does not
rely on any linguistic tools and can be applied to different
languages or domains. Various architectures are presented and
compared. We create and release a QA corpus and setup a new
QA task in the insurance domain. Experimental results
demonstrate superior performance compared to the baseline
methods and various technologies give further
improvements. For this highly challenging task, the top-1
accuracy can reach up to 65.3 \% on a test set, which
indicates a great potential for practical use.",
archivePrefix= "arXiv",
eprint = "1508.01585",
primaryClass = "cs.CL"
}
@article{tan-2015-lstm-answer-selection,
author = "Tan, Ming and Santos, Cicero dos and Xiang, Bing and Zhou,
Bowen",
title = "Lstm-Based Deep Learning Models for Non-Factoid Answer
Selection",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1511.04108v4",
abstract = "In this paper, we apply a general deep learning (DL)
framework for the answer selection task, which does not
depend on manually defined features or linguistic tools. The
basic framework is to build the embeddings of questions and
answers based on bidirectional long short-term memory
(biLSTM) models, and measure their closeness by cosine
similarity. We further extend this basic model in two
directions. One direction is to define a more composite
representation for questions and answers by combining
convolutional neural network with the basic framework. The
other direction is to utilize a simple but efficient
attention mechanism in order to generate the answer
representation according to the question context. Several
variations of models are provided. The models are examined by
two datasets, including TREC-QA and InsuranceQA. Experimental
results demonstrate that the proposed models substantially
outperform several strong baselines.",
archivePrefix= "arXiv",
eprint = "1511.04108",
primaryClass = "cs.CL"
}
@inproceedings{wang-2016-inner-attention-answer-selection,
title = "Inner Attention based Recurrent Neural Networks for Answer
Selection",
author = "Wang, Bingning and Liu, Kang and Zhao, Jun",
booktitle = "Proceedings of the 54th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = 2016,
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P16-1122",
doi = "10.18653/v1/P16-1122",
pages = "1288-1297"
}
@article{wang-2016-compare-aggregate,
author = "Wang, Shuohang and Jiang, Jing",
title = "A Compare-Aggregate Model for Matching Text Sequences",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1611.01747v1",
abstract = "Many NLP tasks including machine comprehension, answer
selection and text entailment require the comparison between
sequences. Matching the important units between sequences is
a key to solve these problems. In this paper, we present a
general ``compare-aggregate'' framework that performs
word-level matching followed by aggregation using
Convolutional Neural Networks. We particularly focus on the
different comparison functions we can use to match two
vectors. We use four different datasets to evaluate the
model. We find that some simple comparison functions based on
element-wise operations can work better than standard neural
network and neural tensor network.",
archivePrefix= "arXiv",
eprint = "1611.01747",
primaryClass = "cs.CL"
}
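
A sketch of the element-wise comparison functions this abstract highlights (the paper's SUB and MULT variants), producing per-word comparison vectors that a CNN would then aggregate; shapes are illustrative.

import numpy as np

def compare_sub(a, b):
    # SUB comparison: element-wise squared difference of aligned vectors.
    return (a - b) ** 2

def compare_mult(a, b):
    # MULT comparison: element-wise product of aligned vectors.
    return a * b

a = np.random.rand(6, 100)     # sentence-1 word vectors
b = np.random.rand(6, 100)     # attention-aligned sentence-2 vectors
t = np.concatenate([compare_sub(a, b), compare_mult(a, b)], axis=-1)
print(t.shape)                  # per-word comparison features for the CNN
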
@article{parikh-2016-decomposable-attention,
adsnote = "Provided by the SAO/NASA Astrophysics Data System",
adsurl = "http://adsabs.harvard.edu/abs/2016arXiv160601933P",
archivePrefix= "arXiv",
author = "{Parikh}, A.~P. and {T{\"a}ckstr{\"o}m}, O. and {Das}, D. and
{Uszkoreit}, J.",
eprint = "1606.01933",
journal = "ArXiv e-prints",
keywords = "Computer Science - Computation and Language",
month = jun,
primaryClass = "cs.CL",
title = "{A Decomposable Attention Model for Natural Language
Inference}",
year = 2016
}
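
A NumPy sketch of the attend step in this model: score all token pairs, then softly align each sentence to the other with row/column softmaxes. The learned feed-forward transforms from the paper are omitted here for brevity.

import numpy as np

def softmax(x, axis):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def soft_align(a, b):
    # Unnormalized alignment scores between all token pairs, then each
    # side is softly aligned to (summarized by) the other.
    e = a.dot(b.T)                         # (len_a x len_b) scores
    beta = softmax(e, axis=1).dot(b)      # b summarized for each a_i
    alpha = softmax(e, axis=0).T.dot(a)   # a summarized for each b_j
    return beta, alpha

a, b = np.random.rand(5, 8), np.random.rand(7, 8)
beta, alpha = soft_align(a, b)
print(beta.shape, alpha.shape)             # (5, 8) (7, 8)
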
@article{wang-2017-bimpm,
author = "Zhiguo Wang and Wael Hamza and Radu Florian",
title = "Bilateral Multi-Perspective Matching for Natural Language
Sentences",
journal = "CoRR",
volume = "abs/1702.03814",
year = 2017,
url = "http://arxiv.org/abs/1702.03814",
archivePrefix= "arXiv",
eprint = "1702.03814",
timestamp = "Mon, 13 Aug 2018 16:47:19 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/WangHF17",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
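
A sketch of the multi-perspective cosine at the heart of BiMPM's matching operations: each row of a weight matrix re-weights both vectors before a cosine is taken. The number of perspectives is illustrative.

import numpy as np

def multi_perspective_cosine(v1, v2, W):
    # One cosine per perspective: row k of W re-weights the dimensions
    # of both vectors before their cosine similarity is computed.
    p1, p2 = W * v1, W * v2
    num = (p1 * p2).sum(axis=1)
    den = np.linalg.norm(p1, axis=1) * np.linalg.norm(p2, axis=1)
    return num / den

v1, v2 = np.random.rand(100), np.random.rand(100)
W = np.random.rand(8, 100)          # 8 perspectives
print(multi_perspective_cosine(v1, v2, W).shape)   # (8,)
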
@inproceedings{wang-2016-lexical-decomposition-composition,
title = "Sentence Similarity Learning by Lexical Decomposition and
Composition",
author = "Wang, Zhiguo and Mi, Haitao and Ittycheriah, Abraham",
booktitle = "Proceedings of {COLING} 2016, the 26th International
Conference on Computational Linguistics: Technical Papers",
month = dec,
year = 2016,
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://www.aclweb.org/anthology/C16-1127",
pages = "1340-1349",
abstract = "Most conventional sentence similarity methods only focus on
similar parts of two input sentences, and simply ignore the
dissimilar parts, which usually give us some clues and
semantic meanings about the sentences. In this work, we
propose a model to take into account both the similarities
and dissimilarities by decomposing and composing lexical
semantics over sentences. The model represents each word as a
vector, and calculates a semantic matching vector for each
word based on all words in the other sentence. Then, each
word vector is decomposed into a similar component and a
dissimilar component based on the semantic matching
vector. After this, a two-channel CNN model is employed to
capture features by composing the similar and dissimilar
components. Finally, a similarity score is estimated over the
composed feature vectors. Experimental results show that our
model gets the state-of-the-art performance on the answer
sentence selection task, and achieves a comparable result on
the paraphrase identification task."
}
@article{chen-2016-esim,
author = "{Chen}, Qian and {Zhu}, Xiaodan and {Ling}, Zhenhua and
{Wei}, Si and {Jiang}, Hui and {Inkpen}, Diana",
title = "{Enhanced LSTM for Natural Language Inference}",
journal = "arXiv e-prints",
keywords = "Computer Science - Computation and Language",
year = 2016,
month = sep,
eid = "arXiv:1609.06038",
pages = "arXiv:1609.06038",
archivePrefix= "arXiv",
eprint = "1609.06038",
primaryClass = "cs.CL",
adsurl = "https://ui.adsabs.harvard.edu/abs/2016arXiv160906038C",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
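
A one-function sketch of ESIM's local inference enhancement: the encoded vectors are concatenated with their soft-aligned counterparts plus their element-wise difference and product; dimensions are illustrative.

import numpy as np

def enhance(a, a_aligned):
    # Local inference enhancement: [a; a~; a - a~; a * a~], fed to the
    # composition layer.
    return np.concatenate([a, a_aligned, a - a_aligned, a * a_aligned],
                          axis=-1)

a = np.random.rand(6, 300)            # encoded premise tokens
a_tilde = np.random.rand(6, 300)      # hypothesis soft-aligned to premise
print(enhance(a, a_tilde).shape)       # (6, 1200)
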
@inproceedings{shen-2017-inter-weighted-alignment,
title = "Inter-Weighted Alignment Network for Sentence Pair Modeling",
author = "Shen, Gehui and Yang, Yunlun and Deng, Zhi-Hong",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1122",
doi = "10.18653/v1/D17-1122",
pages = "1179-1189",
abstract = "Sentence pair modeling is a crucial problem in the field of
natural language processing. In this paper, we propose a
model to measure the similarity of a sentence pair focusing
on the interaction information. We utilize the word level
similarity matrix to discover fine-grained alignment of two
sentences. It should be emphasized that each word in a
sentence has a different importance from the perspective of
semantic composition, so we exploit two novel and efficient
strategies to explicitly calculate a weight for each
word. Although the proposed model only uses a sequential LSTM
for sentence modeling without any external resource such as
syntactic parser tree and additional lexicon features,
experimental results show that our model achieves
state-of-the-art performance on three datasets of two tasks."
}
@article{tay-2017-compare-compress-propagate,
author = "Tay, Yi and Tuan, Luu Anh and Hui, Siu Cheung",
title = "Compare, Compress and Propagate: Enhancing Neural
Architectures With Alignment Factorization for Natural
Language Inference",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1801.00102v2",
abstract = "This paper presents a new deep learning architecture for
Natural Language Inference (NLI). Firstly, we introduce a new
architecture where alignment pairs are compared, compressed
and then propagated to upper layers for enhanced
representation learning. Secondly, we adopt factorization
layers for efficient and expressive compression of alignment
vectors into scalar features, which are then used to augment
the base word representations. The design of our approach is
aimed to be conceptually simple, compact and yet powerful. We
conduct experiments on three popular benchmarks, SNLI,
MultiNLI and SciTail, achieving competitive performance on
all. A lightweight parameterization of our model also enjoys
a $\approx 3$ times reduction in parameter size compared to
the existing state-of-the-art models, e.g., ESIM and DIIN,
while maintaining competitive performance. Additionally,
visual analysis shows that our propagated features are highly
interpretable.",
archivePrefix= "arXiv",
eprint = "1801.00102",
primaryClass = "cs.CL"
}
@article{gong-2017-diin,
author = "Yichen Gong and Heng Luo and Jian Zhang",
title = "Natural Language Inference over Interaction Space",
journal = "CoRR",
volume = "abs/1709.04348",
year = 2017,
url = "http://arxiv.org/abs/1709.04348",
archivePrefix= "arXiv",
eprint = "1709.04348",
timestamp = "Mon, 13 Aug 2018 16:47:34 +0200",
biburl = "https://dblp.org/rec/bib/journals/corr/abs-1709-04348",
bibsource = "dblp computer science bibliography, https://dblp.org"
}
@article{tay-2018-multi-cast-attention,
author = "Tay, Yi and Tuan, Luu Anh and Hui, Siu Cheung",
title = "Multi-Cast Attention Networks for Retrieval-Based Question
Answering and Response Prediction",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1806.00778v1",
abstract = "Attention is typically used to select informative sub-phrases
that are used for prediction. This paper investigates the
novel use of attention as a form of feature augmentation,
i.e, casted attention. We propose Multi-Cast Attention
Networks (MCAN), a new attention mechanism and general model
architecture for a potpourri of ranking tasks in the
conversational modeling and question answering domains. Our
approach performs a series of soft attention operations, each
time casting a scalar feature upon the inner word
embeddings. The key idea is to provide a real-valued hint
(feature) to a subsequent encoder layer and is targeted at
improving the representation learning process. There are
several advantages to this design, e.g., it allows an
arbitrary number of attention mechanisms to be casted,
allowing for multiple attention types (e.g., co-attention,
intra-attention) and attention variants (e.g.,
alignment-pooling, max-pooling, mean-pooling) to be executed
simultaneously. This not only eliminates the costly need to
tune the nature of the co-attention layer, but also provides
greater extents of explainability to practitioners. Via
extensive experiments on four well-known benchmark datasets,
we show that MCAN achieves state-of-the-art performance. On
the Ubuntu Dialogue Corpus, MCAN outperforms existing
state-of-the-art models by $9\%$. MCAN also achieves the best
performing score to date on the well-studied TrecQA dataset.",
archivePrefix= "arXiv",
eprint = "1806.00778",
primaryClass = "cs.CL"
}
@inproceedings{tay-2018-csran,
title = "Co-Stack Residual Affinity Networks with Multi-level
Attention Refinement for Matching Text Sequences",
author = "Tay, Yi and Luu, Anh Tuan and Hui, Siu Cheung",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1479",
doi = "10.18653/v1/D18-1479",
pages = "4492-4502",
abstract = "Learning a matching function between two text sequences is a
long standing problem in NLP research. This task enables many
potential applications such as question answering and
paraphrase identification. This paper proposes Co-Stack
Residual Affinity Networks (CSRAN), a new and universal
neural architecture for this problem. CSRAN is a deep
architecture, involving stacked (multi-layered) recurrent
encoders. Stacked/Deep architectures are traditionally
difficult to train, due to the inherent weaknesses such as
difficulty with feature propagation and vanishing
gradients. CSRAN incorporates two novel components to take
advantage of the stacked architecture. Firstly, it introduces
a new bidirectional alignment mechanism that learns affinity
weights by fusing sequence pairs across stacked
hierarchies. Secondly, it leverages a multi-level attention
refinement component between stacked recurrent layers. The
key intuition is that, by leveraging information across all
network hierarchies, we can not only improve gradient flow
but also improve overall performance. We conduct extensive
experiments on six well-studied text sequence matching
datasets, achieving state-of-the-art performance on all."
}
@inproceedings{tan-2018-multiway-attention-mwan,
title = "Multiway Attention Networks for Modeling Sentence Pairs",
author = "Chuanqi Tan and Furu Wei and Wenhui Wang and Weifeng Lv and
Ming Zhou",
booktitle = "Proceedings of the Twenty-Seventh International Joint
Conference on Artificial Intelligence, {IJCAI-18}",
publisher = "International Joint Conferences on Artificial Intelligence
Organization",
pages = "4411-4417",
year = 2018,
month = jul,
doi = "10.24963/ijcai.2018/613",
url = "https://doi.org/10.24963/ijcai.2018/613"
}
@article{kim-2018-semantic-sentence-matching,
author = "Kim, Seonhoon and Kang, Inho and Kwak, Nojun",
title = "Semantic Sentence Matching With Densely-Connected Recurrent
and Co-Attentive Information",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1805.11360v2",
abstract = "Sentence matching is widely used in various natural language
tasks such as natural language inference, paraphrase
identification, and question answering. For these tasks,
understanding logical and semantic relationship between two
sentences is required but it is yet challenging. Although
attention mechanism is useful to capture the semantic
relationship and to properly align the elements of two
sentences, previous methods of attention mechanism simply use
a summation operation which does not retain original features
enough. Inspired by DenseNet, a densely connected
convolutional network, we propose a densely-connected
co-attentive recurrent neural network, each layer of which
uses concatenated information of attentive features as well
as hidden features of all the preceding recurrent layers. It
enables preserving the original and the co-attentive feature
information from the bottommost word embedding layer to the
uppermost recurrent layer. To alleviate the problem of an
ever-increasing size of feature vectors due to dense
concatenation operations, we also propose to use an
autoencoder after dense concatenation. We evaluate our
proposed architecture on highly competitive benchmark
datasets related to sentence matching. Experimental results
show that our architecture, which retains recurrent and
attentive features, achieves state-of-the-art performances
for most of the tasks.",
archivePrefix= "arXiv",
eprint = "1805.11360",
primaryClass = "cs.CL"
}
@inproceedings{pan-2018-discourse-marker,
title = "Discourse Marker Augmented Network with Reinforcement
Learning for Natural Language Inference",
author = "Pan, Boyuan and Yang, Yazheng and Zhao, Zhou and Zhuang,
Yueting and Cai, Deng and He, Xiaofei",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1091",
doi = "10.18653/v1/P18-1091",
pages = "989-999",
abstract = "Natural Language Inference (NLI), also known as Recognizing
Textual Entailment (RTE), is one of the most important
problems in natural language processing. It requires to infer
the logical relationship between two given sentences. While
current approaches mostly focus on the interaction
architectures of the sentences, in this paper, we propose to
transfer knowledge from some important discourse markers to
augment the quality of the NLI model. We observe that people
usually use some discourse markers such as {``}so{''} or
{``}but{''} to represent the logical relationship between two
sentences. These words potentially have deep connections with
the meanings of the sentences, thus can be utilized to help
improve the representations of them. Moreover, we use
reinforcement learning to optimize a new objective function
with a reward defined by the property of the NLI datasets to
make full use of the labels information. Experiments show
that our method achieves the state-of-the-art performance on
several large-scale datasets."
}
@article{zhang-2018-explicit-contextual-semantics,
author = "Zhang, Zhuosheng and Wu, Yuwei and Li, Zuchao and Zhao, Hai",
title = "Explicit Contextual Semantics for Text Comprehension",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1809.02794v3",
abstract = "Who did what to whom is a major focus in natural language
understanding, which is right the aim of semantic role
labeling (SRL) task. Despite of sharing a lot of processing
characteristics and even task purpose, it is surprisingly
that jointly considering these two related tasks was never
formally reported in previous work. Thus this paper makes the
first attempt to let SRL enhance text comprehension and
inference through specifying verbal predicates and their
corresponding semantic roles. In terms of deep learning
models, our embeddings are enhanced by explicit contextual
semantic role labels for more fine-grained semantics. We show
that the salient labels can be conveniently added to existing
models and significantly improve deep learning models in
challenging text comprehension tasks. Extensive experiments
on benchmark machine reading comprehension and inference
datasets verify that the proposed semantic learning helps our
system reach new state-of-the-art over strong baselines which
have been enhanced by well pretrained language models from
the latest progress.",
archivePrefix= "arXiv",
eprint = "1809.02794",
primaryClass = "cs.CL"
}
@article{leal-taixe-2016-siamese-cnn,
author = "Leal-Taix{\'e}, Laura and Ferrer, Cristian Canton and
Schindler, Konrad",
title = "Learning By Tracking: Siamese Cnn for Robust Target
Association",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1604.07866v3",
abstract = "This paper introduces a novel approach to the task of data
association within the context of pedestrian tracking, by
introducing a two-stage learning scheme to match pairs of
detections. First, a Siamese convolutional neural network
(CNN) is trained to learn descriptors encoding local
spatio-temporal structures between the two input image
patches, aggregating pixel values and optical flow
information. Second, a set of contextual features derived
from the position and size of the compared input patches are
combined with the CNN output by means of a gradient boosting
classifier to generate the final matching probability. This
learning approach is validated by using a linear programming
based multi-person tracker showing that even a simple and
efficient tracker may outperform much more complex models
when fed with our learned matching probabilities. Results on
publicly available sequences show that our method meets
state-of-the-art standards in multiple people tracking.",
archivePrefix= "arXiv",
eprint = "1604.07866",
primaryClass = "cs.LG"
}
@inproceedings{mueller-2016-siamese-lstm,
title = "Siamese recurrent architectures for learning sentence
similarity",
author = "Mueller, Jonas and Thyagarajan, Aditya",
booktitle = "thirtieth AAAI conference on artificial intelligence",
year = 2016
}
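
The similarity head of this siamese architecture is compact enough to show directly: the exponential of the negative Manhattan distance between the twin LSTMs' final states, which lies in (0, 1]. A sketch:

import numpy as np

def malstm_similarity(h_a, h_b):
    # Similarity over the twin LSTMs' final hidden states: exp(-L1).
    return float(np.exp(-np.abs(h_a - h_b).sum()))

print(malstm_similarity(np.array([0.2, 0.9]), np.array([0.1, 1.0])))
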
@article{conneau-2017-infer-sent,
author = "Conneau, Alexis and Kiela, Douwe and Schwenk, Holger and
Barrault, Loic and Bordes, Antoine",
title = "Supervised Learning of Universal Sentence Representations
From Natural Language Inference Data",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1705.02364v5",
abstract = "Many modern NLP systems rely on word embeddings, previously
trained in an unsupervised manner on large corpora, as base
features. Efforts to obtain embeddings for larger chunks of
text, such as sentences, have however not been so
successful. Several attempts at learning unsupervised
representations of sentences have not reached satisfactory
enough performance to be widely adopted. In this paper, we
show how universal sentence representations trained using the
supervised data of the Stanford Natural Language Inference
datasets can consistently outperform unsupervised methods
like SkipThought vectors on a wide range of transfer
tasks. Much like how computer vision uses ImageNet to obtain
features, which can then be transferred to other tasks, our
work tends to indicate the suitability of natural language
inference for transfer learning to other NLP tasks. Our
encoder is publicly available.",
archivePrefix= "arXiv",
eprint = "1705.02364",
primaryClass = "cs.CL"
}
@article{nie-2017-sse,
author = "Nie, Yixin and Bansal, Mohit",
title = "Shortcut-Stacked Sentence Encoders for Multi-Domain
Inference",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1708.02312v2",
abstract = "We present a simple sequential sentence encoder for
multi-domain natural language inference. Our encoder is based
on stacked bidirectional LSTM-RNNs with shortcut connections
and fine-tuning of word embeddings. The overall supervised
model uses the above encoder to encode two input sentences
into two vectors, and then uses a classifier over the vector
combination to label the relationship between these two
sentences as that of entailment, contradiction, or
neutral. Our Shortcut-Stacked sentence encoders achieve strong
improvements over existing encoders on matched and mismatched
multi-domain natural language inference (top non-ensemble
single-model result in the EMNLP RepEval 2017 Shared Task
(Nangia et al., 2017)). Moreover, they achieve the new
state-of-the-art encoding result on the original SNLI dataset
(Bowman et al., 2015).",
archivePrefix= "arXiv",
eprint = "1708.02312",
primaryClass = "cs.CL"
}
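
A sketch of the vector combination commonly fed to the relationship classifier in this encoder family (concatenation, absolute difference, element-wise product); the exact feature set shown here is an assumption for illustration.

import numpy as np

def pair_features(u, v):
    # Combination over the two sentence vectors before the 3-way
    # (entailment / contradiction / neutral) classifier.
    return np.concatenate([u, v, np.abs(u - v), u * v])

u, v = np.random.rand(1024), np.random.rand(1024)
print(pair_features(u, v).shape)       # (4096,)
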
@inproceedings{zhou-2016-multi-view,
title = "Multi-view Response Selection for Human-Computer
Conversation",
author = "Zhou, Xiangyang and Dong, Daxiang and Wu, Hua and Zhao, Shiqi
and Yu, Dianhai and Tian, Hao and Liu, Xuan and Yan, Rui",
booktitle = "Proceedings of the 2016 Conference on Empirical Methods in
Natural Language Processing",
month = nov,
year = 2016,
address = "Austin, Texas",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D16-1036",
doi = "10.18653/v1/D16-1036",
pages = "372-381"
}
@inproceedings{lan-2018-neural-network,
title = "Neural Network Models for Paraphrase Identification, Semantic
Textual Similarity, Natural Language Inference, and Question
Answering",
author = "Lan, Wuwei and Xu, Wei",
booktitle = "Proceedings of the 27th International Conference on
Computational Linguistics",
month = aug,
year = 2018,
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/C18-1328",
pages = "3890-3902",
abstract = "In this paper, we analyze several neural network designs (and
their variations) for sentence pair modeling and compare
their performance extensively across eight datasets,
including paraphrase identification, semantic textual
similarity, natural language inference, and question
answering tasks. Although most of these models have claimed
state-of-the-art performance, the original papers often
reported on only one or two selected datasets. We provide a
systematic study and show that (i) encoding contextual
information by LSTM and inter-sentence interactions are
critical, (ii) Tree-LSTM does not help as much as previously
claimed but surprisingly improves performance on Twitter
datasets, (iii) the Enhanced Sequential Inference Model is
the best so far for larger datasets, while the Pairwise Word
Interaction Model achieves the best performance when less
data is available. We release our implementations as an
open-source toolkit."
}
@inproceedings{zhou-2018-dam,
title = "Multi-Turn Response Selection for Chatbots with Deep
Attention Matching Network",
author = "Zhou, Xiangyang and Li, Lu and Dong, Daxiang and Liu, Yi and
Chen, Ying and Zhao, Wayne Xin and Yu, Dianhai and Wu, Hua",
booktitle = "Proceedings of the 56th Annual Meeting of the Association for
Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = 2018,
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P18-1103",
doi = "10.18653/v1/P18-1103",
pages = "1118-1127",
abstract = "Human generates responses relying on semantic and functional
dependencies, including coreference relation, among dialogue
elements and their context. In this paper, we investigate
matching a response with its multi-turn context using
dependency information based entirely on attention. Our
solution is inspired by the recently proposed Transformer in
machine translation (Vaswani et al., 2017) and we extend the
attention mechanism in two ways. First, we construct
representations of text segments at different granularities
solely with stacked self-attention. Second, we try to extract
the truly matched segment pairs with attention across the
context and response. We jointly introduce those two kinds of
attention in one uniform neural network. Experiments on two
large-scale multi-turn response selection tasks show that our
proposed model significantly outperforms the state-of-the-art
models."
}
@inproceedings{rao-2019-hcan,
title = "Bridging the Gap between Relevance Matching and Semantic
Matching for Short Text Similarity Modeling",
author = "Rao, Jinfeng and Liu, Linqing and Tay, Yi and Yang, Wei and
Shi, Peng and Lin, Jimmy",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1540",
doi = "10.18653/v1/D19-1540",
pages = "5370-5381",
abstract = "A core problem of information retrieval (IR) is relevance
matching, which is to rank documents by relevance to a
user{'}s query. On the other hand, many NLP problems, such as
question answering and paraphrase identification, can be
considered variants of semantic matching, which is to measure
the semantic distance between two pieces of short
texts. While at a high level both relevance and semantic
matching require modeling textual similarity, many existing
techniques for one cannot be easily adapted to the other. To
bridge this gap, we propose a novel model, HCAN (Hybrid
Co-Attention Network), that comprises (1) a hybrid encoder
module that includes ConvNet-based and LSTM-based encoders,
(2) a relevance matching module that measures soft term
matches with importance weighting at multiple granularities,
and (3) a semantic matching module with co-attention
mechanisms that capture context-aware semantic
relatedness. Evaluations on multiple IR and NLP benchmarks
demonstrate state-of-the-art effectiveness compared to
approaches that do not exploit pretraining on external
data. Extensive ablation studies suggest that relevance and
semantic matching signals are complementary across many
problem settings, regardless of the choice of underlying
encoders."
}
@article{cui-2018-cmrc-2018,
author = "Cui, Yiming and Liu, Ting and Che, Wanxiang and Xiao, Li and
Chen, Zhipeng and Ma, Wentao and Wang, Shijin and Hu,
Guoping",
title = "A Span-Extraction Dataset for Chinese Machine Reading
Comprehension",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1810.07366v2",
abstract = "Machine Reading Comprehension (MRC) has become enormously
popular recently and has attracted a lot of
attention. However, the existing reading comprehension
datasets are mostly in English. In this paper, we introduce a
Span-Extraction dataset for Chinese machine reading
comprehension to add language diversities in this area. The
dataset is composed of nearly 20,000 real questions annotated
on Wikipedia paragraphs by human experts. We also annotated a
challenge set which contains the questions that need
comprehensive understanding and multi-sentence inference
throughout the context. We present several baseline systems
as well as anonymous submissions for demonstrating the
difficulties in this dataset. With the release of the
dataset, we hosted the Second Evaluation Workshop on Chinese
Machine Reading Comprehension (CMRC 2018). We hope the
release of the dataset could further accelerate the Chinese
machine reading comprehension research. Resources are
available: https://github.com/ymcui/cmrc2018",
archivePrefix= "arXiv",
eprint = "1810.07366",
primaryClass = "cs.CL"
}
@article{cui-2020-cmrc-2019,
author = "Cui, Yiming and Liu, Ting and Yang, Ziqing and Chen, Zhipeng
and Ma, Wentao and Che, Wanxiang and Wang, Shijin and Hu,
Guoping",
title = "A Sentence Cloze Dataset for Chinese Machine Reading
Comprehension",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2004.03116v1",
abstract = "Owing to the continuous contributions by the Chinese NLP
community, more and more Chinese machine reading
comprehension datasets become available, and they have been
pushing Chinese MRC research forward. To add diversity in
this area, in this paper, we propose a new task called
Sentence Cloze-style Machine Reading Comprehension
(SC-MRC). The proposed task aims to fill the right candidate
sentence into the passage that has several blanks. Moreover,
to add more difficulties, we also made fake candidates that
are similar to the correct ones, which requires the machine
to judge their correctness in the context. The proposed
dataset contains over 100K blanks (questions) within over 10K
passages, which was originated from Chinese narrative
stories. To evaluate the dataset, we implement several
baseline systems based on pre-trained models, and the results
show that the state-of-the-art model still underperforms
human performance by a large margin. We hope the release of
the dataset could further accelerate the machine reading
comprehension research. Resources available:
https://github.com/ymcui/cmrc2019",
archivePrefix= "arXiv",
eprint = "2004.03116",
primaryClass = "cs.CL"
}
@article{munkhdalai-2016-neural-tree-indexers,
author = "Munkhdalai, Tsendsuren and Yu, Hong",
title = "Neural Tree Indexers for Text Understanding",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1607.04492v2",
abstract = "Recurrent neural networks (RNNs) process input text
sequentially and model the conditional transition between
word tokens. In contrast, the advantages of recursive
networks include that they explicitly model the
compositionality and the recursive structure of natural
language. However, the current recursive architecture is
limited by its dependence on syntactic tree. In this paper,
we introduce a robust syntactic parsing-independent tree
structured model, Neural Tree Indexers (NTI) that provides a
middle ground between the sequential RNNs and the syntactic
tree-based recursive models. NTI constructs a full n-ary tree
by processing the input text with its node function in a
bottom-up fashion. An attention mechanism can then be applied
to both structure and node function. We implemented and
evaluated a binary-tree model of NTI, showing the model
achieved the state-of-the-art performance on three different
NLP tasks: natural language inference, answer sentence
selection, and sentence classification, outperforming
state-of-the-art recurrent and recursive neural networks.",
archivePrefix= "arXiv",
eprint = "1607.04492",
primaryClass = "cs.CL"
}
@inproceedings{grandvalet-2004-entropy-minimization,
author = "Grandvalet, Yves and Bengio, Yoshua",
title = "Semi-Supervised Learning by Entropy Minimization",
year = 2004,
publisher = "MIT Press",
address = "Cambridge, MA, USA",
booktitle = "Proceedings of the 17th International Conference on Neural
Information Processing Systems",
pages = "529-536",
numpages = 8,
location = "Vancouver, British Columbia, Canada",
series = "NIPS’04"
}
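
The unlabeled-data term of this semi-supervised objective reduces to a few lines; a sketch assuming rows of predicted class probabilities, with the weighting coefficient left to the caller.

import numpy as np

def entropy_penalty(probs, eps=1e-12):
    # Mean Shannon entropy of predicted class distributions on the
    # unlabeled examples; adding lambda * this term to the supervised
    # loss encourages confident (low-entropy) predictions.
    return float(-(probs * np.log(probs + eps)).sum(axis=1).mean())

print(entropy_penalty(np.array([[0.9, 0.1],
                                [0.5, 0.5]])))   # confident vs. uncertain
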
@inproceedings{duan-2017-qg-for-qa,
title = "Question Generation for Question Answering",
author = "Duan, Nan and Tang, Duyu and Chen, Peng and Zhou, Ming",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in
Natural Language Processing",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1090",
doi = "10.18653/v1/D17-1090",
pages = "866-874",
abstract = "This paper presents how to generate questions from given
passages using neural networks, where large scale QA pairs
are automatically crawled and processed from Community-QA
website, and used as training data. The contribution of the
paper is 2-fold: First, two types of question generation
approaches are proposed, one is a retrieval-based method
using convolution neural network (CNN), the other is a
generation-based method using recurrent neural network (RNN);
Second, we show how to leverage the generated questions to
improve existing question answering systems. We evaluate our
question generation method for the answer sentence selection
task on three benchmark datasets, including SQuAD, MS MARCO,
and WikiQA. Experimental results show that, by using
generated questions as an extra signal, significant QA
improvement can be achieved."
}
@inproceedings{hadsell-2006-contrastive-loss,
title = "Dimensionality reduction by learning an invariant mapping",
author = "Hadsell, Raia and Chopra, Sumit and LeCun, Yann",
booktitle = "2006 IEEE Computer Society Conference on Computer Vision and
Pattern Recognition (CVPR'06)",
volume = 2,
pages = "1735-1742",
year = 2006,
organization = "IEEE"
}
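
The pairwise objective introduced in this paper is easy to state in code; a sketch using the y = 1 for similar convention and omitting the paper's constant factor of 1/2.

import numpy as np

def contrastive_loss(dist, y, margin=1.0):
    # Similar pairs (y = 1) are pulled together; dissimilar pairs
    # (y = 0) are pushed apart until they clear the margin.
    return float(np.mean(y * dist ** 2 +
                         (1 - y) * np.maximum(0.0, margin - dist) ** 2))

print(contrastive_loss(np.array([0.2, 1.4]), np.array([1, 0])))
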
@article{wieting-2019-no-training-required,
author = "Wieting, John and Kiela, Douwe",
title = "No Training Required: Exploring Random Encoders for Sentence
Classification",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1901.10444v1",
abstract = "We explore various methods for computing sentence
representations from pre-trained word embeddings without any
training, i.e., using nothing but random
parameterizations. Our aim is to put sentence embeddings on
more solid footing by 1) looking at how much modern sentence
embeddings gain over random methods---as it turns out,
surprisingly little; and by 2) providing the field with more
appropriate baselines going forward---which are, as it turns
out, quite strong. We also make important observations about
proper experimental protocol for sentence classification
evaluation, together with recommendations for future
research.",
archivePrefix= "arXiv",
eprint = "1901.10444",
primaryClass = "cs.CL"
}
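
In the spirit of the random baselines this abstract describes, a sketch that projects (stand-in) pre-trained word vectors through a fixed random matrix and max-pools, with no training anywhere; dimensions and the initialization range are illustrative.

import numpy as np

def random_projection_encoder(word_vecs, dim_out=1024, seed=0):
    # Bag-of-random-embedding-projections style baseline: a fixed
    # random matrix projects pre-trained word vectors, and max pooling
    # over tokens yields the sentence vector. Nothing is trained.
    rng = np.random.default_rng(seed)
    d_in = word_vecs.shape[1]
    scale = 1.0 / np.sqrt(d_in)
    W = rng.uniform(-scale, scale, size=(d_in, dim_out))
    return word_vecs.dot(W).max(axis=0)

sent = np.random.rand(7, 300)          # stand-in for pre-trained embeddings
print(random_projection_encoder(sent).shape)    # (1024,)
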
@inproceedings{mohtarami-2016-sls-semeval-task,
title = "{SLS} at {S}em{E}val-2016 Task 3: Neural-based Approaches for
Ranking in Community Question Answering",
author = "Mohtarami, Mitra and Belinkov, Yonatan and Hsu, Wei-Ning and
Zhang, Yu and Lei, Tao and Bar, Kfir and Cyphers, Scott and
Glass, Jim",
booktitle = "Proceedings of the 10th International Workshop on Semantic
Evaluation ({S}em{E}val-2016)",
month = jun,
year = 2016,
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/S16-1128",
doi = "10.18653/v1/S16-1128",
pages = "828-835"
}
@inproceedings{romeo-2016-neural-attention,
title = "Neural Attention for Learning to Rank Questions in Community
Question Answering",
author = "Romeo, Salvatore and Da San Martino, Giovanni and
Barr{\'o}n-Cede{\~n}o, Alberto and Moschitti, Alessandro and
Belinkov, Yonatan and Hsu, Wei-Ning and Zhang, Yu and
Mohtarami, Mitra and Glass, James",
booktitle = "Proceedings of {COLING} 2016, the 26th International
Conference on Computational Linguistics: Technical Papers",
month = dec,
year = 2016,
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://www.aclweb.org/anthology/C16-1163",
pages = "1734-1745",
abstract = "In real-world data, e.g., from Web forums, text is often
contaminated with redundant or irrelevant content, which
leads to introducing noise in machine learning algorithms. In
this paper, we apply Long Short-Term Memory networks with an
attention mechanism, which can select important parts of text
for the task of similar question retrieval from community
Question Answering (cQA) forums. In particular, we use the
attention weights for both selecting entire sentences and
their subparts, i.e., word/chunk, from shallow syntactic
trees. More interestingly, we apply tree kernels to the
filtered text representations, thus exploiting the implicit
features of the subtree space for learning question
reranking. Our results show that the attention-based pruning
allows for achieving the top position in the cQA challenge of
SemEval 2016, with a relatively large gap from the other
participants while greatly decreasing running time."
}
@inproceedings{nassif-2016-learning-semantic-relatedness,
title = "Learning Semantic Relatedness in Community Question Answering
Using Neural Models",
author = "Nassif, Henry and Mohtarami, Mitra and Glass, James",
booktitle = "Proceedings of the 1st Workshop on Representation Learning
for {NLP}",
month = aug,
year = 2016,
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W16-1616",
doi = "10.18653/v1/W16-1616",
pages = "137-147"
}
@inproceedings{nakov-2016-semeval-task,
title = "{S}em{E}val-2016 Task 3: Community Question Answering",
author = "Nakov, Preslav and M{\`a}rquez, Llu{\'\i}s and Moschitti,
Alessandro and Magdy, Walid and Mubarak, Hamdy and Freihat,
Abed Alhakim and Glass, Jim and Randeree, Bilal",
booktitle = "Proceedings of the 10th International Workshop on Semantic
Evaluation ({S}em{E}val-2016)",
month = jun,
year = 2016,
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/S16-1083",
doi = "10.18653/v1/S16-1083",
pages = "525-545"
}
@inproceedings{belinkov-2015-vectorslu,
title = "{V}ector{SLU}: A Continuous Word Vector Approach to Answer
Selection in Community Question Answering Systems",
author = "Belinkov, Yonatan and Mohtarami, Mitra and Cyphers, Scott and
Glass, James",
booktitle = "Proceedings of the 9th International Workshop on Semantic
Evaluation ({S}em{E}val 2015)",
month = jun,
year = 2015,
address = "Denver, Colorado",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/S15-2048",
doi = "10.18653/v1/S15-2048",
pages = "282-287"
}
@inproceedings{nakov-2015-semeval-task,
title = "{S}em{E}val-2015 Task 3: Answer Selection in Community
Question Answering",
author = "Nakov, Preslav and M{\`a}rquez, Llu{\'\i}s and Magdy, Walid
and Moschitti, Alessandro and Glass, Jim and Randeree, Bilal",
booktitle = "Proceedings of the 9th International Workshop on Semantic
Evaluation ({S}em{E}val 2015)",
month = jun,
year = 2015,
address = "Denver, Colorado",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/S15-2047",
doi = "10.18653/v1/S15-2047",
pages = "269-281"
}
@article{loshchilov-2017-adamw,
author = "Loshchilov, Ilya and Hutter, Frank",
title = "Decoupled Weight Decay Regularization",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1711.05101v3",
abstract = "L$_2$ regularization and weight decay regularization are
equivalent for standard stochastic gradient descent (when
rescaled by the learning rate), but as we demonstrate this is
\emph{not} the case for adaptive gradient algorithms, such as
Adam. While common implementations of these algorithms employ
L$_2$ regularization (often calling it ``weight decay'' in
what may be misleading due to the inequivalence we expose),
we propose a simple modification to recover the original
formulation of weight decay regularization by
\emph{decoupling} the weight decay from the optimization
steps taken w.r.t. the loss function. We provide empirical
evidence that our proposed modification (i) decouples the
optimal choice of weight decay factor from the setting of the
learning rate for both standard SGD and Adam and (ii)
substantially improves Adam's generalization performance,
allowing it to compete with SGD with momentum on image
classification datasets (on which it was previously typically
outperformed by the latter). Our proposed decoupled weight
decay has already been adopted by many researchers, and the
community has implemented it in TensorFlow and PyTorch; the
complete source code for our experiments is available at
https://github.com/loshchil/AdamW-and-SGDW",
archivePrefix= "arXiv",
eprint = "1711.05101",
primaryClass = "cs.LG"
}
@article{wang-2014-hashining-similarity-search,
author = "Wang, Jingdong and Shen, Heng Tao and Song, Jingkuan and Ji,
Jianqiu",
title = "Hashing for Similarity Search: a Survey",
journal = "CoRR",
year = 2014,
url = "http://arxiv.org/abs/1408.2927v1",
abstract = "Similarity search (nearest neighbor search) is a problem of
pursuing the data items whose distances to a query item are
the smallest from a large database. Various methods have
been developed to address this problem, and recently a lot of
efforts have been devoted to approximate search. In this
paper, we present a survey on one of the main solutions,
hashing, which has been widely studied since the pioneering
work locality sensitive hashing. We divide the hashing
algorithms into two main categories: locality sensitive
hashing, which designs hash functions without exploring the
data distribution, and learning to hash, which learns hash
functions according to the data distribution, and review them
from various aspects, including hash function design and
distance measure and search scheme in the hash coding space.",
archivePrefix= "arXiv",
eprint = "1408.2927",
primaryClass = "cs.DS"
}
@inproceedings{yang-2008-ilp,
title = "An Entity-Mention Model for Coreference Resolution with
Inductive Logic Programming",
author = "Yang, Xiaofeng and Su, Jian and Lang, Jun and Tan, Chew Lim
and Liu, Ting and Li, Sheng",
booktitle = "Proceedings of ACL-08: HLT",
month = jun,
year = 2008,
address = "Columbus, Ohio",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P08-1096",
pages = "843-851"
}
@article{cui-2019-kbqa,
author = "Cui, Wanyun and Xiao, Yanghua and Wang, Haixun and Song,
Yangqiu and Hwang, Seung-won and Wang, Wei",
title = "Kbqa: Learning Question Answering Over Qa Corpora and
Knowledge Bases",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1903.02419v1",
abstract = "Question answering (QA) has become a popular way for humans
to access billion-scale knowledge bases. Unlike web search,
QA over a knowledge base gives out accurate and concise
results, provided that natural language questions can be
understood and mapped precisely to structured queries over
the knowledge base. The challenge, however, is that a human
can ask one question in many different ways. Previous
approaches have natural limits due to their representations:
rule based approaches only understand a small set of
``canned`` questions, while keyword based or synonym based
approaches cannot fully understand the questions. In this
paper, we design a new kind of question representation:
templates, over a billion scale knowledge base and a million
scale QA corpora. For example, for questions about a city's
population, we learn templates such as What's the population
of $city?, How many people are there in $city?. We learned 27
million templates for 2782 intents. Based on these templates,
our QA system KBQA effectively supports binary factoid
questions, as well as complex questions which are composed of
a series of binary factoid questions. Furthermore, we expand
predicates in RDF knowledge base, which boosts the coverage
of knowledge base by 57 times. Our QA system beats all other
state-of-the-art works on both effectiveness and efficiency over
QALD benchmarks.",
archivePrefix= "arXiv",
eprint = "1903.02419",
primaryClass = "cs.CL"
}
@article{bordes-2014-open-qa,
author = "Bordes, Antoine and Weston, Jason and Usunier, Nicolas",
title = "Open Question Answering With Weakly Supervised Embedding
Models",
journal = "CoRR",
year = 2014,
url = "http://arxiv.org/abs/1404.4326v1",
abstract = "Building computers able to answer questions on any subject is
a long standing goal of artificial intelligence. Promising
progress has recently been achieved by methods that learn to
map questions to logical forms or database queries. Such
approaches can be effective, but at the cost of either large
amounts of human-labeled data or lexicons and
grammars tailored by practitioners. In this paper, we instead
take the radical approach of learning to map questions to
vectorial feature representations. By mapping answers into
the same space one can query any knowledge base independent
of its schema, without requiring any grammar or lexicon. Our
method is trained with a new optimization procedure combining
stochastic gradient descent followed by a fine-tuning step
using the weak supervision provided by blending automatically
and collaboratively generated resources. We empirically
demonstrate that our model can capture meaningful signals
from its noisy supervision leading to major improvements over
paralex, the only existing method able to be trained on
similar weakly labeled data.",
archivePrefix= "arXiv",
eprint = "1404.4326",
primaryClass = "cs.CL"
}
@inproceedings{zhao-2011-auto-qg,
title = "Automatically Generating Questions from Queries for
Community-based Question Answering",
author = "Zhao, Shiqi and Wang, Haifeng and Li, Chao and Liu, Ting and
Guan, Yi",
booktitle = "Proceedings of 5th International Joint Conference on Natural
Language Processing",
month = nov,
year = 2011,
address = "Chiang Mai, Thailand",
publisher = "Asian Federation of Natural Language Processing",
url = "https://www.aclweb.org/anthology/I11-1104",
pages = "929-937"
}
@article{yuan-2017-neural-qg,
author = "Yuan, Xingdi and Wang, Tong and Gulcehre, Caglar and Sordoni,
Alessandro and Bachman, Philip and Subramanian, Sandeep and
Zhang, Saizheng and Trischler, Adam",
title = "Machine Comprehension By Text-To-Text Neural Question
Generation",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1705.02012v2",
abstract = "We propose a recurrent neural model that generates
natural-language questions from documents, conditioned on
answers. We show how to train the model using a combination
of supervised and reinforcement learning. After teacher
forcing for standard maximum likelihood training, we
fine-tune the model using policy gradient techniques to
maximize several rewards that measure question quality. Most
notably, one of these rewards is the performance of a
question-answering system. We motivate question generation as
a means to improve the performance of question answering
systems. Our model is trained and evaluated on the recent
question-answering dataset SQuAD.",
archivePrefix= "arXiv",
eprint = "1705.02012",
primaryClass = "cs.CL"
}
@article{subramanian-2017-neural-qg,
author = "Subramanian, Sandeep and Wang, Tong and Yuan, Xingdi and
Zhang, Saizheng and Bengio, Yoshua and Trischler, Adam",
title = "Neural Models for Key Phrase Detection and Question
Generation",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1706.04560v3",
abstract = "We propose a two-stage neural model to tackle question
generation from documents. First, our model estimates the
probability that word sequences in a document are ones that a
human would pick when selecting candidate answers by training
a neural key-phrase extractor on the answers in a
question-answering corpus. Predicted key phrases then act as
target answers and condition a sequence-to-sequence
question-generation model with a copy mechanism.
Empirically, our key-phrase extraction model significantly
outperforms an entity-tagging baseline and existing
rule-based approaches. We further demonstrate that our
question generation system formulates fluent, answerable
questions from key phrases. This two-stage system could be
used to augment or generate reading comprehension datasets,
which may be leveraged to improve machine reading systems or
in educational settings.",
archivePrefix= "arXiv",
eprint = "1706.04560",
primaryClass = "cs.CL"
}
@inproceedings{rao-2019-gan-qg,
title = "{A}nswer-based {A}dversarial {T}raining for {G}enerating
{C}larification {Q}uestions",
author = "Rao, Sudha and Daum{\'e} III, Hal",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long and Short
Papers)",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N19-1013",
doi = "10.18653/v1/N19-1013",
pages = "143-155",
abstract = "We present an approach for generating clarification questions
with the goal of eliciting new information that would make
the given textual context more complete. We propose that
modeling hypothetical answers (to clarification questions) as
latent variables can guide our approach into generating more
useful clarification questions. We develop a Generative
Adversarial Network (GAN) where the generator is a
sequence-to-sequence model and the discriminator is a utility
function that models the value of updating the context with
the answer to the clarification question. We evaluate on two
datasets, using both automatic metrics and human judgments of
usefulness, specificity and relevance, showing that our
approach outperforms both a retrieval-based model and
ablations that exclude the utility model and the adversarial
training."
}
@inproceedings{heilman-2010-good-question,
title = "Good question! statistical ranking for question generation",
author = "Heilman, Michael and Smith, Noah A",
booktitle = "Human Language Technologies: The 2010 Annual Conference of
the North American Chapter of the Association for
Computational Linguistics",
pages = "609-617",
year = 2010,
organization = "Association for Computational Linguistics"
}
@article{tang-2017-qa-qg-dual-task,
author = "Tang, Duyu and Duan, Nan and Qin, Tao and Yan, Zhao and Zhou,
Ming",
title = "Question Answering and Question Generation As Dual Tasks",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1706.02027v2",
abstract = "We study the problem of joint question answering (QA) and
question generation (QG) in this paper. Our intuition is
that QA and QG have intrinsic connections and these two tasks
could improve each other. On one side, the QA model judges
whether the generated question of a QG model is relevant to
the answer. On the other side, the QG model provides the
probability of generating a question given the answer, which
is useful evidence that in turn facilitates QA. In this
paper we regard QA and QG as dual tasks. We propose a
training framework that trains the models of QA and QG
simultaneously, and explicitly leverages their probabilistic
correlation to guide the training process of both models. We
implement a QG model based on sequence-to-sequence learning,
and a QA model based on recurrent neural network. As all the
components of the QA and QG models are differentiable, all
the parameters involved in these two models could be
conventionally learned with back propagation. We conduct
experiments on three datasets. Empirical results show that
our training framework improves both QA and QG tasks. The
improved QA model performs comparably with strong baseline
approaches on all three datasets.",
archivePrefix= "arXiv",
eprint = "1706.02027",
primaryClass = "cs.CL"
}
@article{wang-2017-joint-qa-qg,
author = "Wang, Tong and Yuan, Xingdi and Trischler, Adam",
title = "A Joint Model for Question Answering and Question Generation",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1706.01450v1",
abstract = "We propose a generative machine comprehension model that
learns jointly to ask and answer questions based on
documents. The proposed model uses a sequence-to-sequence
framework that encodes the document and generates a question
(answer) given an answer (question). Significant improvement
in model performance is observed empirically on the SQuAD
corpus, confirming our hypothesis that the model benefits
from jointly learning to perform both tasks. We believe the
joint model's novelty offers a new perspective on machine
comprehension beyond architectural engineering, and serves as
a first step towards autonomous information seeking.",
archivePrefix= "arXiv",
eprint = "1706.01450",
primaryClass = "cs.CL"
}
@article{yang-2017-qa-dan,
author = "Yang, Zhilin and Hu, Junjie and Salakhutdinov, Ruslan and
Cohen, William W.",
title = "Semi-Supervised Qa With Generative Domain-Adaptive Nets",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1702.02206v2",
abstract = "We study the problem of semi-supervised question
answering: utilizing unlabeled text to boost the
performance of question answering models. We propose a novel
training framework, the Generative Domain-Adaptive Nets. In
this framework, we train a generative model to generate
questions based on the unlabeled text, and combine
model-generated questions with human-generated questions for
training question answering models. We develop novel domain
adaptation algorithms, based on reinforcement learning, to
alleviate the discrepancy between the model-generated data
distribution and the human-generated data
distribution. Experiments show that our proposed framework
obtains substantial improvement from unlabeled text.",
archivePrefix= "arXiv",
eprint = "1702.02206",
primaryClass = "cs.CL"
}
@article{jegou-2011-product-quantization,
author = "Jegou, Herve and Douze, Matthijs and Schmid, Cordelia",
title = "Product Quantization for Nearest Neighbor Search",
year = 2011,
issue_date = "January 2011",
publisher = "IEEE Computer Society",
address = "USA",
volume = 33,
number = 1,
issn = "0162-8828",
url = "https://doi.org/10.1109/TPAMI.2010.57",
doi = "10.1109/TPAMI.2010.57",
journal = "IEEE Trans. Pattern Anal. Mach. Intell.",
month = jan,
pages = "117–128",
numpages = 12,
keywords = "High-dimensional indexing, image indexing, very large
databases, High-dimensional indexing, image indexing, very
large databases, approximate search., approximate search."
}
@article{tay-2018-csran,
author = "Tay, Yi and Tuan, Luu Anh and Hui, Siu Cheung",
title = "Co-Stack Residual Affinity Networks With Multi-Level
Attention Refinement for Matching Text Sequences",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1810.02938v1",
abstract = "Learning a matching function between two text sequences is a
long standing problem in NLP research. This task enables many
potential applications such as question answering and
paraphrase identification. This paper proposes Co-Stack
Residual Affinity Networks (CSRAN), a new and universal
neural architecture for this problem. CSRAN is a deep
architecture, involving stacked (multi-layered) recurrent
encoders. Stacked/Deep architectures are traditionally
difficult to train, due to the inherent weaknesses such as
difficulty with feature propagation and vanishing
gradients. CSRAN incorporates two novel components to take
advantage of the stacked architecture. Firstly, it introduces
a new bidirectional alignment mechanism that learns affinity
weights by fusing sequence pairs across stacked
hierarchies. Secondly, it leverages a multi-level attention
refinement component between stacked recurrent layers. The
key intuition is that, by leveraging information across all
network hierarchies, we can not only improve gradient flow
but also improve overall performance. We conduct extensive
experiments on six well-studied text sequence matching
datasets, achieving state-of-the-art performance on all.",
archivePrefix= "arXiv",
eprint = "1810.02938",
primaryClass = "cs.CL"
}
@inproceedings{he-2016-pairwise-word-interaction,
title = "Pairwise Word Interaction Modeling with Deep Neural Networks
for Semantic Similarity Measurement",
author = "He, Hua and Lin, Jimmy",
booktitle = "Proceedings of the 2016 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies",
month = jun,
year = 2016,
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N16-1108",
doi = "10.18653/v1/N16-1108",
pages = "937-948"
}
@article{zhang-2020-soft-masked-bert,
author = "Zhang, Shaohua and Huang, Haoran and Liu, Jicong and Li,
Hang",
title = "Spelling Error Correction With Soft-Masked Bert",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2005.07421v1",
abstract = "Spelling error correction is an important yet challenging
task because a satisfactory solution of it essentially needs
human-level language understanding ability. Without loss of
generality we consider Chinese spelling error correction
(CSC) in this paper. A state-of-the-art method for the task
selects a character from a list of candidates for correction
(including non-correction) at each position of the sentence
on the basis of BERT, the language representation model. The
accuracy of the method can be sub-optimal, however, because
BERT does not have sufficient capability to detect whether
there is an error at each position, apparently due to the way
of pre-training it using masked language modeling. In this
work, we propose a novel neural architecture to address the
aforementioned issue, which consists of a network for error
detection and a network for error correction based on BERT,
with the former being connected to the latter with what we
call soft-masking technique. Our method of using
`Soft-Masked BERT' is general, and it may be employed in
other language detection-correction problems. Experimental
results on two datasets demonstrate that the performance of
our proposed method is significantly better than the
baselines including the one solely based on BERT.",
archivePrefix= "arXiv",
eprint = "2005.07421",
primaryClass = "cs.CL"
}
@inproceedings{sarikaya-2016-cortana,
title = "An overview of end-to-end language understanding and dialog
management for personal digital assistants",
author = "Sarikaya, Ruhi and Crook, Paul A and Marin, Alex and Jeong,
Minwoo and Robichaud, Jean-Philippe and Celikyilmaz, Asli and
Kim, Young-Bum and Rochette, Alexandre and Khan, Omar Zia and
Liu, Xiaohu and others",
booktitle = "2016 ieee spoken language technology workshop (slt)",
pages = "391-397",
year = 2016,
organization = "IEEE"
}
@article{williams-2017-hcn,
author = "Williams, Jason D. and Asadi, Kavosh and Zweig, Geoffrey",
title = "Hybrid Code Networks: Practical and Efficient End-To-End
Dialog Control With Supervised and Reinforcement Learning",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1702.03274v2",
abstract = "End-to-end learning of recurrent neural networks (RNNs) is an
attractive solution for dialog systems; however, current
techniques are data-intensive and require thousands of
dialogs to learn simple behaviors. We introduce Hybrid Code
Networks (HCNs), which combine an RNN with domain-specific
knowledge encoded as software and system action
templates. Compared to existing end-to-end approaches, HCNs
considerably reduce the amount of training data required,
while retaining the key benefit of inferring a latent
representation of dialog state. In addition, HCNs can be
optimized with supervised learning, reinforcement learning,
or a mixture of both. HCNs attain state-of-the-art
performance on the bAbI dialog dataset, and outperform two
commercially deployed customer-facing dialog systems.",
archivePrefix= "arXiv",
eprint = "1702.03274",
primaryClass = "cs.AI"
}
@article{anh-2017-hybrid-bi-lstm-crf,
author = "Anh, L. T. and Arkhipov, M. Y. and Burtsev, M. S.",
title = "Application of a Hybrid Bi-Lstm-Crf Model To the Task of
Russian Named Entity Recognition",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1709.09686v2",
abstract = "Named Entity Recognition (NER) is one of the most common
tasks of the natural language processing. The purpose of NER
is to find and classify tokens in text documents into
predefined categories called tags, such as person names,
quantity expressions, percentage expressions, names of
locations, organizations, as well as expression of time,
currency and others. Although a number of approaches have
been proposed for this task in the Russian language, there is
still substantial potential for better
solutions. In this work, we studied several deep neural
network models starting from vanilla Bi-directional Long
Short-Term Memory (Bi-LSTM) then supplementing it with
Conditional Random Fields (CRF) as well as highway networks
and finally adding external word embeddings. All models were
evaluated across three datasets: Gareev's dataset,
Person-1000, FactRuEval-2016. We found that extension of
Bi-LSTM model with CRF significantly increased the quality of
predictions. Encoding input tokens with external word
embeddings reduced training time and allowed us to achieve state
of the art for the Russian NER task.",
archivePrefix= "arXiv",
eprint = "1709.09686",
primaryClass = "cs.CL"
}
@article{le-2019-deep-neural,
author = "Lê, Thế Anh",
year = 2019,
month        = feb,
title        = "A Deep Neural Network Model for the Task of Named Entity
                 Recognition",
volume = 9,
journal = "International Journal of Machine Learning and Computing",
doi = "10.18178/ijmlc.2019.9.1.758"
}
@inproceedings{le-2020-sla-to-sbd,
author = "Le, The Anh",
title = "Sequence Labeling Approach to the Task of Sentence Boundary
Detection",
year = 2020,
isbn = 9781450376310,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/3380688.3380703",
doi = "10.1145/3380688.3380703",
booktitle = "Proceedings of the 4th International Conference on Machine
Learning and Soft Computing",
pages = "144–148",
numpages = 5,
keywords = "voice-enabled chatbot, sequence labeling, Sentence boundary
detection",
location = "Haiphong City, Viet Nam",
series = "ICMLSC 2020"
}
@article{gao-2018-neural-conversation,
author = "Gao, Jianfeng and Galley, Michel and Li, Lihong",
title = "Neural Approaches To Conversational Ai",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1809.08267v3",
abstract = "The present paper surveys neural approaches to conversational
AI that have been developed in the last few years. We group
conversational systems into three categories: (1) question
answering agents, (2) task-oriented dialogue agents, and (3)
chatbots. For each category, we present a review of
state-of-the-art neural approaches, draw the connection
between them and traditional approaches, and discuss the
progress that has been made and challenges still being faced,
using specific systems and models as case studies.",
archivePrefix= "arXiv",
eprint = "1809.08267",
primaryClass = "cs.CL"
}
@inproceedings{kurata-2016-sentence-level-slot-filling,
title = "Leveraging Sentence-level Information with Encoder {LSTM} for
Semantic Slot Filling",
author = "Kurata, Gakuto and Xiang, Bing and Zhou, Bowen and Yu, Mo",
booktitle = "Proceedings of the 2016 Conference on Empirical Methods in
Natural Language Processing",
month = nov,
year = 2016,
address = "Austin, Texas",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D16-1223",
doi = "10.18653/v1/D16-1223",
pages = "2077-2083"
}
@article{jaech-2016-da-for-nlu,
author = "Jaech, Aaron and Heck, Larry and Ostendorf, Mari",
title = "Domain Adaptation of Recurrent Neural Networks for Natural
Language Understanding",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1604.00117v2",
abstract = "The goal of this paper is to use multi-task learning to
efficiently scale slot filling models for natural language
understanding to handle multiple target tasks or domains. The
key to scalability is reducing the amount of training data
needed to learn a model for a new task. The proposed
multi-task model delivers better performance with less data
by leveraging patterns that it learns from the other
tasks. The approach supports an open vocabulary, which allows
the models to generalize to unseen words, which is
particularly important when very little training data is
used. A newly collected crowd-sourced data set, covering four
different domains, is used to demonstrate the effectiveness
of the domain adaptation and open vocabulary techniques.",
archivePrefix= "arXiv",
eprint = "1604.00117",
primaryClass = "cs.CL"
}
@inproceedings{tafforeau-2016-multitask-slu,
title = "Joint Syntactic and Semantic Analysis with a Multitask Deep
Learning Framework for Spoken Language Understanding",
author = "J{\'e}r{\'e}mie Tafforeau and Fr{\'e}d{\'e}ric B{\'e}chet and
Thierry Arti{\`e}res and Beno{\^i}t Favre",
booktitle = "INTERSPEECH",
year = 2016
}
@InProceedings{hakkani-tur-2016-joint-semantic-frame-parsing,
author = "Hakkani-Tür, Dilek and Tur, Gokhan and Celikyilmaz, Asli and
Chen, Yun-Nung Vivian and Gao, Jianfeng and Deng, Li and
Wang, Ye-Yi",
title = "Multi-Domain Joint Semantic Frame Parsing using
Bi-directional RNN-LSTM",
booktitle = "Proceedings of The 17th Annual Meeting of the International
Speech Communication Association (INTERSPEECH 2016)",
year = 2016,
month = "June",
abstract = "Sequence-to-sequence deep learning has recently emerged as a
new paradigm in supervised learning for spoken language
understanding. However, most of the previous studies explored
this framework for building single domain models for each
task, such as slot filling or domain classification,
comparing deep learning based approaches with conventional
ones like conditional random fields. This paper proposes a
holistic multi-domain, multi-task (i.e. slot filling, domain
and intent detection) modeling approach to estimate complete
semantic frames for all user utterances addressed to a
conversational system, demonstrating the distinctive power of
deep learning methods, namely bi-directional recurrent neural
network (RNN) with long-short term memory (LSTM) cells
(RNN-LSTM) to handle such complexity. The contributions of
the presented work are three-fold: (i) we propose an RNN-LSTM
architecture for joint modeling of slot filling, intent
determination, and domain classification; (ii) we build a
joint multi-domain model enabling multi-task deep learning
where the data from each domain reinforces each other; (iii)
we investigate alternative architectures for modeling lexical
context in spoken language understanding. In addition to the
simplicity of the single model framework, experimental
results show the power of such an approach on Microsoft
Cortana real user data over alternative methods based on
single domain/task deep learning.",
publisher = "ISCA",
url          = "https://www.microsoft.com/en-us/research/publication/multijoint/",
edition = "Proceedings of The 17th Annual Meeting of the International
Speech Communication Association (INTERSPEECH 2016)"
}
@article{liu-2016-joint-intent-detection-slot-filling,
author = "Liu, Bing and Lane, Ian",
title = "Attention-Based Recurrent Neural Network Models for Joint
Intent Detection and Slot Filling",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1609.01454v1",
abstract = "Attention-based encoder-decoder neural network models have
recently shown promising results in machine translation and
speech recognition. In this work, we propose an
attention-based neural network model for joint intent
detection and slot filling, both of which are critical steps
for many speech understanding and dialog systems. Unlike in
machine translation and speech recognition, alignment is
explicit in slot filling. We explore different strategies in
incorporating this alignment information into the
encoder-decoder framework. Learning from the attention
mechanism in encoder-decoder model, we further propose
introducing attention to the alignment-based RNN models. Such
attentions provide additional information to the intent
classification and slot label prediction. Our independent
task models achieve state-of-the-art intent detection error
rate and slot filling F1 score on the benchmark ATIS
task. Our joint training model further obtains 0.56 \%
absolute (23.8 \% relative) error reduction on intent
detection and 0.23 \% absolute gain on slot filling over the
independent task models.",
archivePrefix= "arXiv",
eprint = "1609.01454",
primaryClass = "cs.CL"
}
@inproceedings{hori-2016-contextual-slu,
title = "Context-Sensitive and Role-Dependent Spoken Language
Understanding Using Bidirectional and Attention LSTMs",
author = "Chiori Hori and Takaaki Hori and Shinji Watanabe and John
R. Hershey",
booktitle = "INTERSPEECH",
year = 2016
}
@INPROCEEDINGS{bhargava-2013-easy-slot-detection,
author = "A. {Bhargava} and A. {Celikyilmaz} and D. {Hakkani-Tür} and
R. {Sarikaya}",
booktitle = "2013 IEEE International Conference on Acoustics, Speech and
Signal Processing",
title = "Easy contextual intent prediction and slot detection",
year = 2013,
pages = "8337-8341"
}
@InProceedings{chen-2016-mm-for-slu,
author = "Chen, Yun-Nung Vivian and Hakkani-Tür, Dilek and Tur, Gokhan
and Gao, Jianfeng and Deng, Li",
title = "End-to-End Memory Networks with Knowledge Carryover for
Multi-Turn Spoken Language Understanding",
booktitle = "Proceedings of The 17th Annual Meeting of the International
Speech Communication Association (INTERSPEECH 2016)",
year = 2016,
month = "June",
abstract = "Spoken language understanding (SLU) is a core component of a
spoken dialogue system. In the traditional architecture of
dialogue systems, the SLU component treats each utterance
independently of the others, and then the following components
aggregate the multi-turn information in the separate
phases. However, there are two challenges: 1) errors from
previous turns may be propagated and then degrade the
performance of the current turn; 2) knowledge mentioned in
the long history may not be carried into the current
turn. This paper addresses the above issues by proposing an
architecture using end-to-end memory networks to model
knowledge carryover in multi-turn conversations, where
utterances encoded with intents and slots can be stored as
embeddings in the memory and the decoding phase applies an
attention model to leverage previously stored semantics for
intent prediction and slot tagging simultaneously. The
experiments on Microsoft Cortana conversational data show
that the proposed memory network architecture can effectively
extract salient semantics for modeling knowledge carryover in
the multi-turn conversations and outperform the results using
the state-of-the-art recurrent neural network framework (RNN)
designed for single-turn SLU.",
publisher = "ISCA",
url          = "https://www.microsoft.com/en-us/research/publication/contextualslu/",
edition = "Proceedings of The 17th Annual Meeting of the International
Speech Communication Association (INTERSPEECH 2016)"
}
@inproceedings{bapna-2017-sequential-dialogue,
title = "Sequential Dialogue Context Modeling for Spoken Language
Understanding",
author = "Bapna, Ankur and T{\"u}r, Gokhan and Hakkani-T{\"u}r, Dilek
and Heck, Larry",
booktitle = "Proceedings of the 18th Annual {SIG}dial Meeting on Discourse
and Dialogue",
month = aug,
year = 2017,
address = "Saarbr{\"u}cken, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-5514",
doi = "10.18653/v1/W17-5514",
pages = "103-114",
abstract = "Spoken Language Understanding (SLU) is a key component of
goal oriented dialogue systems that would parse user
utterances into semantic frame representations. Traditionally
SLU does not utilize the dialogue history beyond the previous
system turn and contextual ambiguities are resolved by the
downstream components. In this paper, we explore novel
approaches for modeling dialogue context in a recurrent
neural network (RNN) based language understanding system. We
propose the Sequential Dialogue Encoder Network, that allows
encoding context from the dialogue history in chronological
order. We compare the performance of our proposed
architecture with two context models, one that uses just the
previous turn context and another that encodes dialogue
context in a memory network, but loses the order of
utterances in the dialogue history. Experiments with a
multi-domain dialogue dataset demonstrate that the proposed
architecture results in reduced semantic frame error rates."
}
@article{chen-2016-k-san,
author = "Chen, Yun-Nung and Hakkani-Tur, Dilek and Tur, Gokhan and
Celikyilmaz, Asli and Gao, Jianfeng and Deng, Li",
title = "Knowledge As a Teacher: Knowledge-Guided Structural Attention
Networks",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1609.03286v1",
abstract = "Natural language understanding (NLU) is a core component of a
spoken dialogue system. Recently recurrent neural networks
(RNN) obtained strong results on NLU due to their superior
ability of preserving sequential information over time.
Traditionally, the NLU module tags semantic slots for
utterances considering their flat structures, as the
underlying RNN structure is a linear chain. However, natural
language exhibits linguistic properties that provide rich,
structured information for better understanding. This paper
introduces a novel model, knowledge-guided structural
attention networks (K-SAN), a generalization of RNN to
additionally incorporate non-flat network topologies guided
by prior knowledge. There are two characteristics: 1)
important substructures can be captured from small training
data, allowing the model to generalize to previously unseen
test data; 2) the model automatically figures out the salient
substructures that are essential to predict the semantic tags
of the given sentences, so that the understanding performance
can be improved. The experiments on the benchmark Air Travel
Information System (ATIS) data show that the proposed K-SAN
architecture can effectively extract salient knowledge from
substructures with an attention mechanism, and outperform the
performance of the state-of-the-art neural network based
frameworks.",
archivePrefix= "arXiv",
eprint = "1609.03286",
primaryClass = "cs.AI"
}
@article{li-2017-lu-importance,
author = "Li, Xiujun and Chen, Yun-Nung and Li, Lihong and Gao,
Jianfeng and Celikyilmaz, Asli",
title = "Investigation of Language Understanding Impact for
Reinforcement Learning Based Dialogue Systems",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1703.07055v1",
abstract = "Language understanding is a key component in a spoken
dialogue system. In this paper, we investigate how the
language understanding module influences the dialogue system
performance by conducting a series of systematic experiments
on a task-oriented neural dialogue system in a reinforcement
learning based setting. The empirical study shows that among
different types of language understanding errors, slot-level
errors can have more impact on the overall performance of a
dialogue system compared to intent-level errors. In addition,
our experiments demonstrate that the reinforcement learning
based dialogue system is able to learn when and what to
confirm in order to achieve better performance and greater
robustness.",
archivePrefix= "arXiv",
eprint = "1703.07055",
primaryClass = "cs.CL"
}
@inproceedings{henderson-2013-nn-for-dst,
title = "Deep Neural Network Approach for the Dialog State Tracking
Challenge",
author = "Henderson, Matthew and Thomson, Blaise and Young, Steve",
booktitle = "Proceedings of the {SIGDIAL} 2013 Conference",
month = aug,
year = 2013,
address = "Metz, France",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W13-4073",
pages = "467-471"
}
@inproceedings{henderson-2015-ml-for-dst,
title = "Machine Learning for Dialog State Tracking: A Review",
author = "Matthew Henderson",
year = 2015,
booktitle = "Proceedings of The First International Workshop on Machine
Learning in Spoken Language Processing"
}
@article{mrksic-2015-rnn-for-dst,
author = "Mrk{\v{s}}i{\'c}, Nikola and S{\'e}aghdha, Diarmuid {\'O} and
Thomson, Blaise and Ga{\v{s}}i{\'c}, Milica and Su, Pei-Hao
and Vandyke, David and Wen, Tsung-Hsien and Young, Steve",
title = "Multi-Domain Dialog State Tracking Using Recurrent Neural
Networks",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1506.07190v1",
abstract = "Dialog state tracking is a key component of many modern
dialog systems, most of which are designed with a single,
well-defined domain in mind. This paper shows that dialog
data drawn from different dialog domains can be used to train
a general belief tracking model which can operate across all
of these domains, exhibiting superior performance to each of
the domain-specific models. We propose a training procedure
which uses out-of-domain data to initialise belief tracking
models for entirely new domains. This procedure leads to
improvements in belief tracking performance regardless of the
amount of in-domain data available for training the model.",
archivePrefix= "arXiv",
eprint = "1506.07190",
primaryClass = "cs.CL"
}
@article{mrksic-2016-neural-belief-tracker,
author = "Mrk{\v{s}}i{\'c}, Nikola and S{\'e}aghdha, Diarmuid {\'O} and
Wen, Tsung-Hsien and Thomson, Blaise and Young, Steve",
title = "Neural Belief Tracker: Data-Driven Dialogue State Tracking",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1606.03777v2",
abstract = "One of the core components of modern spoken dialogue systems
is the belief tracker, which estimates the user's goal at
every step of the dialogue. However, most current approaches
have difficulty scaling to larger, more complex dialogue
domains. This is due to their dependency on either: a) Spoken
Language Understanding models that require large amounts of
annotated training data; or b) hand-crafted lexicons for
capturing some of the linguistic variation in users'
language. We propose a novel Neural Belief Tracking (NBT)
framework which overcomes these problems by building on
recent advances in representation learning. NBT models reason
over pre-trained word vectors, learning to compose them into
distributed representations of user utterances and dialogue
context. Our evaluation on two datasets shows that this
approach surpasses past limitations, matching the performance
of state-of-the-art models which rely on hand-crafted
semantic lexicons and outperforming them when such lexicons
are not provided.",
archivePrefix= "arXiv",
eprint = "1606.03777",
primaryClass = "cs.CL"
}
@article{shi-2017-cnn-for-dst,
author = "Shi, Hongjie and Ushio, Takashi and Endo, Mitsuru and
Yamagami, Katsuyoshi and Horii, Noriaki",
title = "A Multichannel Convolutional Neural Network for
Cross-Language Dialog State Tracking",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1701.06247v1",
abstract = "The fifth Dialog State Tracking Challenge (DSTC5) introduces
a new cross-language dialog state tracking scenario, where
the participants are asked to build their trackers based on
the English training corpus, while evaluating them with the
unlabeled Chinese corpus. Although the computer-generated
translations for both English and Chinese corpus are provided
in the dataset, these translations contain errors and
careless use of them can easily hurt the performance of the
built trackers. To address this problem, we propose a
multichannel Convolutional Neural Networks (CNN)
architecture, in which we treat English and Chinese language
as different input channels of one single CNN model. In the
evaluation of DSTC5, we found that such multichannel
architecture can effectively improve the robustness against
translation errors. Additionally, our method for DSTC5 is
purely machine learning based and requires no prior knowledge
about the target language. We consider this a desirable
property for building a tracker in the cross-language
context, as not every developer will be familiar with both
languages.",
archivePrefix= "arXiv",
eprint = "1701.06247",
primaryClass = "cs.CL"
}
@article{nguyen-2017-kbc-overview,
author = "Nguyen, Dat Quoc",
title = "An Overview of Embedding Models of Entities and Relationships
for Knowledge Base Completion",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1703.08098v7",
abstract = "Knowledge bases (KBs) of real-world facts about entities and
their relationships are useful resources for a variety of
natural language processing tasks. However, because knowledge
bases are typically incomplete, it is useful to be able to
perform knowledge base completion or link prediction, i.e.,
predict whether a relationship not in the knowledge base is
likely to be true. This paper serves as a comprehensive
overview of embedding models of entities and relationships
for knowledge base completion, summarizing up-to-date
experimental results on standard benchmark datasets.",
archivePrefix= "arXiv",
eprint = "1703.08098",
primaryClass = "cs.CL"
}
@article{li-2019-bertsel,
author = "Li, Dongfang and Yu, Yifei and Chen, Qingcai and Li, Xinyu",
title = "Bertsel: Answer Selection With Pre-Trained Models",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1905.07588v1",
abstract = "Recently, pre-trained models have been the dominant paradigm
in natural language processing. They achieved remarkable
state-of-the-art performance across a wide range of related
tasks, such as textual entailment, natural language
inference, question answering, etc. BERT, proposed by Devlin
et al., has achieved a marked result on the GLUE
leaderboard with a deep transformer architecture. Despite its
soaring popularity, however, BERT has not yet been applied to
answer selection. This task is different from others with a
few nuances: first, modeling the relevance and correctness of
candidates matters compared to semantic relatedness and
syntactic structure; second, the length of an answer may be
different from other candidates and questions. In this paper,
we are the first to explore the performance of fine-tuning
BERT for answer selection. We achieved state-of-the-art results across
five popular datasets, demonstrating the success of
pre-trained models in this task.",
archivePrefix= "arXiv",
eprint = "1905.07588",
primaryClass = "cs.CL"
}
@inproceedings{lai-2019-gsamn,
title = "A Gated Self-attention Memory Network for Answer Selection",
author = "Lai, Tuan and Tran, Quan Hung and Bui, Trung and Kihara,
Daisuke",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = 2019,
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D19-1610",
doi = "10.18653/v1/D19-1610",
pages = "5953-5959",
abstract = "Answer selection is an important research problem, with
applications in many areas. Previous deep learning based
approaches for the task mainly adopt the Compare-Aggregate
architecture that performs word-level comparison followed by
aggregation. In this work, we take a departure from the
popular Compare-Aggregate architecture, and instead, propose
a new gated self-attention memory network for the
task. Combined with a simple transfer learning technique from
a large-scale online corpus, our model outperforms previous
methods by a large margin, achieving new state-of-the-art
results on two standard answer selection datasets: TrecQA and
WikiQA."
}
@article{mozafari-2019-bas,
author = "Mozafari, Jamshid and Fatemi, Afsaneh and Nematbakhsh,
Mohammad Ali",
title = "Bas: an Answer Selection Method Using Bert Language Model",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1911.01528v3",
abstract = "In recent years, Question Answering systems have become more
popular and widely used by users. Despite the increasing
popularity of these systems, their performance is not yet
sufficient for textual data and requires further
research. These systems consist of several parts, one of
which is the Answer Selection component. This component
detects the most relevant answer from a list of candidate
answers. The methods presented in previous research have
attempted to provide an independent model to undertake the
answer-selection task. An independent model cannot comprehend
the syntactic and semantic features of questions and answers
with a small training dataset. To fill this gap, language
models can be employed in implementing the answer selection
part. This action enables the model to have a better
understanding of the language in order to understand
questions and answers better than previous works. In this
research, we will present ``BAS'' (BERT Answer
Selection), which uses the BERT language model to comprehend
language. The empirical results of applying the model on the
TrecQA Raw, TrecQA Clean, and WikiQA datasets demonstrate
that using a robust language model such as BERT can enhance
the performance. Using a more robust classifier also enhances
the effect of the language model on the answer selection
component. The results demonstrate that language
comprehension is an essential requirement in natural language
processing tasks such as answer-selection.",
archivePrefix= "arXiv",
eprint = "1911.01528",
primaryClass = "cs.CL"
}
@article{sun-2019-dream,
author = "Sun, Kai and Yu, Dian and Chen, Jianshu and Yu, Dong and
Choi, Yejin and Cardie, Claire",
title = "Dream: a Challenge Dataset and Models for Dialogue-Based
Reading Comprehension",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1902.00164v1",
abstract = "We present DREAM, the first dialogue-based multiple-choice
reading comprehension dataset. Collected from
English-as-a-foreign-language examinations designed by human
experts to evaluate the comprehension level of Chinese
learners of English, our dataset contains 10,197
multiple-choice questions for 6,444 dialogues. In contrast to
existing reading comprehension datasets, DREAM is the first
to focus on in-depth multi-turn multi-party dialogue
understanding. DREAM is likely to present significant
challenges for existing reading comprehension systems: 84 \%
of answers are non-extractive, 85 \% of questions require
reasoning beyond a single sentence, and 34 \% of questions
also involve commonsense knowledge. We apply several popular
neural reading comprehension models that primarily exploit
surface information within the text and find them to, at
best, just barely outperform a rule-based approach. We next
investigate the effects of incorporating dialogue structure
and different kinds of general world knowledge into both
rule-based and (neural and non-neural) machine learning-based
reading comprehension models. Experimental results on the
DREAM dataset show the effectiveness of dialogue structure
and general world knowledge. DREAM will be available at
https://dataset.org/dream/.",
archivePrefix= "arXiv",
eprint = "1902.00164",
primaryClass = "cs.CL"
}
@article{sun-2019-c3,
author = "Sun, Kai and Yu, Dian and Yu, Dong and Cardie, Claire",
title = "Investigating Prior Knowledge for Challenging Chinese Machine
Reading Comprehension",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1904.09679v3",
abstract = "Machine reading comprehension tasks require a machine reader
to answer questions relevant to the given document. In this
paper, we present the first free-form multiple-Choice Chinese
machine reading Comprehension dataset (C^3), containing
13,369 documents (dialogues or more formally written
mixed-genre texts) and their associated 19,577
multiple-choice free-form questions collected from
Chinese-as-a-second-language examinations. We present a
comprehensive analysis of the prior knowledge (i.e.,
linguistic, domain-specific, and general world knowledge)
needed for these real-world problems. We implement rule-based
and popular neural methods and find that there is still a
significant performance gap between the best performing model
(68.5 \%) and human readers (96.0 \%), especially on problems
that require prior knowledge. We further study the effects of
distractor plausibility and data augmentation based on
translated relevant datasets for English on model
performance. We expect C^3 to present great challenges to
existing systems as answering 86.8 \% of questions requires
both knowledge within and beyond the accompanying document,
and we hope that C^3 can serve as a platform to study how to
leverage various kinds of prior knowledge to better
understand a given written or orally oriented text. C^3 is
available at https://dataset.org/c3/.",
archivePrefix= "arXiv",
eprint = "1904.09679",
primaryClass = "cs.CL"
}
@article{yu-2020-dialogre,
author = "Yu, Dian and Sun, Kai and Cardie, Claire and Yu, Dong",
title = "Dialogue-Based Relation Extraction",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2004.08056v1",
abstract = "We present the first human-annotated dialogue-based relation
extraction (RE) dataset DialogRE, aiming to support the
prediction of relation(s) between two arguments that appear
in a dialogue. We further offer DialogRE as a platform for
studying cross-sentence RE as most facts span multiple
sentences. We argue that speaker-related information plays a
critical role in the proposed task, based on an analysis of
similarities and differences between dialogue-based and
traditional RE tasks. Considering the timeliness of
communication in a dialogue, we design a new metric to
evaluate the performance of RE methods in a conversational
setting and investigate the performance of several
representative RE methods on DialogRE. Experimental results
demonstrate that a speaker-aware extension on the
best-performing model leads to gains in both the standard and
conversational evaluation settings. DialogRE is available at
https://dataset.org/dialogre/.",
archivePrefix= "arXiv",
eprint = "2004.08056",
primaryClass = "cs.CL"
}
@inproceedings{ratner-2018-snorkl-metal,
title = "Snorkel metal: Weak supervision for multi-task learning",
author = "Ratner, Alex and Hancock, Braden and Dunnmon, Jared and
Goldman, Roger and R{\'e}, Christopher",
booktitle = "Proceedings of the Second Workshop on Data Management for
End-To-End Machine Learning",
pages = "1-4",
year = 2018
}
@article{ratner-2018-snorkl-metal-1,
author = "Ratner, Alexander and Hancock, Braden and Dunnmon, Jared and
Sala, Frederic and Pandey, Shreyash and R{\'e}, Christopher",
title = "Training Complex Models With Multi-Task Weak Supervision",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1810.02840v2",
abstract = "As machine learning models continue to increase in
complexity, collecting large hand-labeled training sets has
become one of the biggest roadblocks in practice. Instead,
weaker forms of supervision that provide noisier but cheaper
labels are often used. However, these weak supervision
sources have diverse and unknown accuracies, may output
correlated labels, and may label different tasks or apply at
different levels of granularity. We propose a framework for
integrating and modeling such weak supervision sources by
viewing them as labeling different related sub-tasks of a
problem, which we refer to as the multi-task weak supervision
setting. We show that by solving a matrix completion-style
problem, we can recover the accuracies of these multi-task
sources given their dependency structure, but without any
labeled data, leading to higher-quality supervision for
training an end model. Theoretically, we show that the
generalization error of models trained with this approach
improves with the number of unlabeled data points, and
characterize the scaling with respect to the task and
dependency structures. On three fine-grained classification
problems, we show that our approach leads to average gains of
20.2 points in accuracy over a traditional supervised
approach, 6.8 points over a majority vote baseline, and 4.1
points over a previously proposed weak supervision method
that models tasks separately.",
archivePrefix= "arXiv",
eprint = "1810.02840",
primaryClass = "stat.ML"
}
@article{gong-2017-ruminating-reader,
author = "Gong, Yichen and Bowman, Samuel R.",
title = "Ruminating Reader: Reasoning With Gated Multi-Hop Attention",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1704.07415v1",
abstract = "To answer the question in machine comprehension (MC) task,
the models need to establish the interaction between the
question and the context. To tackle the problem that the
single-pass model cannot reflect on and correct its answer,
we present Ruminating Reader. Ruminating Reader adds a second
pass of attention and a novel information fusion component to
the Bi-Directional Attention Flow model (BiDAF). We propose
novel layer structures that construct a query-aware context
vector representation and fuse encoding representation with
intermediate representation on top of BiDAF model. We show
that a multi-hop attention mechanism can be applied to a
bi-directional attention structure. In experiments on SQuAD,
we find that the Reader outperforms the BiDAF baseline by a
substantial margin, and matches or surpasses the performance
of all other published systems.",
archivePrefix= "arXiv",
eprint = "1704.07415",
primaryClass = "cs.CL"
}
@inproceedings{williams-2018-multinli,
title = "A Broad-Coverage Challenge Corpus for Sentence Understanding
through Inference",
author = "Williams, Adina and Nangia, Nikita and Bowman, Samuel",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics:
Human Language Technologies, Volume 1 (Long Papers)",
month = jun,
year = 2018,
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/N18-1101",
doi = "10.18653/v1/N18-1101",
pages = "1112-1122",
abstract = "This paper introduces the Multi-Genre Natural Language
Inference (MultiNLI) corpus, a dataset designed for use in
the development and evaluation of machine learning models for
sentence understanding. At 433k examples, this resource is
one of the largest corpora available for natural language
inference (a.k.a. recognizing textual entailment), improving
upon available resources in both its coverage and
difficulty. MultiNLI accomplishes this by offering data from
ten distinct genres of written and spoken English, making it
possible to evaluate systems on nearly the full complexity of
the language, while supplying an explicit setting for
evaluating cross-genre domain adaptation. In addition, an
evaluation using existing machine learning models designed
for the Stanford NLI corpus shows that it represents a
substantially more difficult task than does that corpus,
despite the two showing similar levels of inter-annotator
agreement."
}
@inproceedings{tomar-2017-decatt,
title = "Neural Paraphrase Identification of Questions with Noisy
Pretraining",
author = "Tomar, Gaurav Singh and Duque, Thyago and T{\"a}ckstr{\"o}m,
Oscar and Uszkoreit, Jakob and Das, Dipanjan",
booktitle = "Proceedings of the First Workshop on Subword and Character
Level Models in {NLP}",
month = sep,
year = 2017,
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W17-4121",
doi = "10.18653/v1/W17-4121",
pages = "142-147",
abstract = "We present a solution to the problem of paraphrase
identification of questions. We focus on a recent dataset of
question pairs annotated with binary paraphrase labels and
show that a variant of the decomposable attention model
(replacing the word embeddings of the decomposable attention
model of Parikh et al. 2016 with character n-gram
representations) results in accurate performance on this
task, while being far simpler than many competing neural
architectures. Furthermore, when the model is pretrained on a
noisy dataset of automatically collected question
paraphrases, it obtains the best reported performance on the
dataset."
}
@ARTICLE{wang-2020-match2,
author = "{Wang}, Zizhen and {Fan}, Yixing and {Guo}, Jiafeng and
{Yang}, Liu and {Zhang}, Ruqing and {Lan}, Yanyan and
{Cheng}, Xueqi and {Jiang}, Hui and {Wang}, Xiaozhao",
title = "{Match$^2$: A Matching over Matching Model for Similar
Question Identification}",
journal = "arXiv e-prints",
keywords = "Computer Science - Information Retrieval, Computer Science -
Computation and Language",
year = 2020,
month = jun,
eid = "arXiv:2006.11719",
pages = "arXiv:2006.11719",
archivePrefix= "arXiv",
eprint = "2006.11719",
primaryClass = "cs.IR",
adsurl = "https://ui.adsabs.harvard.edu/abs/2020arXiv200611719W",
adsnote = "Provided by the SAO/NASA Astrophysics Data System"
}
@inproceedings{gupta-2019-faq-attentive-matching,
author = "Gupta, Sparsh and Carvalho, Vitor R.",
title = "FAQ Retrieval Using Attentive Matching",
year = 2019,
isbn = 9781450361729,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/3331184.3331294",
doi = "10.1145/3331184.3331294",
booktitle = "Proceedings of the 42nd International ACM SIGIR Conference on
Research and Development in Information Retrieval",
pages = "929–932",
numpages = 4,
keywords = "neural networks, attention mechanism, learning to rank",
location = "Paris, France",
series = "SIGIR’19"
}
@inproceedings{ji-2012-qa-topic-model,
author = "Ji, Zongcheng and Xu, Fei and Wang, Bin and He, Ben",
title = "Question-Answer Topic Model for Question Retrieval in
Community Question Answering",
year = 2012,
isbn = 9781450311564,
publisher = "Association for Computing Machinery",
address = "New York, NY, USA",
url = "https://doi.org/10.1145/2396761.2398669",
doi = "10.1145/2396761.2398669",
booktitle = "Proceedings of the 21st ACM International Conference on
Information and Knowledge Management",
pages = "2471–2474",
numpages = 4,
keywords = "community question answering, question-answer topic model,
question retrieval, topic model, translation model",
location = "Maui, Hawaii, USA",
series = "CIKM ’12"
}
@article{sakata-2019-faq-retrieval,
author = "Sakata, Wataru and Shibata, Tomohide and Tanaka, Ribeka and
Kurohashi, Sadao",
title = "Faq Retrieval Using Query-Question Similarity and Bert-Based
Query-Answer Relevance",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1905.02851v2",
abstract = "Frequently Asked Question (FAQ) retrieval is an important
task where the objective is to retrieve an appropriate
Question-Answer (QA) pair from a database based on a user's
query. We propose a FAQ retrieval system that considers the
similarity between a user's query and a question as well as
the relevance between the query and an answer. Although a
common approach to FAQ retrieval is to construct labeled data
for training, it takes annotation costs. Therefore, we use a
traditional unsupervised information retrieval system to
calculate the similarity between the query and question. On
the other hand, the relevance between the query and answer
can be learned by using QA pairs in a FAQ database. The
recently-proposed BERT model is used for the relevance
calculation. Since the number of QA pairs in FAQ page is not
enough to train a model, we cope with this issue by
leveraging FAQ sets that are similar to the one in
question. We evaluate our approach on two datasets. The first
one is localgovFAQ, a dataset we construct in a Japanese
administrative municipality domain. The second is
StackExchange dataset, which is the public dataset in
English. We demonstrate that our proposed method outperforms
baseline methods on these datasets.",
archivePrefix= "arXiv",
eprint = "1905.02851",
primaryClass = "cs.IR"
}
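
A schematic of the two-signal design in sakata-2019-faq-retrieval, under stand-in components: TF-IDF cosine replaces the unsupervised retrieval engine used for query-question similarity, and the BERT-based query-answer relevance model is stubbed to a constant. The names faq_questions, qa_relevance, and alpha are illustrative.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

faq_questions = ["How do I reset my password?", "Where can I pay my bill?"]
faq_answers = ["Use the reset link on the login page.", "Pay at city hall."]

vectorizer = TfidfVectorizer().fit(faq_questions)

def qa_relevance(query: str, answer: str) -> float:
    return 0.0  # placeholder for a trained query-answer relevance model

def rank(query: str, alpha: float = 0.5):
    q_vec = vectorizer.transform([query])
    q_sim = cosine_similarity(q_vec, vectorizer.transform(faq_questions))[0]
    scores = [alpha * s + (1 - alpha) * qa_relevance(query, a)
              for s, a in zip(q_sim, faq_answers)]
    return sorted(zip(scores, faq_questions), reverse=True)

print(rank("forgot my password"))
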
@InProceedings{damani-2020-optimized-transformer-faq,
author = "Damani, Sonam and Narahari, Kedhar Nath and Chatterjee,
Ankush and Gupta, Manish and Agrawal, Puneet",
editor = "Lauw, Hady W. and Wong, Raymond Chi-Wing and Ntoulas,
Alexandros and Lim, Ee-Peng and Ng, See-Kiong and Pan, Sinno
Jialin",
title = "Optimized Transformer Models for FAQ Answering",
booktitle = "Advances in Knowledge Discovery and Data Mining",
year = 2020,
publisher = "Springer International Publishing",
address = "Cham",
pages = "235-248",
abstract = "Informational chatbots provide a highly effective medium for
improving operational efficiency in answering customer
queries for any enterprise. Chatbots are also preferred by
users/customers since unlike other alternatives like calling
customer care or browsing over FAQ pages, chatbots provide
instant responses, are easy to use, are less invasive and are
always available. In this paper, we discuss the problem of
FAQ answering which is central to designing a retrieval-based
informational chatbot. Given a set of FAQ pages s for an
enterprise, and a user query, we need to find the best
matching question-answer pairs from s. Building such a
semantic ranking system that works well across domains for
large QA databases with low runtime and model size is
challenging. Previous work based on feature engineering or
recurrent neural models either provides low accuracy or
incurs high runtime costs. We experiment with multiple
transformer based deep learning models, and also propose a
novel MT-DNN (Multi-task Deep Neural Network)-based
architecture, which we call Masked MT-DNN (or
MMT-DNN). MMT-DNN significantly outperforms other
state-of-the-art transformer models for the FAQ answering
task. Further, we propose an improved knowledge distillation
component to achieve $\sim$2.4x reduction in model-size and
$\sim$7x reduction in runtime while maintaining similar
accuracy. On a small benchmark dataset from SemEval 2017 CQA
Task 3, we show that our approach provides an NDCG@1 of
83.1. On another large dataset of $\sim$281K instances
corresponding to $\sim$30K queries from diverse
domains, our distilled 174 MB model provides an NDCG@1 of
75.08 with a CPU runtime of mere 31 ms establishing a new
state-of-the-art for FAQ answering.",
isbn = "978-3-030-47426-3"
}
@incollection{ba-2014-do-deep,
title = "Do Deep Nets Really Need to be Deep?",
author = "Ba, Jimmy and Caruana, Rich",
booktitle = "Advances in Neural Information Processing Systems 27",
editor = "Z. Ghahramani and M. Welling and C. Cortes and N. D. Lawrence
and K. Q. Weinberger",
pages = "2654-2662",
year = 2014,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/5484-do-deep-nets-really-need-to-be-deep.pdf"
}
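
The compression trick in ba-2014-do-deep is to train the shallow student to regress the teacher's pre-softmax logits with an L2 loss rather than fitting 0/1 labels; a few lines of numpy make the objective concrete.

import numpy as np

def logit_regression_loss(student_logits, teacher_logits):
    # L2 regression on unnormalized logits, averaged over the batch
    diff = student_logits - teacher_logits
    return 0.5 * np.mean(np.sum(diff ** 2, axis=1))

student = np.array([[1.0, -0.5], [0.2, 0.3]])
teacher = np.array([[2.0, -1.0], [0.0, 0.5]])
print(logit_regression_loss(student, teacher))
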
@article{mirzadeh-2019-teacher-assistant,
author = "Mirzadeh, Seyed-Iman and Farajtabar, Mehrdad and Li, Ang and
Levine, Nir and Matsukawa, Akihiro and Ghasemzadeh, Hassan",
title = "Improved Knowledge Distillation Via Teacher Assistant",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1902.03393v2",
abstract = "Despite the fact that deep neural networks are powerful
models and achieve appealing results on many tasks, they are
too large to be deployed on edge devices like smartphones or
embedded sensor nodes. There have been efforts to compress
these networks, and a popular method is knowledge
distillation, where a large (teacher) pre-trained network is
used to train a smaller (student) network. However, in this
paper, we show that the student network performance degrades
when the gap between student and teacher is large. Given a
fixed student network, one cannot employ an arbitrarily large
teacher, or in other words, a teacher can effectively
transfer its knowledge to students up to a certain size, not
smaller. To alleviate this shortcoming, we introduce
multi-step knowledge distillation, which employs an
intermediate-sized network (teacher assistant) to bridge the
gap between the student and the teacher. Moreover, we study
the effect of teacher assistant size and extend the framework
to multi-step distillation. Theoretical analysis and
extensive experiments on CIFAR-10,100 and ImageNet datasets
and on CNN and ResNet architectures substantiate the
effectiveness of our proposed approach.",
archivePrefix= "arXiv",
eprint = "1902.03393",
primaryClass = "cs.LG"
}
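
Each hop of the teacher-to-assistant-to-student chain in mirzadeh-2019 uses ordinary temperature-softened distillation; below is a numpy sketch of that per-hop loss. The T^2 scaling follows the usual Hinton et al. convention, not anything specific to this paper.

import numpy as np

def softmax(z, T=1.0):
    z = z / T
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def kd_loss(student_logits, teacher_logits, T=4.0):
    p_teacher = softmax(teacher_logits, T)
    log_p_student = np.log(softmax(student_logits, T))
    # cross-entropy against the softened teacher targets
    return -(T * T) * np.mean((p_teacher * log_p_student).sum(axis=1))

s = np.array([[2.0, 0.5, -1.0]])
t = np.array([[3.0, 0.0, -2.0]])
print(kd_loss(s, t))
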
@article{schulz-2002-fast-string-correction,
title = "Fast string correction with Levenshtein automata",
author = "Schulz, Klaus U and Mihov, Stoyan",
journal = "International Journal on Document Analysis and Recognition",
volume = 5,
number = 1,
pages = "67-85",
year = 2002,
publisher = "Springer"
}
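
Not the automaton construction of schulz-2002 itself, but the membership test it compiles away: a plain dynamic program deciding whether two strings are within edit distance k. The automaton's payoff is answering this question for every dictionary word without running a per-pair DP.

def within_edit_distance(a: str, b: str, k: int) -> bool:
    if abs(len(a) - len(b)) > k:
        return False
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,               # deletion
                           cur[j - 1] + 1,            # insertion
                           prev[j - 1] + (ca != cb))) # substitution
        prev = cur
    return prev[-1] <= k

print(within_edit_distance("levenshtein", "levenstein", 1))  # True
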
@article{mihov-2004-fast-approx-search,
title = "Fast Approximate Search in Large Dictionaries",
author = "Mihov, Stoyan and Schulz, Klaus U.",
journal = "Computational Linguistics",
volume = 30,
number = 4,
year = 2004,
url = "https://www.aclweb.org/anthology/J04-4003",
doi = "10.1162/0891201042544938",
pages = "451-477"
}
@inproceedings{lei-2018-sru,
title = "Simple Recurrent Units for Highly Parallelizable Recurrence",
author = "Lei, Tao and Zhang, Yu and Wang, Sida I. and Dai, Hui and
Artzi, Yoav",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in
Natural Language Processing",
month = oct # "-" # nov,
year = 2018,
address = "Brussels, Belgium",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D18-1477",
doi = "10.18653/v1/D18-1477",
pages = "4470-4481",
abstract = "Common recurrent neural architectures scale poorly due to the
intrinsic difficulty in parallelizing their state
computations. In this work, we propose the Simple Recurrent
Unit (SRU), a light recurrent unit that balances model
capacity and scalability. SRU is designed to provide
expressive recurrence, enable highly parallelized
implementation, and comes with careful initialization to
facilitate training of deep models. We demonstrate the
effectiveness of SRU on multiple NLP tasks. SRU achieves
5{---}9x speed-up over cuDNN-optimized LSTM on classification
and question answering datasets, and delivers stronger
results than LSTM and convolutional models. We also obtain an
average of 0.7 BLEU improvement over the Transformer model
(Vaswani et al., 2017) on translation by incorporating SRU
into the architecture."
}
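
A simplified numpy rendering of the SRU recurrence in lei-2018 (the published cell also has per-dimension peephole terms on the previous state and a highway scaling correction, omitted here). It shows why SRU parallelizes: every matrix multiply is batched over time up front, and only cheap elementwise gating stays sequential.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sru(x, W, Wf, bf, Wr, br):
    """x: (T, d). Returns hidden states (T, d)."""
    xt = x @ W.T               # candidates for all steps at once
    f = sigmoid(x @ Wf.T + bf) # forget gates, also batched over time
    r = sigmoid(x @ Wr.T + br) # reset gates for the highway connection
    c = np.zeros(x.shape[1])
    hs = []
    for t in range(x.shape[0]):  # only this elementwise loop is sequential
        c = f[t] * c + (1.0 - f[t]) * xt[t]
        hs.append(r[t] * np.tanh(c) + (1.0 - r[t]) * x[t])
    return np.stack(hs)

T, d = 5, 4
rng = np.random.default_rng(0)
x = rng.normal(size=(T, d))
W, Wf, Wr = (rng.normal(size=(d, d)) for _ in range(3))
bf = br = np.zeros(d)
print(sru(x, W, Wf, bf, Wr, br).shape)  # (5, 4)
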
@article{xiong-2017-dcn,
author = "Xiong, Caiming and Zhong, Victor and Socher, Richard",
title = "Dcn+: Mixed Objective and Deep Residual Coattention for
Question Answering",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1711.00106v2",
abstract = "Traditional models for question answering optimize using
cross entropy loss, which encourages exact answers at the
cost of penalizing nearby or overlapping answers that are
sometimes equally accurate. We propose a mixed objective that
combines cross entropy loss with self-critical policy
learning. The objective uses rewards derived from word
overlap to solve the misalignment between evaluation metric
and optimization objective. In addition to the mixed
objective, we improve dynamic coattention networks (DCN) with
a deep residual coattention encoder that is inspired by
recent work in deep self-attention and residual networks. Our
proposals improve model performance across question types and
input lengths, especially for long questions that require
the ability to capture long-term dependencies. On the
Stanford Question Answering Dataset, our model achieves
state-of-the-art results with 75.1 \% exact match accuracy
and 83.1 \% F1, while the ensemble obtains 78.9 \% exact
match accuracy and 86.0 \% F1.",
archivePrefix= "arXiv",
eprint = "1711.00106",
primaryClass = "cs.CL"
}
@article{andreas-2015-neural-module-networks,
author = "Andreas, Jacob and Rohrbach, Marcus and Darrell, Trevor and
Klein, Dan",
title = "Neural Module Networks",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1511.02799v4",
abstract = "Visual question answering is fundamentally compositional in
nature---a question like ``where is the dog?`` shares
substructure with questions like ``what color is the dog?``
and ``where is the cat?`` This paper seeks to simultaneously
exploit the representational capacity of deep networks and
the compositional linguistic structure of questions. We
describe a procedure for constructing and learning *neural
module networks*, which compose collections of
jointly-trained neural ``modules`` into deep networks for
question answering. Our approach decomposes questions into
their linguistic substructures, and uses these structures to
dynamically instantiate modular networks (with reusable
components for recognizing dogs, classifying colors,
etc.). The resulting compound networks are jointly
trained. We evaluate our approach on two challenging datasets
for visual question answering, achieving state-of-the-art
results on both the VQA natural image dataset and a new
dataset of complex questions about abstract shapes.",
archivePrefix= "arXiv",
eprint = "1511.02799",
primaryClass = "cs.CV"
}
@article{bao-2020-unilmv2,
author = "Bao, Hangbo and Dong, Li and Wei, Furu and Wang, Wenhui and
Yang, Nan and Liu, Xiaodong and Wang, Yu and Piao, Songhao
and Gao, Jianfeng and Zhou, Ming and Hon, Hsiao-Wuen",
title = "Unilmv2: Pseudo-Masked Language Models for Unified Language
Model Pre-Training",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2002.12804v1",
abstract = "We propose to pre-train a unified language model for both
autoencoding and partially autoregressive language modeling
tasks using a novel training procedure, referred to as a
pseudo-masked language model (PMLM). Given an input text with
masked tokens, we rely on conventional masks to learn
inter-relations between corrupted tokens and context via
autoencoding, and pseudo masks to learn intra-relations
between masked spans via partially autoregressive
modeling. With well-designed position embeddings and
self-attention masks, the context encodings are reused to
avoid redundant computation. Moreover, conventional masks
used for autoencoding provide global masking information, so
that all the position embeddings are accessible in partially
autoregressive language modeling. In addition, the two tasks
pre-train a unified language model as a bidirectional encoder
and a sequence-to-sequence decoder, respectively. Our
experiments show that the unified language models pre-trained
using PMLM achieve new state-of-the-art results on a wide
range of natural language understanding and generation tasks
across several widely used benchmarks.",
archivePrefix= "arXiv",
eprint = "2002.12804",
primaryClass = "cs.CL"
}
@article{humeau-2019-poly-encoders,
author = "Humeau, Samuel and Shuster, Kurt and Lachaux, Marie-Anne and
Weston, Jason",
title = "Poly-Encoders: Transformer Architectures and Pre-Training
Strategies for Fast and Accurate Multi-Sentence Scoring",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1905.01969v4",
abstract = "The use of deep pre-trained bidirectional transformers has
led to remarkable progress in a number of applications
(Devlin et al., 2018). For tasks that make pairwise
comparisons between sequences, matching a given input with a
corresponding label, two approaches are common:
Cross-encoders performing full self-attention over the pair
and Bi-encoders encoding the pair separately. The former
often performs better, but is too slow for practical use. In
this work, we develop a new transformer architecture, the
Poly-encoder, that learns global rather than token level
self-attention features. We perform a detailed comparison of
all three approaches, including what pre-training and
fine-tuning strategies work best. We show our models achieve
state-of-the-art results on three existing tasks; that
Poly-encoders are faster than Cross-encoders and more
accurate than Bi-encoders; and that the best results are
obtained by pre-training on large datasets similar to the
downstream tasks.",
archivePrefix= "arXiv",
eprint = "1905.01969",
primaryClass = "cs.CL"
}
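
The poly-encoder scoring step of humeau-2019 in numpy: m learned codes attend over the context's token vectors to give m global features, the candidate vector attends over those features, and the score is a dot product. Candidate vectors never see the context, so they can be precomputed as in a bi-encoder while keeping some late interaction. Shapes and random inputs are illustrative.

import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def poly_score(ctx_tokens, codes, cand_vec):
    """ctx_tokens: (T, d), codes: (m, d), cand_vec: (d,)."""
    att = softmax(codes @ ctx_tokens.T)     # (m, T) attention per code
    global_feats = att @ ctx_tokens         # (m, d) global context features
    w = softmax(cand_vec @ global_feats.T)  # (m,) candidate attends to codes
    ctx_vec = w @ global_feats              # (d,) final context vector
    return float(ctx_vec @ cand_vec)

rng = np.random.default_rng(0)
print(poly_score(rng.normal(size=(7, 16)),
                 rng.normal(size=(4, 16)),
                 rng.normal(size=16)))
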
@article{wang-2020-multi-level,
author = "Wang, Shuohang and Lan, Yunshi and Tay, Yi and Jiang, Jing
and Liu, Jingjing",
title = "Multi-Level Head-Wise Match and Aggregation in Transformer
for Textual Sequence Matching",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2001.07234v1",
abstract = "Transformer has been successfully applied to many natural
language processing tasks. However, for textual sequence
matching, simple matching between the representation of a
pair of sequences might bring in unnecessary noise. In this
paper, we propose a new approach to sequence pair matching
with Transformer, by learning head-wise matching
representations on multiple levels. Experiments show that our
proposed approach can achieve new state-of-the-art
performance on multiple tasks that rely only on pre-computed
sequence-vector-representation, such as SNLI, MNLI-match,
MNLI-mismatch, QQP, and SQuAD-binary.",
archivePrefix= "arXiv",
eprint = "2001.07234",
primaryClass = "cs.CL"
}
@article{raffel-2015-feed-forwar,
author = "Raffel, Colin and Ellis, Daniel P. W.",
title = "Feed-Forward Networks With Attention Can Solve Some Long-Term
Memory Problems",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1512.08756v5",
abstract = "We propose a simplified model of attention which is
applicable to feed-forward neural networks and demonstrate
that the resulting model can solve the synthetic ``addition``
and ``multiplication`` long-term memory problems for sequence
lengths which are both longer and more widely varying than
the best published results for these tasks.",
archivePrefix= "arXiv",
eprint = "1512.08756",
primaryClass = "cs.LG"
}
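
The simplification proposed in raffel-2015 fits in a few lines: score each state independently, softmax over time, take the weighted mean. The tanh scorer below is one concrete choice for the learnable function a(.).

import numpy as np

def feed_forward_attention(h, w, b=0.0):
    """h: (T, d) states; w: (d,) scoring weights. Returns context (d,)."""
    e = np.tanh(h @ w + b)               # e_t = a(h_t), scored in isolation
    alpha = np.exp(e) / np.exp(e).sum()  # softmax over time
    return alpha @ h                     # c = sum_t alpha_t h_t

rng = np.random.default_rng(0)
h = rng.normal(size=(6, 8))
print(feed_forward_attention(h, rng.normal(size=8)).shape)  # (8,)
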
@article{singh-2016-black-box,
author = "Singh, Sameer and Ribeiro, Marco Tulio and Guestrin, Carlos",
title = "Programs As Black-Box Explanations",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1611.07579v1",
abstract = "Recent work in model-agnostic explanations of black-box
machine learning has demonstrated that interpretability of
complex models does not have to come at the cost of accuracy
or model flexibility. However, it is not clear what kind of
explanations, such as linear models, decision trees, and rule
lists, are the appropriate family to consider, and different
tasks and models may benefit from different kinds of
explanations. Instead of picking a single family of
representations, in this work we propose to use ``programs``
as model-agnostic explanations. We show that small programs
can be expressive yet intuitive as explanations, and
generalize over a number of existing interpretable families.
We propose a prototype program induction method based on
simulated annealing that approximates the local behavior of
black-box classifiers around a specific prediction using
random perturbations. Finally, we present preliminary
application on small datasets and show that the generated
explanations are intuitive and accurate for a number of
classifiers.",
archivePrefix= "arXiv",
eprint = "1611.07579",
primaryClass = "stat.ML"
}
@article{ribeiro-2016-nothing-else,
author = "Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos",
title = "Nothing Else Matters: Model-Agnostic Explanations By
Identifying Prediction Invariance",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1611.05817v1",
abstract = "At the core of interpretable machine learning is the question
of whether humans are able to make accurate predictions about
a model's behavior. Assumed in this question are three
properties of the interpretable output: coverage, precision,
and effort. Coverage refers to how often humans think they
can predict the model's behavior, precision to how accurate
humans are in those predictions, and effort is either the
up-front effort required in interpreting the model, or the
effort required to make predictions about a model's behavior.
In this work, we propose anchor-LIME (aLIME), a
model-agnostic technique that produces high-precision
rule-based explanations for which the coverage boundaries are
very clear. We compare aLIME to linear LIME with simulated
experiments, and demonstrate the flexibility of aLIME with
qualitative examples from a variety of domains and tasks.",
archivePrefix= "arXiv",
eprint = "1611.05817",
primaryClass = "stat.ML"
}
@article{ribeiro-2016-model-agnostic,
author = "Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos",
title = "Model-Agnostic Interpretability of Machine Learning",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1606.05386v1",
abstract = "Understanding why machine learning models behave the way they
do empowers both system designers and end-users in many ways:
in model selection, feature engineering, in order to trust
and act upon the predictions, and in more intuitive user
interfaces. Thus, interpretability has become a vital concern
in machine learning, and work in the area of interpretable
models has found renewed interest. In some applications, such
models are as accurate as non-interpretable ones, and thus
are preferred for their transparency. Even when they are not
accurate, they may still be preferred when interpretability
is of paramount importance. However, restricting machine
learning to interpretable models is often a severe
limitation. In this paper we argue for explaining machine
learning predictions using model-agnostic approaches. By
treating the machine learning models as black-box functions,
these approaches provide crucial flexibility in the choice of
models, explanations, and representations, improving
debugging, comparison, and interfaces for a variety of users
and models. We also outline the main challenges for such
methods, and review a recently-introduced model-agnostic
explanation approach (LIME) that addresses these challenges.",
archivePrefix= "arXiv",
eprint = "1606.05386",
primaryClass = "stat.ML"
}
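
A schematic of the LIME procedure reviewed in ribeiro-2016-model-agnostic, under toy assumptions (a synthetic black box, Gaussian perturbations, an RBF proximity kernel): sample around the instance, query the model, weight by proximity, and read the explanation off a weighted linear surrogate.

import numpy as np
from sklearn.linear_model import Ridge

def black_box(X):  # stand-in for the model being explained
    return 1.0 / (1.0 + np.exp(-(2 * X[:, 0] - 3 * X[:, 1])))

def explain(x, n_samples=500, width=0.75):
    rng = np.random.default_rng(0)
    X = x + rng.normal(scale=0.5, size=(n_samples, x.size))
    y = black_box(X)
    d2 = ((X - x) ** 2).sum(axis=1)
    weights = np.exp(-d2 / width ** 2)  # proximity kernel around x
    surrogate = Ridge(alpha=1.0).fit(X, y, sample_weight=weights)
    return surrogate.coef_              # local feature importances

print(explain(np.array([0.2, -0.1, 0.7])))
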
@article{alvarez-melis-2018-robustness,
author = "Alvarez-Melis, David and Jaakkola, Tommi S.",
title = "On the Robustness of Interpretability Methods",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1806.08049v1",
abstract = "We argue that robustness of explanations---i.e., that similar
inputs should give rise to similar explanations---is a key
desideratum for interpretability. We introduce metrics to
quantify robustness and demonstrate that current methods do
not perform well according to these metrics. Finally, we
propose ways that robustness can be enforced on existing
interpretability approaches.",
archivePrefix= "arXiv",
eprint = "1806.08049",
primaryClass = "cs.LG"
}
@inproceedings{ribeiro-2018-anchors,
title = "Anchors: High-Precision Model-Agnostic Explanations",
author = "Marco Tulio Ribeiro and Sameer Singh and Carlos Guestrin",
booktitle = "AAAI",
year = 2018
}
@article{wu-2017-beyond-sparsity,
author = "Wu, Mike and Hughes, Michael C. and Parbhoo, Sonali and
Zazzi, Maurizio and Roth, Volker and Doshi-Velez, Finale",
title = "Beyond Sparsity: Tree Regularization of Deep Models for
Interpretability",
journal = "CoRR",
year = 2017,
url = "http://arxiv.org/abs/1711.06178v1",
abstract = "The lack of interpretability remains a key barrier to the
adoption of deep models in many applications. In this work,
we explicitly regularize deep models so human users might
step through the process behind their predictions in little
time. Specifically, we train deep time-series models so their
class-probability predictions have high accuracy while being
closely modeled by decision trees with few nodes. Using
intuitive toy examples as well as medical tasks for treating
sepsis and HIV, we demonstrate that this new tree
regularization yields models that are easier for humans to
simulate than simpler L1 or L2 penalties without sacrificing
predictive power.",
archivePrefix= "arXiv",
eprint = "1711.06178",
primaryClass = "stat.ML"
}
@article{zhou-2015-cam,
author = "Zhou, Bolei and Khosla, Aditya and Lapedriza, Agata and
Oliva, Aude and Torralba, Antonio",
title = "Learning Deep Features for Discriminative Localization",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1512.04150v1",
abstract = "In this work, we revisit the global average pooling layer
proposed in [13], and shed light on how it explicitly enables
the convolutional neural network to have remarkable
localization ability despite being trained on image-level
labels. While this technique was previously proposed as a
means for regularizing training, we find that it actually
builds a generic localizable deep representation that can be
applied to a variety of tasks. Despite the apparent
simplicity of global average pooling, we are able to achieve
37.1 \% top-5 error for object localization on ILSVRC 2014,
which is remarkably close to the 34.2 \% top-5 error achieved
by a fully supervised CNN approach. We demonstrate that our
network is able to localize the discriminative image regions
on a variety of tasks despite not being trained for them",
archivePrefix= "arXiv",
eprint = "1512.04150",
primaryClass = "cs.CV"
}
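
The CAM computation of zhou-2015 reduces to one line once the insight is stated: with global average pooling the class score is linear in the pooled feature maps, so the same class weights re-weight the unpooled maps into a localization heatmap.

import numpy as np

def class_activation_map(feature_maps, class_weights):
    """feature_maps: (K, H, W); class_weights: (K,) for one class."""
    return np.tensordot(class_weights, feature_maps, axes=1)  # (H, W)

rng = np.random.default_rng(0)
fmaps = rng.random((32, 7, 7))  # last conv layer activations
w_c = rng.normal(size=32)       # classifier weights of the target class
print(class_activation_map(fmaps, w_c).shape)  # (7, 7)
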
@article{selvaraju-2016-grad-cam,
author = "Selvaraju, Ramprasaath R. and Cogswell, Michael and Das,
Abhishek and Vedantam, Ramakrishna and Parikh, Devi and
Batra, Dhruv",
title = "Grad-Cam: Visual Explanations From Deep Networks Via
Gradient-Based Localization",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1610.02391v4",
abstract = "We propose a technique for producing ``visual explanations``
for decisions from a large class of CNN-based models, making
them more transparent. Our approach - Gradient-weighted Class
Activation Mapping (Grad-CAM), uses the gradients of any
target concept, flowing into the final convolutional layer to
produce a coarse localization map highlighting important
regions in the image for predicting the concept. Grad-CAM is
applicable to a wide variety of CNN model-families: (1) CNNs
with fully-connected layers, (2) CNNs used for structured
outputs, (3) CNNs used in tasks with multimodal inputs or
reinforcement learning, without any architectural changes or
re-training. We combine Grad-CAM with fine-grained
visualizations to create a high-resolution
class-discriminative visualization and apply it to
off-the-shelf image classification, captioning, and visual
question answering (VQA) models, including ResNet-based
architectures. In the context of image classification models,
our visualizations (a) lend insights into their failure
modes, (b) are robust to adversarial images, (c) outperform
previous methods on localization, (d) are more faithful to
the underlying model and (e) help achieve generalization by
identifying dataset bias. For captioning and VQA, we show
that even non-attention based models can localize inputs. We
devise a way to identify important neurons through Grad-CAM
and combine it with neuron names to provide textual
explanations for model decisions. Finally, we design and
conduct human studies to measure if Grad-CAM helps users
establish appropriate trust in predictions from models and
show that Grad-CAM helps untrained users successfully discern
a `stronger` model from a `weaker` one even when both make
identical predictions. Our code is available at
https://github.com/ramprs/grad-cam/, along with a demo at
http://gradcam.cloudcv.org, and a video at
youtu.be/COjUB9Izk6E.",
archivePrefix= "arXiv",
eprint = "1610.02391",
primaryClass = "cs.CV"
}
@article{baehrens-2010-parzen,
author = {Baehrens, David and Schroeter, Timon and Harmeling, Stefan
and Kawanabe, Motoaki and Hansen, Katja and M\"{u}ller,
Klaus-Robert},
title = "How to Explain Individual Classification Decisions",
year = 2010,
issue_date = "3/1/2010",
publisher = "JMLR.org",
volume = 11,
issn = "1532-4435",
journal = "J. Mach. Learn. Res.",
month = aug,
pages = "1803–1831",
numpages = 29
}
@article{mcinnes-2017-hdbscan,
title = "hdbscan: Hierarchical density based clustering",
author = "McInnes, Leland and Healy, John and Astels, Steve",
journal = "Journal of Open Source Software",
volume = 2,
number = 11,
pages = 205,
year = 2017
}
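
Typical usage of the hdbscan package described in mcinnes-2017, on synthetic data; points the algorithm treats as noise get the label -1, and min_cluster_size is the main knob.

import numpy as np
import hdbscan

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.3, (50, 2)),
               rng.normal(5, 0.3, (50, 2)),
               rng.uniform(-2, 7, (10, 2))])  # two blobs plus scatter

clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
labels = clusterer.fit_predict(X)
print(sorted(set(labels)))  # e.g. [-1, 0, 1]
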
@article{osborne-2013-data-clearning,
title = "Is data cleaning and the testing of assumptions relevant in
the 21st century?",
author = "Osborne, Jason W",
journal = "Frontiers in Psychology",
volume = 4,
pages = 370,
year = 2013,
publisher = "Frontiers"
}
@inproceedings{fu-2019-graph-rel,
author = "Fu, Tsu-Jui and Li, Peng-Hsuan and Ma, Wei-Yun",
title = "{G}raph{R}el: Modeling Text as Relational Graphs for Joint
Entity and Relation Extraction",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
year = 2019,
pages = "1409-1418",
doi = "10.18653/v1/P19-1136",
url = "https://doi.org/10.18653/v1/P19-1136",
abstract = "In this paper, we present GraphRel, an end-to-end relation
extraction model which uses graph convolutional networks
(GCNs) to jointly learn named entities and relations. In
contrast to previous baselines, we consider the interaction
between named entities and relations via a 2nd-phase
relation-weighted GCN to better extract relations. Linear and
dependency structures are both used to extract both
sequential and regional features of the text, and a complete
word graph is further utilized to extract implicit features
among all word pairs of the text. With the graph-based
approach, the prediction for overlapping relations is
substantially improved over previous sequential
approaches. We evaluate GraphRel on two public datasets: NYT
and WebNLG. Results show that GraphRel maintains high
precision while increasing recall substantially. Also,
GraphRel outperforms previous work by 3.2{\%} and 5.8{\%} (F1
score), achieving a new state-of-the-art for relation
extraction.",
address = "Florence, Italy",
month = jul,
publisher = "Association for Computational Linguistics"
}
@article{xie-2019-uda,
author = "Xie, Qizhe and Dai, Zihang and Hovy, Eduard and Luong,
Minh-Thang and Le, Quoc V.",
title = "Unsupervised Data Augmentation for Consistency Training",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1904.12848v5",
abstract = "Semi-supervised learning lately has shown much promise in
improving deep learning models when labeled data is
scarce. Common among recent approaches is the use of
consistency training on a large amount of unlabeled data to
constrain model predictions to be invariant to input
noise. In this work, we present a new perspective on how to
effectively noise unlabeled examples and argue that the
quality of noising, specifically those produced by advanced
data augmentation methods, plays a crucial role in
semi-supervised learning. By substituting simple noising
operations with advanced data augmentation methods such as
RandAugment and back-translation, our method brings
substantial improvements across six language and three vision
tasks under the same consistency training framework. On the
IMDb text classification dataset, with only 20 labeled
examples, our method achieves an error rate of 4.20,
outperforming the state-of-the-art model trained on 25,000
labeled examples. On a standard semi-supervised learning
benchmark, CIFAR-10, our method outperforms all previous
approaches and achieves an error rate of 5.43 with only 250
examples. Our method also combines well with transfer
learning, e.g., when finetuning from BERT, and yields
improvements in high-data regime, such as ImageNet, whether
when there is only 10 \% labeled data or when a full labeled
set with 1.3M extra unlabeled examples is used. Code is
available at https://github.com/google-research/uda.",
archivePrefix= "arXiv",
eprint = "1904.12848",
primaryClass = "cs.LG"
}
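
The consistency objective of xie-2019, sketched in numpy on already-computed class probabilities: supervised cross-entropy on labeled data plus a KL term pulling the prediction on an augmented unlabeled example toward the (fixed, non-differentiated) prediction on the original. Training details such as confidence masking and scheduling are omitted.

import numpy as np

def cross_entropy(p_true, p_pred, eps=1e-12):
    return -np.mean((p_true * np.log(p_pred + eps)).sum(axis=1))

def kl(p, q, eps=1e-12):
    return np.mean((p * (np.log(p + eps) - np.log(q + eps))).sum(axis=1))

def uda_loss(p_labeled, y_onehot, p_unlabeled, p_augmented, lam=1.0):
    supervised = cross_entropy(y_onehot, p_labeled)
    consistency = kl(p_unlabeled, p_augmented)  # p_unlabeled is the target
    return supervised + lam * consistency

y = np.array([[1.0, 0.0]])
print(uda_loss(np.array([[0.8, 0.2]]), y,
               np.array([[0.6, 0.4]]), np.array([[0.5, 0.5]])))
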
@article{cubuk-2019-randaugment,
author = "Cubuk, Ekin D. and Zoph, Barret and Shlens, Jonathon and Le,
Quoc V.",
title = "Randaugment: Practical Automated Data Augmentation With a
Reduced Search Space",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1909.13719v2",
abstract = "Recent work has shown that data augmentation has the
potential to significantly improve the generalization of deep
learning models. Recently, automated augmentation strategies
have led to state-of-the-art results in image classification
and object detection. While these strategies were optimized
for improving validation accuracy, they also led to
state-of-the-art results in semi-supervised learning and
improved robustness to common corruptions of images. An
obstacle to a large-scale adoption of these methods is a
separate search phase which increases the training complexity
and may substantially increase the computational
cost. Additionally, due to the separate search phase, these
approaches are unable to adjust the regularization strength
based on model or dataset size. Automated augmentation
policies are often found by training small models on small
datasets and subsequently applied to train larger models. In
this work, we remove both of these obstacles. RandAugment has
a significantly reduced search space which allows it to be
trained on the target task with no need for a separate proxy
task. Furthermore, due to the parameterization, the
regularization strength may be tailored to different model
and dataset sizes. RandAugment can be used uniformly across
different tasks and datasets and works out of the box,
matching or surpassing all previous automated augmentation
approaches on CIFAR-10/100, SVHN, and ImageNet. On the
ImageNet dataset we achieve 85.0 \% accuracy, a 0.6 \%
increase over the previous state-of-the-art and 1.0 \%
increase over baseline augmentation. On object detection,
RandAugment leads to 1.0-1.3 \% improvement over baseline
augmentation, and is within 0.3 \% mAP of AutoAugment on
COCO. Finally, due to its interpretable hyperparameter,
RandAugment may be used to investigate the role of data
augmentation with varying model and dataset size. Code is
available online.",
archivePrefix= "arXiv",
eprint = "1909.13719",
primaryClass = "cs.CV"
}
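
RandAugment's entire search space is two integers, N and M: draw N transformations uniformly from a fixed list and apply each at global magnitude M. A sketch with placeholder ops standing in for the image transformations (rotations, shears, color jitter, ...).

import random

OPS = {
    "identity": lambda x, m: x,
    "add":      lambda x, m: x + m,             # placeholder transforms
    "scale":    lambda x, m: x * (1 + m / 10),
}

def rand_augment(x, n=2, m=3, rng=None):
    rng = rng or random.Random(0)
    for op_name in rng.choices(list(OPS), k=n):  # N ops drawn uniformly
        x = OPS[op_name](x, m)                   # each at magnitude M
    return x

print(rand_augment(10.0))
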
@article{pan-2020-adversarial-validation,
author = "Pan, Jing and Pham, Vincent and Dorairaj, Mohan and Chen,
Huigang and Lee, Jeong-Yoon",
title = "Adversarial Validation Approach To Concept Drift Problem in
User Targeting Automation Systems At Uber",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2004.03045v2",
abstract = "In user targeting automation systems, concept drift in input
data is one of the main challenges. It deteriorates model
performance on new data over time. Previous research on
concept drift mostly proposed model retraining after
observing performance decreases. However, this approach is
suboptimal because the system fixes the problem only after
suffering from poor performance on new data. Here, we
introduce an adversarial validation approach to concept drift
problems in user targeting automation systems. With our
approach, the system detects concept drift in new data before
making inference, trains a model, and produces predictions
adapted to the new data. We show that our approach addresses
concept drift effectively with the AutoML3 Lifelong Machine
Learning challenge data as well as in Uber's internal user
targeting automation system, MaLTA.",
archivePrefix= "arXiv",
eprint = "2004.03045",
primaryClass = "cs.LG"
}
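
The adversarial-validation test of pan-2020 in miniature with scikit-learn: label old data 0 and new data 1, train a discriminator, and read the cross-validated AUC. Near 0.5 means the two samples are indistinguishable; well above 0.5 flags drift before any inference is served. Data here is synthetic.

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
old = rng.normal(0.0, 1.0, (300, 5))
new = rng.normal(0.3, 1.0, (300, 5))  # slightly shifted distribution

X = np.vstack([old, new])
y = np.r_[np.zeros(len(old)), np.ones(len(new))]
auc = cross_val_score(GradientBoostingClassifier(), X, y,
                      cv=5, scoring="roc_auc").mean()
print(f"adversarial AUC = {auc:.2f}")  # well above 0.5 signals drift
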
@article{lin-2019-unknown-detection,
author = "Lin, Ting-En and Xu, Hua",
title = "Deep Unknown Intent Detection With Margin Loss",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1906.00434v1",
abstract = "Identifying the unknown (novel) user intents that have never
appeared in the training set is a challenging task in the
dialogue system. In this paper, we present a two-stage method
for detecting unknown intents. We use bidirectional long
short-term memory (BiLSTM) network with the margin loss as
the feature extractor. With margin loss, we can learn
discriminative deep features by forcing the network to
maximize inter-class variance and to minimize intra-class
variance. Then, we feed the feature vectors to the
density-based novelty detection algorithm, local outlier
factor (LOF), to detect unknown intents. Experiments on two
benchmark datasets show that our method can yield consistent
improvements compared with the baseline methods.",
archivePrefix= "arXiv",
eprint = "1906.00434",
primaryClass = "cs.CL"
}
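
The second, novelty-detection stage of lin-2019 with scikit-learn's LocalOutlierFactor in novelty mode; random vectors stand in for the margin-loss BiLSTM features the paper extracts.

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(0)
known_feats = rng.normal(0.0, 1.0, (200, 16))  # features of known intents

lof = LocalOutlierFactor(n_neighbors=20, novelty=True).fit(known_feats)

query = rng.normal(5.0, 1.0, (1, 16))  # far from the training density
print(lof.predict(query))              # [-1] -> flagged as unknown intent
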
@article{tompson-2014-spatial-dropout,
author = "Tompson, Jonathan and Goroshin, Ross and Jain, Arjun and
LeCun, Yann and Bregler, Christopher",
title = "Efficient Object Localization Using Convolutional Networks",
journal = "CoRR",
year = 2014,
url = "http://arxiv.org/abs/1411.4280v3",
abstract = "Recent state-of-the-art performance on human-body pose
estimation has been achieved with Deep Convolutional Networks
(ConvNets). Traditional ConvNet architectures include pooling
and sub-sampling layers which reduce computational
requirements, introduce invariance and prevent over-training.
These benefits of pooling come at the cost of reduced
localization accuracy. We introduce a novel architecture
which includes an efficient `position refinement' model that
is trained to estimate the joint offset location within a
small region of the image. This refinement model is jointly
trained in cascade with a state-of-the-art ConvNet model to
achieve improved accuracy in human joint location
estimation. We show that the variance of our detector
approaches the variance of human annotations on the FLIC
dataset and outperforms all existing approaches on the
MPII-human-pose dataset.",
archivePrefix= "arXiv",
eprint = "1411.4280",
primaryClass = "cs.CV"
}
@article{yang-2018-rethinking-structure,
author = "Yang, Yao-Yuan and Lin, Yi-An and Chu, Hong-Min and Lin,
Hsuan-Tien",
title = "Deep Learning With a Rethinking Structure for Multi-Label
Classification",
journal = "CoRR",
year = 2018,
url = "http://arxiv.org/abs/1802.01697v2",
abstract = "Multi-label classification (MLC) is an important class of
machine learning problems that come with a wide spectrum of
applications, each demanding a possibly different evaluation
criterion. When solving the MLC problems, we generally expect
the learning algorithm to take the hidden correlation of the
labels into account to improve the prediction
performance. Extracting the hidden correlation is generally a
challenging task. In this work, we propose a novel deep
learning framework to better extract the hidden correlation
with the help of the memory structure within recurrent neural
networks. The memory stores the temporary guesses on the
labels and effectively allows the framework to rethink about
the goodness and correlation of the guesses before making the
final prediction. Furthermore, the rethinking process makes
it easy to adapt to different evaluation criteria to match
real-world application needs. In particular, the framework
can be trained in an end-to-end style with respect to any
given MLC evaluation criteria. The end-to-end design can be
seamlessly combined with other deep learning techniques to
conquer challenging MLC problems like image
tagging. Experimental results across many real-world data
sets justify that the rethinking framework indeed improves
MLC performance across different evaluation criteria and
leads to superior performance over state-of-the-art MLC
algorithms.",
archivePrefix= "arXiv",
eprint = "1802.01697",
primaryClass = "cs.LG"
}
@inproceedings{yang-2019-seq2set,
author = "Yang, Pengcheng and Luo, Fuli and Ma, Shuming and Lin,
Junyang and Sun, Xu",
title = "A Deep Reinforced Sequence-to-Set Model for Multi-Label
Classification",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for
Computational Linguistics",
year = 2019,
pages = "5252-5258",
doi = "10.18653/v1/P19-1518",
url = "https://doi.org/10.18653/v1/P19-1518",
abstract = "Multi-label classification (MLC) aims to predict a set of
labels for a given instance. Based on a pre-defined label
order, the sequence-to-sequence (Seq2Seq) model trained via
maximum likelihood estimation method has been successfully
applied to the MLC task and shows powerful ability to capture
high-order correlations between labels. However, the output
labels are essentially an unordered set rather than an
ordered sequence. This inconsistency tends to result in some
intractable problems, e.g., sensitivity to the label
order. To remedy this, we propose a simple but effective
sequence-to-set model. The proposed model is trained via
reinforcement learning, where reward feedback is designed to
be independent of the label order. In this way, we can reduce
the dependence of the model on the label order, as well as
capture high-order correlations between labels. Extensive
experiments show that our approach can substantially
outperform competitive baselines, as well as effectively
reduce the sensitivity to the label order.",
address = "Florence, Italy",
month = jul,
publisher = "Association for Computational Linguistics"
}
@ARTICLE{zhu-2018-label-correlation,
author = "Y. {Zhu} and J. T. {Kwok} and Z. {Zhou}",
journal = "IEEE Transactions on Knowledge and Data Engineering",
title = "Multi-Label Learning with Global and Local Label Correlation",
year = 2018,
volume = 30,
number = 6,
pages = "1081-1094"
}
@article{garg-2015-exploring-correlation,
author = "Garg, Amit and Noyola, Jonathan and Verma, Romil and Saxena,
Ashutosh and Jami, Aditya",
title = "Exploring Correlation Between Labels To Improve Multi-Label
Classification",
journal = "CoRR",
year = 2015,
url = "http://arxiv.org/abs/1511.07953v1",
abstract = "This paper attempts multi-label classification by extending
the idea of independent binary classification models for each
output label, and exploring how the inherent correlation
between output labels can be used to improve
predictions. Logistic Regression, Naive Bayes, Random Forest,
and SVM models were constructed, with SVM giving the best
results: an improvement of 12.9\% over binary models was
achieved for hold out cross validation by augmenting with
pairwise correlation probabilities of the labels.",
archivePrefix= "arXiv",
eprint = "1511.07953",
primaryClass = "cs.LG"
}
@inproceedings{huang-2012-multi-label,
author = "Huang, Sheng-Jun and Zhou, Zhi-Hua",
title = "Multi-Label Learning by Exploiting Label Correlations
Locally",
year = 2012,
publisher = "AAAI Press",
abstract = "It is well known that exploiting label correlations is
important for multi-label learning. Existing approaches
typically exploit label correlations globally, by assuming
that the label correlations are shared by all the
instances. In real-world tasks, however, different instances
may share different label correlations, and few correlations
are globally applicable. In this paper, we propose the ML-LOC
approach which allows label correlations to be exploited
locally. To encode the local influence of label correlations,
we derive a LOC code to enhance the feature representation of
each instance. The global discrimination fitting and local
correlation sensitivity are incorporated into a unified
framework, and an alternating solution is developed for the
optimization. Experimental results on a number of image, text
and gene data sets validate the effectiveness of our
approach.",
booktitle = "Proceedings of the Twenty-Sixth AAAI Conference on Artificial
Intelligence",
pages = "949–955",
numpages = 7,
location = "Toronto, Ontario, Canada",
series = "AAAI'12"
}
@inproceedings{li-2014-condensed-filter-tree,
author = "Chun-Liang Li and Hsuan-Tien Lin",
year = 2014,
month = 01,
pages = "663-673",
title = "Condensed filter tree for cost-sensitive multi-label
classification",
volume = 1,
journal = "31st International Conference on Machine Learning, ICML 2014"
}
@incollection{nam-2017-maximing-subset-accuracy,
title = "Maximizing Subset Accuracy with Recurrent Neural Networks in
Multi-label Classification",
author = {Nam, Jinseok and Loza Menc\'{\i}a, Eneldo and Kim, Hyunwoo J
and F\"{u}rnkranz, Johannes},
booktitle = "Advances in Neural Information Processing Systems 30",
editor = "I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett",
pages = "5413-5423",
year = 2017,
publisher = "Curran Associates, Inc.",
url =
"http://papers.nips.cc/paper/7125-maximizing-subset-accuracy-with-recurrent-neural-networks-in-multi-label-classification.pdf"
}
@article{rennie-2016-self-critical,
author = "Rennie, Steven J. and Marcheret, Etienne and Mroueh, Youssef
and Ross, Jarret and Goel, Vaibhava",
title = "Self-Critical Sequence Training for Image Captioning",
journal = "CoRR",
year = 2016,
url = "http://arxiv.org/abs/1612.00563v2",
abstract = "Recently it has been shown that policy-gradient methods for
reinforcement learning can be utilized to train deep
end-to-end systems directly on non-differentiable metrics for
the task at hand. In this paper we consider the problem of
optimizing image captioning systems using reinforcement
learning, and show that by carefully optimizing our systems
using the test metrics of the MSCOCO task, significant gains
in performance can be realized. Our systems are built using a
new optimization approach that we call self-critical sequence
training (SCST). SCST is a form of the popular REINFORCE
algorithm that, rather than estimating a ``baseline`` to
normalize the rewards and reduce variance, utilizes the
output of its own test-time inference algorithm to normalize
the rewards it experiences. Using this approach, estimating
the reward signal (as actor-critic methods must do) and
estimating normalization (as REINFORCE algorithms typically
do) is avoided, while at the same time harmonizing the model
with respect to its test-time inference
procedure. Empirically we find that directly optimizing the
CIDEr metric with SCST and greedy decoding at test-time is
highly effective. Our results on the MSCOCO evaluation server
establish a new state-of-the-art on the task, improving the
best result in terms of CIDEr from 104.9 to 114.7.",
archivePrefix= "arXiv",
eprint = "1612.00563",
primaryClass = "cs.LG"
}
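
The SCST estimator of rennie-2016, reduced to one function: the reward of the greedy decode baselines the reward of a sampled decode, so no learned critic is needed. Rewards and log-probabilities below are toy inputs; in the paper the reward is a caption metric such as CIDEr.

import numpy as np

def scst_loss(logprobs_sampled, reward_sampled, reward_greedy):
    """logprobs_sampled: (T,) log-probs of the sampled tokens."""
    advantage = reward_sampled - reward_greedy  # greedy decode as baseline
    return -advantage * np.sum(logprobs_sampled)

lp = np.log(np.array([0.4, 0.7, 0.9]))
print(scst_loss(lp, reward_sampled=0.8, reward_greedy=0.6))
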
@techreport{settles-2009-active-learning,
title = "Active learning literature survey",
author = "Settles, Burr",
year = 2009,
institution = "University of Wisconsin-Madison Department of Computer
Sciences"
}
@incollection{aggarwal-2014-active-learning,
title = "Active learning: A survey",
author = "Aggarwal, Charu C and Kong, Xiangnan and Gu, Quanquan and
Han, Jiawei and Philip, S Yu",
booktitle = "Data Classification: Algorithms and Applications",
pages = "571-605",
year = 2014,
publisher = "CRC Press"
}
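
Uncertainty sampling, the simplest query strategy covered by both of the surveys above: rank the unlabeled pool by predictive entropy and send the top-k examples to the annotator.

import numpy as np

def entropy(p, eps=1e-12):
    return -(p * np.log(p + eps)).sum(axis=1)

def select_batch(probs, k):
    """probs: (n, n_classes) model predictions on the unlabeled pool."""
    return np.argsort(-entropy(probs))[:k]

probs = np.array([[0.98, 0.02], [0.55, 0.45], [0.70, 0.30]])
print(select_batch(probs, k=1))  # [1] -> the most uncertain example
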
@article{tang-2019-distilling-bert,
author = "Tang, Raphael and Lu, Yao and Liu, Linqing and Mou, Lili and
Vechtomova, Olga and Lin, Jimmy",
title = "Distilling Task-Specific Knowledge From Bert Into Simple
Neural Networks",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1903.12136v1",
abstract = "In the natural language processing literature, neural
networks are becoming increasingly deeper and complex. The
recent poster child of this trend is the deep language
representation model, which includes BERT, ELMo, and
GPT. These developments have led to the conviction that
previous-generation, shallower neural networks for language
understanding are obsolete. In this paper, however, we
demonstrate that rudimentary, lightweight neural networks can
still be made competitive without architecture changes,
external training data, or additional input features. We
propose to distill knowledge from BERT, a state-of-the-art
language representation model, into a single-layer BiLSTM, as
well as its siamese counterpart for sentence-pair
tasks. Across multiple datasets in paraphrasing, natural
language inference, and sentiment classification, we achieve
comparable results with ELMo, while using roughly 100 times
fewer parameters and 15 times less inference time.",
archivePrefix= "arXiv",
eprint = "1903.12136",
primaryClass = "cs.CL"
}
@article{tay-2020-efficient-transformers,
author = "Tay, Yi and Dehghani, Mostafa and Bahri, Dara and Metzler,
Donald",
title = "Efficient Transformers: a Survey",
journal = "CoRR",
year = 2020,
url = "http://arxiv.org/abs/2009.06732v1",
abstract = "Transformer model architectures have garnered immense
interest lately due to their effectiveness across a range of
domains like language, vision and reinforcement learning. In
the field of natural language processing for example,
Transformers have become an indispensable staple in the
modern deep learning stack. Recently, a dizzying number of
``X-former`` models have been proposed - Reformer, Linformer,
Performer, Longformer, to name a few - which improve upon the
original Transformer architecture, many of which make
improvements around computational and memory efficiency. With
the aim of helping the avid researcher navigate this flurry,
this paper characterizes a large and thoughtful selection of
recent efficiency-flavored ``X-former`` models, providing an
organized and comprehensive overview of existing work and
models across multiple domains.",
archivePrefix= "arXiv",
eprint = "2009.06732",
primaryClass = "cs.LG"
}
@article{wei-2019-casrel,
author = "Wei, Zhepei and Su, Jianlin and Wang, Yue and Tian, Yuan and
Chang, Yi",
title = "A Novel Cascade Binary Tagging Framework for Relational
Triple Extraction",
journal = "CoRR",
year = 2019,
url = "http://arxiv.org/abs/1909.03227v4",
abstract = "Extracting relational triples from unstructured text is
crucial for large-scale knowledge graph
construction. However, few existing works excel in solving
the overlapping triple problem where multiple relational
triples in the same sentence share the same entities. In this
work, we introduce a fresh perspective to revisit the
relational triple extraction task and propose a novel cascade
binary tagging framework (CasRel) derived from a principled
problem formulation. Instead of treating relations as
discrete labels as in previous works, our new framework
models relations as functions that map subjects to objects in
a sentence, which naturally handles the overlapping
problem. Experiments show that the CasRel framework already
outperforms state-of-the-art methods even when its encoder
module uses a randomly initialized BERT encoder, showing the
power of the new tagging framework. It enjoys further
performance boost when employing a pre-trained BERT encoder,
outperforming the strongest baseline by 17.5 and 30.2
absolute gain in F1-score on two public datasets NYT and
WebNLG, respectively. In-depth analysis on different
scenarios of overlapping triples shows that the method
delivers consistent performance gain across all these
scenarios. The source code and data are released online.",
archivePrefix= "arXiv",
eprint = "1909.03227",
primaryClass = "cs.CL"
}
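
A toy rendering of the span decoding in wei-2019: binary start/end taggers mark subjects, pairing each predicted start with the nearest following end; per-relation object taggers (omitted here) are then run conditioned on each subject, which is how overlapping triples that share a subject fall out. The threshold and score arrays are illustrative.

import numpy as np

def spans_from_binary(starts, ends, thresh=0.5):
    """Pair each start above threshold with the nearest following end."""
    s_idx = np.where(starts >= thresh)[0]
    e_idx = np.where(ends >= thresh)[0]
    spans = []
    for s in s_idx:
        after = e_idx[e_idx >= s]
        if after.size:
            spans.append((int(s), int(after[0])))
    return spans

# toy tagger outputs over an 8-token sentence
subj_start = np.array([0.9, 0, 0, 0, 0.8, 0, 0, 0])
subj_end   = np.array([0, 0.9, 0, 0, 0, 0.7, 0, 0])
print(spans_from_binary(subj_start, subj_end))  # [(0, 1), (4, 5)]
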
@inproceedings{ma-2020-simple-lexicon,
title = "Simplify the Usage of Lexicon in {C}hinese {NER}",
author = "Ma, Ruotian and Peng, Minlong and Zhang, Qi and Wei, Zhongyu
and Huang, Xuanjing",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for
Computational Linguistics",
month = jul,
year = 2020,
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/2020.acl-main.528",
doi = "10.18653/v1/2020.acl-main.528",
pages = "5951-5960",
abstract = "Recently, many works have tried to augment the performance of
Chinese named entity recognition (NER) using word
lexicons. As a representative, Lattice-LSTM has achieved new
benchmark results on several public Chinese NER
datasets. However, Lattice-LSTM has a complex model
architecture. This limits its application in many industrial
areas where real-time NER responses are needed."
}