@brando90
Created April 6, 2024 01:28
kavity.bib
@inproceedings{langley00,
author = {P. Langley},
title = {Crafting Papers on Machine Learning},
year = {2000},
pages = {1207--1216},
editor = {Pat Langley},
booktitle = {Proceedings of the 17th International Conference
on Machine Learning (ICML 2000)},
address = {Stanford, CA},
publisher = {Morgan Kaufmann}
}
@TechReport{mitchell80,
author = "T. M. Mitchell",
title = "The Need for Biases in Learning Generalizations",
institution = "Computer Science Department, Rutgers University",
year = "1980",
address = "New Brunswick, MA",
}
@phdthesis{kearns89,
author = {M. J. Kearns},
title = {Computational Complexity of Machine Learning},
school = {Department of Computer Science, Harvard University},
year = {1989}
}
@Book{MachineLearningI,
editor = "R. S. Michalski and J. G. Carbonell and T.
M. Mitchell",
title = "Machine Learning: An Artificial Intelligence
Approach, Vol. I",
publisher = "Tioga",
year = "1983",
address = "Palo Alto, CA"
}
@Book{DudaHart2nd,
author = "R. O. Duda and P. E. Hart and D. G. Stork",
title = "Pattern Classification",
publisher = "John Wiley and Sons",
edition = "2nd",
year = "2000"
}
@misc{anonymous,
title= {Suppressed for Anonymity},
author= {Author, N. N.},
year= {2021}
}
@InCollection{Newell81,
author = "A. Newell and P. S. Rosenbloom",
title = "Mechanisms of Skill Acquisition and the Law of
Practice",
booktitle = "Cognitive Skills and Their Acquisition",
pages = "1--51",
publisher = "Lawrence Erlbaum Associates, Inc.",
year = "1981",
editor = "J. R. Anderson",
chapter = "1",
address = "Hillsdale, NJ"
}
@Article{Samuel59,
author = "A. L. Samuel",
title = "Some Studies in Machine Learning Using the Game of
Checkers",
journal = "IBM Journal of Research and Development",
year = "1959",
volume = "3",
number = "3",
pages = "211--229"
}
@article{wei2022emergent,
title={Emergent Abilities of Large Language Models},
author={Jason Wei and Yi Tay and Rishi Bommasani and Colin Raffel and Barret Zoph and Sebastian Borgeaud and Dani Yogatama and Maarten Bosma and Denny Zhou and Donald Metzler and Ed H. Chi and Tatsunori Hashimoto and Oriol Vinyals and Percy Liang and Jeff Dean and William Fedus},
journal={Transactions on Machine Learning Research},
year={2022},
url={https://openreview.net/forum?id=yzkSU5zdwD},
note={Survey Certification}
}
@misc{diversity,
doi = {10.48550/ARXIV.2208.01545},
url = {https://arxiv.org/abs/2208.01545},
author = {Miranda, Brando and Yu, Patrick and Wang, Yu-Xiong and Koyejo, Sanmi},
keywords = {Machine Learning (cs.LG), FOS: Computer and information sciences},
title = {The Curse of Low Task Diversity: On the Failure of Transfer Learning to Outperform MAML and Their Empirical Equivalence},
publisher = {arXiv},
year = {2022},
copyright = {arXiv.org perpetual, non-exclusive license}
}
@article{nlp_task2vec,
author = {Tu Vu and
Tong Wang and
Tsendsuren Munkhdalai and
Alessandro Sordoni and
Adam Trischler and
Andrew Mattarella{-}Micke and
Subhransu Maji and
Mohit Iyyer},
title = {Exploring and Predicting Transferability across {NLP} Tasks},
journal = {CoRR},
volume = {abs/2005.00770},
year = {2020},
url = {https://arxiv.org/abs/2005.00770},
eprinttype = {arXiv},
eprint = {2005.00770},
timestamp = {Fri, 08 May 2020 15:04:04 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2005-00770.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{data_dist,
doi = {10.48550/ARXIV.2205.05055},
url = {https://arxiv.org/abs/2205.05055},
author = {Chan, Stephanie C. Y. and Santoro, Adam and Lampinen, Andrew K. and Wang, Jane X. and Singh, Aaditya and Richemond, Pierre H. and McClelland, Jay and Hill, Felix},
keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computation and Language (cs.CL), FOS: Computer and information sciences},
title = {Data Distributional Properties Drive Emergent In-Context Learning in Transformers},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
@article{task2vec,
author = {Alessandro Achille and
Michael Lam and
Rahul Tewari and
Avinash Ravichandran and
Subhransu Maji and
Charless C. Fowlkes and
Stefano Soatto and
Pietro Perona},
title = {Task2Vec: Task Embedding for Meta-Learning},
journal = {CoRR},
volume = {abs/1902.03545},
year = {2019},
url = {http://arxiv.org/abs/1902.03545},
eprinttype = {arXiv},
eprint = {1902.03545},
timestamp = {Thu, 31 Oct 2019 16:31:22 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-1902-03545.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{c4,
author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
journal = {arXiv e-prints},
year = {2019},
archivePrefix = {arXiv},
eprint = {1910.10683},
}
@InProceedings{bookcorpus,
title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books},
author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},
booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
month = {December},
year = {2015}
}
@misc{wikitext,
title={Pointer Sentinel Mixture Models},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{term,
doi = {10.48550/ARXIV.2202.07206},
url = {https://arxiv.org/abs/2202.07206},
author = {Razeghi, Yasaman and Logan, Robert L. and Gardner, Matt and Singh, Sameer},
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences},
title = {Impact of Pretraining Term Frequencies on Few-Shot Reasoning},
publisher = {arXiv},
year = {2022},
copyright = {arXiv.org perpetual, non-exclusive license}
}
@article{gpt2,
title={Language Models are Unsupervised Multitask Learners},
author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
journal={OpenAI Blog},
year={2019}
}
@article{gpt3,
author = {Tom B. Brown and
Benjamin Mann and
Nick Ryder and
Melanie Subbiah and
Jared Kaplan and
Prafulla Dhariwal and
Arvind Neelakantan and
Pranav Shyam and
Girish Sastry and
Amanda Askell and
Sandhini Agarwal and
Ariel Herbert{-}Voss and
Gretchen Krueger and
Tom Henighan and
Rewon Child and
Aditya Ramesh and
Daniel M. Ziegler and
Jeffrey Wu and
Clemens Winter and
Christopher Hesse and
Mark Chen and
Eric Sigler and
Mateusz Litwin and
Scott Gray and
Benjamin Chess and
Jack Clark and
Christopher Berner and
Sam McCandlish and
Alec Radford and
Ilya Sutskever and
Dario Amodei},
title = {Language Models are Few-Shot Learners},
journal = {CoRR},
volume = {abs/2005.14165},
year = {2020},
url = {https://arxiv.org/abs/2005.14165},
eprinttype = {arXiv},
eprint = {2005.14165},
timestamp = {Wed, 03 Jun 2020 11:36:54 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2005-14165.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{thepile,
title={The Pile: An 800GB Dataset of Diverse Text for Language Modeling},
author={Leo Gao and Stella Biderman and Sid Black and Laurence Golding and Travis Hoppe and Charles Foster and Jason Phang and Horace He and Anish Thite and Noa Nabeshima and Shawn Presser and Connor Leahy},
year={2020},
eprint={2101.00027},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{broken,
doi = {10.48550/ARXIV.2210.14891},
url = {https://arxiv.org/abs/2210.14891},
author = {Caballero, Ethan and Gupta, Kshitij and Rish, Irina and Krueger, David},
keywords = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
title = {Broken Neural Scaling Laws},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
@InProceedings{tatsu,
title = {Model Performance Scaling with Multiple Data Sources},
author = {Hashimoto, Tatsunori},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {4107--4116},
year = {2021},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/hashimoto21a/hashimoto21a.pdf},
url = {https://proceedings.mlr.press/v139/hashimoto21a.html},
abstract = {Real-world machine learning systems are often trained using a mix of data sources with varying cost and quality. Understanding how the size and composition of a training dataset affect model performance is critical for advancing our understanding of generalization, as well as designing more effective data collection policies. We show that there is a simple scaling law that predicts the loss incurred by a model even under varying dataset composition. Our work expands recent observations of scaling laws for log-linear generalization error in the i.i.d setting and uses this to cast model performance prediction as a learning problem. Using the theory of optimal experimental design, we derive a simple rational function approximation to generalization error that can be fitted using a few model training runs. Our approach can achieve highly accurate ($r^2\approx .9$) predictions of model performance under substantial extrapolation in two different standard supervised learning tasks and is accurate ($r^2 \approx .83$) on more challenging machine translation and question answering tasks where many baselines achieve worse-than-random performance.}
}
@misc{Kirsch2022,
abstract = {Modern machine learning requires system designers to specify aspects of the learning pipeline, such as losses, architectures, and optimizers. Meta-learning, or learning-to-learn, instead aims to learn those aspects, and promises to unlock greater capabilities with less manual effort. One particularly ambitious goal of meta-learning is to train general-purpose learning algorithms from scratch, using only black box models with minimal inductive bias. Such a model takes in training data, and produces test-set predictions, without any explicit definition of an inference model, training loss, or optimization algorithm. In this paper we show that Transformers and other black-box models can be meta-trained to act as general-purpose in-context learners. We characterize phase transitions between algorithms that generalize, algorithms that memorize, and algorithms that fail to meta-train at all, induced by changes in model size, number of tasks, and meta-optimization. We further show that the capabilities of meta-trained algorithms are bottlenecked by the accessible state size (memory) determining the next prediction, unlike standard models which are thought to be bottlenecked by parameter count.},
author = {Louis Kirsch and James Harrison and Jascha Sohl-Dickstein and Luke Metz},
month = {11},
title = {General-Purpose In-Context Learning by Meta-Learning Transformers},
year = {2022},
}
@misc{demonstration,
doi = {10.48550/ARXIV.2202.12837},
url = {https://arxiv.org/abs/2202.12837},
author = {Min, Sewon and Lyu, Xinxi and Holtzman, Ari and Artetxe, Mikel and Lewis, Mike and Hajishirzi, Hannaneh and Zettlemoyer, Luke},
keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
title = {Rethinking the Role of Demonstrations: What Makes In-Context Learning Work?},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
@article{kaplan2020scaling,
title={Scaling laws for neural language models},
author={Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B and Chess, Benjamin and Child, Rewon and Gray, Scott and Radford, Alec and Wu, Jeffrey and Amodei, Dario},
journal={arXiv preprint arXiv:2001.08361},
year={2020}
}
@article{hendrycks2020measuring,
title={Measuring massive multitask language understanding},
author={Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
journal={arXiv preprint arXiv:2009.03300},
year={2020}
}
@article{henighan2020scaling,
title={Scaling laws for autoregressive generative modeling},
author={Henighan, Tom and Kaplan, Jared and Katz, Mor and Chen, Mark and Hesse, Christopher and Jackson, Jacob and Jun, Heewoo and Brown, Tom B and Dhariwal, Prafulla and Gray, Scott and others},
journal={arXiv preprint arXiv:2010.14701},
year={2020}
}
@inproceedings{clark2022unified,
title={Unified scaling laws for routed language models},
author={Clark, Aidan and De Las Casas, Diego and Guy, Aurelia and Mensch, Arthur and Paganini, Michela and Hoffmann, Jordan and Damoc, Bogdan and Hechtman, Blake and Cai, Trevor and Borgeaud, Sebastian and others},
booktitle={International Conference on Machine Learning},
pages={4057--4086},
year={2022},
organization={PMLR}
}
@article{hernandez2021scaling,
title={Scaling laws for transfer},
author={Hernandez, Danny and Kaplan, Jared and Henighan, Tom and McCandlish, Sam},
journal={arXiv preprint arXiv:2102.01293},
year={2021}
}
@article{alabdulmohsin2022revisiting,
title={Revisiting neural scaling laws in language and vision},
author={Alabdulmohsin, Ibrahim and Neyshabur, Behnam and Zhai, Xiaohua},
journal={arXiv preprint arXiv:2209.06640},
year={2022}
}
@article{hoffmann2022training,
title={Training compute-optimal large language models},
author={Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and Buchatskaya, Elena and Cai, Trevor and Rutherford, Eliza and Casas, Diego de Las and Hendricks, Lisa Anne and Welbl, Johannes and Clark, Aidan and others},
journal={arXiv preprint arXiv:2203.15556},
year={2022}
}
@article{anderson1972more,
title={More is different: broken symmetry and the nature of the hierarchical structure of science.},
author={Anderson, Philip W},
journal={Science},
volume={177},
number={4047},
pages={393--396},
year={1972},
publisher={American Association for the Advancement of Science}
}
@article{hestness2017deep,
title={Deep learning scaling is predictable, empirically},
author={Hestness, Joel and Narang, Sharan and Ardalani, Newsha and Diamos, Gregory and Jun, Heewoo and Kianinejad, Hassan and Patwary, Md and Ali, Mostofa and Yang, Yang and Zhou, Yanqi},
journal={arXiv preprint arXiv:1712.00409},
year={2017}
}
@article{rosenfeld2019constructive,
title={A constructive prediction of the generalization error across scales},
author={Rosenfeld, Jonathan S and Rosenfeld, Amir and Belinkov, Yonatan and Shavit, Nir},
journal={arXiv preprint arXiv:1909.12673},
year={2019}
}
@inproceedings{gordon2021data,
title={Data and parameter scaling laws for neural machine translation},
author={Gordon, Mitchell A and Duh, Kevin and Kaplan, Jared},
booktitle={Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
pages={5915--5922},
year={2021}
}
@inproceedings{zhai2022scaling,
title={Scaling vision transformers},
author={Zhai, Xiaohua and Kolesnikov, Alexander and Houlsby, Neil and Beyer, Lucas},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={12104--12113},
year={2022}
}
@article{neumann2022scaling,
title={Scaling laws for a multi-agent reinforcement learning model},
author={Neumann, Oren and Gros, Claudius},
journal={arXiv preprint arXiv:2210.00849},
year={2022}
}
@article{jones2021scaling,
title={Scaling scaling laws with board games},
author={Jones, Andy L},
journal={arXiv preprint arXiv:2104.03113},
year={2021}
}
@inproceedings{ganguli2022predictability,
title={Predictability and surprise in large generative models},
author={Ganguli, Deep and Hernandez, Danny and Lovitt, Liane and Askell, Amanda and Bai, Yuntao and Chen, Anna and Conerly, Tom and Dassarma, Nova and Drain, Dawn and Elhage, Nelson and others},
booktitle={2022 ACM Conference on Fairness, Accountability, and Transparency},
pages={1747--1764},
year={2022}
}
@article{steinhardt2022future,
title={Future {ML} systems will be qualitatively different},
author={Steinhardt, Jacob},
url={https://bounded-regret.ghost.io/future-ml-systems-will-be-qualitatively-different/},
year={2022}
}
@article{hendrycks2022emergent,
title={Detecting Emergent Behavior},
author={Hendrycks, Dan},
url={https://www.youtube.com/watch?v=_4qrAck4q18},
year={2022}
}
@article{krakovna2022sharp1,
title={Refining the Sharp Left Turn threat model, part 1: claims and mechanisms},
author={Krakovna, Victoria and Varma, Vikrant and Kumar, Ramana and Phuong, Mary},
url={https://www.alignmentforum.org/posts/usKXS5jGDzjwqv3FJ/refining-the-sharp-left-turn-threat-model-part-1-claims-and},
year={2022}
}
@article{krakovna2022sharp2,
title={Refining the Sharp Left Turn threat model, part 2: applying alignment techniques},
author={Krakovna, Victoria and Varma, Vikrant and Kumar, Ramana and Phuong, Mary},
url={https://www.alignmentforum.org/posts/dfXwJh4X5aAcS8gF5/refining-the-sharp-left-turn-threat-model-part-2-applying},
year={2022}
}
@article{brown2020language,
title={Language models are few-shot learners},
author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
journal={Advances in neural information processing systems},
volume={33},
pages={1877--1901},
year={2020}
}
@article{rae2021scaling,
title={Scaling language models: Methods, analysis \& insights from training gopher},
author={Rae, Jack W and Borgeaud, Sebastian and Cai, Trevor and Millican, Katie and Hoffmann, Jordan and Song, Francis and Aslanides, John and Henderson, Sarah and Ring, Roman and Young, Susannah and others},
journal={arXiv preprint arXiv:2112.11446},
year={2021}
}
@article{thoppilan2022lamda,
title={Lamda: Language models for dialog applications},
author={Thoppilan, Romal and De Freitas, Daniel and Hall, Jamie and Shazeer, Noam and Kulshreshtha, Apoorv and Cheng, Heng-Tze and Jin, Alicia and Bos, Taylor and Baker, Leslie and Du, Yu and others},
journal={arXiv preprint arXiv:2201.08239},
year={2022}
}
@article{chowdhery2022palm,
title={Palm: Scaling language modeling with pathways},
author={Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and Bosma, Maarten and Mishra, Gaurav and Roberts, Adam and Barham, Paul and Chung, Hyung Won and Sutton, Charles and Gehrmann, Sebastian and others},
journal={arXiv preprint arXiv:2204.02311},
year={2022}
}
@article{srivastava2022beyond,
title={Beyond the imitation game: Quantifying and extrapolating the capabilities of language models},
author={Srivastava, Aarohi and Rastogi, Abhinav and Rao, Abhishek and Shoeb, Abu Awal Md and Abid, Abubakar and Fisch, Adam and Brown, Adam R and Santoro, Adam and Gupta, Aditya and Garriga-Alonso, Adri{\`a} and others},
journal={arXiv preprint arXiv:2206.04615},
year={2022}
}
@article{vaswani2017attention,
title={Attention is all you need},
author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
journal={Advances in neural information processing systems},
volume={30},
year={2017}
}
@article{caballero2022broken,
title={Broken Neural Scaling Laws},
author={Caballero, Ethan and Gupta, Kshitij and Rish, Irina and Krueger, David},
journal={arXiv preprint arXiv:2210.14891},
year={2022}
}
@article{perez2022wemay,
title={We may be able to see sharp left turns coming},
author={Perez, Ethan and Nanda, Neel},
url={https://www.alignmentforum.org/posts/2AvX8cX47CdwjbkjY/we-may-be-able-to-see-sharp-left-turns-coming},
year={2022}
}
@article{lowe2022instruct,
title={Aligning language models to follow instructions},
author={Lowe, Ryan and Leike, Jan},
url={https://openai.com/research/instruction-following},
year={2022}
}
@article{lecun1998mnist,
title={The MNIST database of handwritten digits},
author={LeCun, Yann},
journal={http://yann.lecun.com/exdb/mnist/},
year={1998}
}
@article{wei2022bigbench,
title={137 emergent abilities of large language models},
author={Wei, Jason},
url={https://www.jasonwei.net/blog/emergence},
year={2022}
}
@TECHREPORT{krizhevsky09learningmultiple,
author = {Alex Krizhevsky},
title = {Learning multiple layers of features from tiny images},
institution = {University of Toronto},
year = {2009}
}
@inproceedings{chan2022data,
title={Data distributional properties drive emergent in-context learning in transformers},
author={Chan, Stephanie CY and Santoro, Adam and Lampinen, Andrew Kyle and Wang, Jane X and Singh, Aaditya K and Richemond, Pierre Harvey and McClelland, James and Hill, Felix},
booktitle={Advances in Neural Information Processing Systems},
year={2022}
}
@article{lake2015human,
title={Human-level concept learning through probabilistic program induction},
author={Lake, Brenden M and Salakhutdinov, Ruslan and Tenenbaum, Joshua B},
journal={Science},
volume={350},
number={6266},
pages={1332--1338},
year={2015},
publisher={American Association for the Advancement of Science}
}
@article{lecun1998gradient,
title={Gradient-based learning applied to document recognition},
author={LeCun, Yann and Bottou, L{\'e}on and Bengio, Yoshua and Haffner, Patrick},
journal={Proceedings of the IEEE},
volume={86},
number={11},
pages={2278--2324},
year={1998},
publisher={IEEE}
}
@inproceedings{lin2004rouge,
title={Rouge: A package for automatic evaluation of summaries},
author={Lin, Chin-Yew},
booktitle={Text summarization branches out},
pages={74--81},
year={2004}
}
@article{brier1950verification,
title={Verification of forecasts expressed in terms of probability},
author={Brier, Glenn W and others},
journal={Monthly weather review},
volume={78},
number={1},
pages={1--3},
year={1950}
}
@article{gpt4,
author = {OpenAI},
title = {GPT-4 Technical Report},
journal = {arXiv preprint arXiv:2303.08774},
year = {2023}
}
@misc{michaud2023quantization,
title={The Quantization Model of Neural Scaling},
author={Eric J. Michaud and Ziming Liu and Uzay Girit and Max Tegmark},
year={2023},
eprint={2303.13506},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@article{nostalgebraist2022chinchilla,
title={Chinchilla's wild implications},
author={Nostalgebraist},
journal={AI Alignment Forum},
year={2022}
}
@techreport{google2023palm2,
title={PaLM 2 technical report},
author={Google},
year={2023},
url={https://ai.google/static/documents/palm2techreport.pdf}
}
@article{longpre2023pretrainer,
title={A Pretrainer's Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, \& Toxicity},
author={Longpre, S and Yauney, G and Reif, E and Lee, K and Roberts, A and Zoph, B and Zhou, D and Wei, J and Robinson, K and Mimno, D and Ippolito, D},
journal={arXiv preprint arXiv:2305.13169},
year={2023},
url={https://doi.org/10.48550/arXiv.2305.13169}
}
@inproceedings{david2010impossibility,
title={Impossibility Theorems for Domain Adaptation},
author={Ben-David, Shai and Lu, Tyler and Luu, Teresa and P{\'a}l, D{\'a}vid},
booktitle={Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics},
pages={129--136},
year={2010},
url={https://proceedings.mlr.press/v9/david10a.html}
}
@article{curse_low_div,
author = {Brando Miranda and Patrick Yu and Yu-Xiong Wang and Sanmi Koyejo},
title = {{The Curse of Low Task Diversity: On the Failure of Transfer Learning to Outperform MAML and Their Empirical Equivalence}},
year = {2022},
archivePrefix = {arXiv},
arxivId = {2208.01545},
doi = {10.48550/arXiv.2208.01545},
eprint = {2208.01545},
journal = {arXiv},
url = {https://arxiv.org/abs/2208.01545}
}
@article{longpre2023pretrainers,
title={A Pretrainer's Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, \& Toxicity},
author={Longpre, S. and Yauney, G. and Reif, E. and Lee, K. and Roberts, A. and Zoph, B. and Zhou, D. and Wei, J. and Robinson, K. and Mimno, D. and Ippolito, D.},
journal={arXiv preprint arXiv:2305.13169},
year={2023},
url={https://doi.org/10.48550/arXiv.2305.13169}
}
@misc{friedman2022vendi,
title={The Vendi Score: A Diversity Evaluation Metric for Machine Learning},
author={Dan Friedman and Adji Bousso Dieng},
year={2022},
eprint={2210.02410},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{fowl2020random,
title={Random Network Distillation as a Diversity Metric for Both Image and Text Generation},
author={Liam Fowl and Micah Goldblum and Arjun Gupta and Amr Sharaf and Tom Goldstein},
year={2020},
eprint={2010.06715},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@InProceedings{naeem2020,
title = {Reliable Fidelity and Diversity Metrics for Generative Models},
author = {Naeem, Muhammad Ferjad and Oh, Seong Joon and Uh, Youngjung and Choi, Yunjey and Yoo, Jaejun},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {7176--7185},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/naeem20a/naeem20a.pdf},
url = {https://proceedings.mlr.press/v119/naeem20a.html},
abstract = {Devising indicative evaluation metrics for the image generation task remains an open problem. The most widely used metric for measuring the similarity between real and generated images has been the Frechet Inception Distance (FID) score. Since it does not differentiate the fidelity and diversity aspects of the generated images, recent papers have introduced variants of precision and recall metrics to diagnose those properties separately. In this paper, we show that even the latest version of the precision and recall metrics are not reliable yet. For example, they fail to detect the match between two identical distributions, they are not robust against outliers, and the evaluation hyperparameters are selected arbitrarily. We propose density and coverage metrics that solve the above issues. We analytically and experimentally show that density and coverage provide more interpretable and reliable signals for practitioners than the existing metrics.}
}
@misc{sajjadi2018assessing,
title={Assessing Generative Models via Precision and Recall},
author={Mehdi S. M. Sajjadi and Olivier Bachem and Mario Lucic and Olivier Bousquet and Sylvain Gelly},
year={2018},
eprint={1806.00035},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@misc{simon2019revisiting,
title={Revisiting Precision and Recall Definition for Generative Model Evaluation},
author={Loïc Simon and Ryan Webster and Julien Rabin},
year={2019},
eprint={1905.05441},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{kynkäänniemi2019improved,
title={Improved Precision and Recall Metric for Assessing Generative Models},
author={Tuomas Kynkäänniemi and Tero Karras and Samuli Laine and Jaakko Lehtinen and Timo Aila},
year={2019},
eprint={1904.06991},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@article{bert,
archivePrefix = {arXiv},
arxivId = {1810.04805},
author = {Devlin, Jacob and Chang, Ming Wei and Lee, Kenton and Toutanova, Kristina},
eprint = {1810.04805},
isbn = {9781950737130},
journal = {NAACL HLT 2019 - 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies - Proceedings of the Conference},
month = {oct},
pages = {4171--4186},
publisher = {Association for Computational Linguistics (ACL)},
title = {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}},
url = {https://arxiv.org/abs/1810.04805v2},
volume = {1},
year = {2018}
}
@article{alexnet,
author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
title = {{ImageNet Classification with Deep Convolutional Neural Networks}},
journal = {Advances in Neural Information Processing Systems},
volume = {25},
url = {http://code.google.com/p/cuda-convnet/},
year = {2012}
}
@article{resnet,
archivePrefix = {arXiv},
arxivId = {1512.03385},
author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
doi = {10.1109/CVPR.2016.90},
eprint = {1512.03385},
isbn = {9781467388504},
issn = {10636919},
journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition},
month = {dec},
pages = {770--778},
publisher = {IEEE Computer Society},
title = {{Deep Residual Learning for Image Recognition}},
url = {https://arxiv.org/abs/1512.03385v1},
volume = {2016-December},
year = {2015}
}
@article{EfficientZero,
archivePrefix = {arXiv},
arxivId = {2111.00210},
author = {Ye, Weirui and Liu, Shaohuai and Kurutach, Thanard and Abbeel, Pieter and Gao, Yang},
eprint = {2111.00210},
month = {oct},
title = {{Mastering Atari Games with Limited Data}},
url = {https://arxiv.org/abs/2111.00210v1},
year = {2021}
}
@article{atari1,
archivePrefix = {arXiv},
arxivId = {1312.5602v1},
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin},
eprint = {1312.5602v1},
title = {{Playing Atari with Deep Reinforcement Learning}},
year = {2013}
}
@article{alphago,
author = {Silver, David and Huang, Aja and Maddison, Chris J. and Guez, Arthur and Sifre, Laurent and {Van Den Driessche}, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and Dieleman, Sander and Grewe, Dominik and Nham, John and Kalchbrenner, Nal and Sutskever, Ilya and Lillicrap, Timothy and Leach, Madeleine and Kavukcuoglu, Koray and Graepel, Thore and Hassabis, Demis},
doi = {10.1038/nature16961},
issn = {1476-4687},
journal = {Nature},
month = {jan},
number = {7587},
pages = {484--489},
pmid = {26819042},
publisher = {Nature Publishing Group},
title = {{Mastering the game of Go with deep neural networks and tree search}},
url = {https://www.nature.com/articles/nature16961},
volume = {529},
year = {2016}
}
@article{codex,
archivePrefix = {arXiv},
arxivId = {2107.03374v2},
author = {Chen, Mark and Tworek, Jerry and Jun, Heewoo and Yuan, Qiming and {Ponde de Oliveira Pinto}, Henrique and Kaplan, Jared and Edwards, Harri and Burda, Yuri and Joseph, Nicholas and Brockman, Greg and Ray, Alex and Puri, Raul and Krueger, Gretchen and Petrov, Michael and Khlaaf, Heidy and Sastry, Girish and Mishkin, Pamela and Chan, Brooke and Gray, Scott and Ryder, Nick and Pavlov, Mikhail and Power, Alethea and Kaiser, Lukasz and Bavarian, Mohammad and Winter, Clemens and Tillet, Philippe and {Petroski Such}, Felipe and Cummings, Dave and Plappert, Matthias and Chantzis, Fotios and Barnes, Elizabeth and Herbert-Voss, Ariel and {Hebgen Guss}, William and Nichol, Alex and Paino, Alex and Tezak, Nikolas and Tang, Jie and Babuschkin, Igor and Balaji, Suchir and Jain, Shantanu and Saunders, William and Hesse, Christopher and Carr, Andrew N and Leike, Jan and Achiam, Josh and Misra, Vedant and Morikawa, Evan and Radford, Alec and Knight, Matthew and Brundage, Miles and Murati, Mira and Mayer, Katie and Welinder, Peter and McGrew, Bob and Amodei, Dario and McCandlish, Sam and Sutskever, Ilya and Zaremba, Wojciech},
eprint = {2107.03374v2},
title = {{Evaluating Large Language Models Trained on Code}},
url = {https://www.github.com/openai/human-eval},
year = {2021}
}
@techreport{skiptree,
archivePrefix = {arXiv},
arxivId = {2006.04757v3},
author = {Rabe, Markus N. and Lee, Dennis and Bansal, Kshitij and Szegedy, Christian},
eprint = {2006.04757v3},
title = {{Mathematical Reasoning via Self-supervised Skip-tree Training}},
year = {2020}
}
@article{gptf,
author = {Polu, Stanislas and Sutskever, Ilya},
eprint = {2009.03393},
month = {sep},
title = {{Generative Language Modeling for Automated Theorem Proving}},
url = {http://arxiv.org/abs/2009.03393},
year = {2020}
}
@misc{pact,
author = {Jesse Michael Han and Jason Rute and Yuhuai Wu and Edward W. Ayers and Stanislas Polu},
title = {Proof Artifact Co-training for Theorem Proving with Language Models},
eprint = {2102.06203v1},
archivePrefix = {arXiv},
year = {2021}
}
@article{clip,
archivePrefix = {arXiv},
arxivId = {2103.00020},
author = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
eprint = {2103.00020},
month = {feb},
title = {{Learning Transferable Visual Models From Natural Language Supervision}},
url = {https://arxiv.org/abs/2103.00020v1},
year = {2021}
}
@misc{llama,
title={LLaMA: Open and Efficient Foundation Language Models},
author={Hugo Touvron and Thibaut Lavril and Gautier Izacard and Xavier Martinet and Marie-Anne Lachaux and Timothée Lacroix and Baptiste Rozière and Naman Goyal and Eric Hambro and Faisal Azhar and Aurelien Rodriguez and Armand Joulin and Edouard Grave and Guillaume Lample},
year={2023},
eprint={2302.13971},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{schaeffer2023emergent,
title={Are Emergent Abilities of Large Language Models a Mirage?},
author={Rylan Schaeffer and Brando Miranda and Sanmi Koyejo},
year={2023},
eprint={2304.15004},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
@article{zhang2021understanding,
title={Understanding deep learning (still) requires rethinking generalization},
author={Zhang, Chiyuan and Bengio, Samy and Hardt, Moritz and Recht, Benjamin and Vinyals, Oriol},
journal={Communications of the ACM},
volume={64},
number={3},
pages={107--115},
year={2021},
url={https://doi.org/10.1145/3446776}
}
@article{ginc,
author = {Sang Michael Xie and
Aditi Raghunathan and
Percy Liang and
Tengyu Ma},
title = {An Explanation of In-context Learning as Implicit Bayesian Inference},
journal = {CoRR},
volume = {abs/2111.02080},
year = {2021},
url = {https://arxiv.org/abs/2111.02080},
eprinttype = {arXiv},
eprint = {2111.02080},
timestamp = {Fri, 05 Nov 2021 15:25:54 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2111-02080.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{miranda2023pretraining,
title={Is Pre-training Truly Better Than Meta-Learning?},
author={Miranda, B. and Yu, P. and Goyal, S. and Wang, Y.-X. and Koyejo, S.},
journal={arXiv preprint arXiv:2306.13841},
year={2023},
note={\url{https://doi.org/10.48550/arXiv.2306.13841}}
}
@online{ruiz2023train,
author={Ruiz, Armand},
title={What it Takes to Train a Foundation Model},
year={2023},
month={7},
day={15},
url={https://www.nocode.ai/what-it-takes-to-train-a-foundation-model/},
note={Director of Data Science at IBM and the founder of NoCode.ai}
}
@misc{cerebras2023slimpajama,
author = {Soboleva, Daria and Al-Khateeb, Faisal and Myers, Robert and Steeves, Jacob R and Hestness, Joel and Dey, Nolan},
title = {{SlimPajama: A 627B token cleaned and deduplicated version of RedPajama}},
month = jun,
year = 2023,
howpublished = {\url{https://www.cerebras.net/blog/slimpajama-a-627b-token-cleaned-and-deduplicated-version-of-redpajama}},
url = {https://huggingface.co/datasets/cerebras/SlimPajama-627B},
}
@misc{tirumala2023d4,
title={D4: Improving LLM Pretraining via Document De-Duplication and Diversification},
author={Kushal Tirumala and Daniel Simig and Armen Aghajanyan and Ari S. Morcos},
year={2023},
eprint={2308.12284},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{xie2023doremi,
title={DoReMi: Optimizing Data Mixtures Speeds Up Language Model Pretraining},
author={Sang Michael Xie and Hieu Pham and Xuanyi Dong and Nan Du and Hanxiao Liu and Yifeng Lu and Percy Liang and Quoc V. Le and Tengyu Ma and Adams Wei Yu},
year={2023},
eprint={2305.10429},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{nguyen2019understanding,
title={Toward Understanding Catastrophic Forgetting in Continual Learning},
author={Cuong V. Nguyen and Alessandro Achille and Michael Lam and Tal Hassner and Vijay Mahadevan and Stefano Soatto},
year={2019},
eprint={1908.01091},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{edwards2017neural,
title={Towards a Neural Statistician},
author={Harrison Edwards and Amos Storkey},
year={2017},
eprint={1606.02185},
archivePrefix={arXiv},
primaryClass={stat.ML}
}
@misc{xie2022explanation,
title={An Explanation of In-context Learning as Implicit Bayesian Inference},
author={Sang Michael Xie and Aditi Raghunathan and Percy Liang and Tengyu Ma},
year={2022},
eprint={2111.02080},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{touvron2023llama,
title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
year={2023},
eprint={2307.09288},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{shin2022effect,
title={On the Effect of Pretraining Corpora on In-context Learning by a Large-scale Language Model},
author={Seongjin Shin and Sang-Woo Lee and Hwijeen Ahn and Sungdong Kim and HyoungSeok Kim and Boseop Kim and Kyunghyun Cho and Gichang Lee and Woomyoung Park and Jung-Woo Ha and Nako Sung},
year={2022},
eprint={2204.13509},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{eldan2023tinystories,
title={TinyStories: How Small Can Language Models Be and Still Speak Coherent English?},
author={Ronen Eldan and Yuanzhi Li},
year={2023},
eprint={2305.07759},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{gunasekar2023textbooks,
title={Textbooks Are All You Need},
author={Suriya Gunasekar and Yi Zhang and Jyoti Aneja and Caio César Teodoro Mendes and Allie Del Giorno and Sivakanth Gopi and Mojan Javaheripi and Piero Kauffmann and Gustavo de Rosa and Olli Saarikivi and Adil Salim and Shital Shah and Harkirat Singh Behl and Xin Wang and Sébastien Bubeck and Ronen Eldan and Adam Tauman Kalai and Yin Tat Lee and Yuanzhi Li},
year={2023},
eprint={2306.11644},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{li2023textbooks,
title={Textbooks Are All You Need II: phi-1.5 technical report},
author={Yuanzhi Li and Sébastien Bubeck and Ronen Eldan and Allie Del Giorno and Suriya Gunasekar and Yin Tat Lee},
year={2023},
eprint={2309.05463},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@INPROCEEDINGS{9577374,
author={Wallace, Bram and Wu, Ziyang and Hariharan, Bharath},
booktitle={2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
title={Can We Characterize Tasks Without Labels or Features?},
year={2021},
pages={1245--1254},
keywords={Computer vision;Codes;Shape;Computational modeling;Natural languages;Pattern recognition;Task analysis},
doi={10.1109/CVPR46437.2021.00130}
}
@misc{xie2023data,
title={Data Selection for Language Models via Importance Resampling},
author={Sang Michael Xie and Shibani Santurkar and Tengyu Ma and Percy Liang},
year={2023},
eprint={2302.03169},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{svcca,
author = {Raghu, Maithra and Gilmer, Justin and Yosinski, Jason and Sohl-Dickstein, Jascha},
booktitle = {Advances in Neural Information Processing Systems},
pages = {6076--6085},
title = {{SVCCA: Singular Vector Canonical Correlation Analysis for Deep Learning Dynamics and Interpretability}},
volume = {30},
year = {2017}
}
@inproceedings{pwcca,
author = {Morcos, Ari S. and Raghu, Maithra and Bengio, Samy},
booktitle = {Advances in Neural Information Processing Systems},
title = {{Insights on representational similarity in neural networks with canonical correlation}},
year = {2018}
}
@article{opd,
archivePrefix = {arXiv},
arxivId = {2108.01661v1},
author = {Ding, Frances and Denain, Jean-Stanislas and Steinhardt, Jacob},
eprint = {2108.01661v1},
title = {{Grounding Representation Similarity with Statistical Testing}},
url = {https://github.com/js-d/sim_metric},
year={2021}
}
@inproceedings{cka,
author = {Kornblith, Simon and Norouzi, Mohammad and Lee, Honglak and Hinton, Geoffrey},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
issn = {2640-3498},
keywords = {ICML,Machine Learning},
month = {may},
pages = {3519--3529},
publisher = {PMLR},
title = {{Similarity of Neural Network Representations Revisited}},
url = {http://proceedings.mlr.press/v97/kornblith19a.html},
year = {2019}
}
@misc{mirage,
title={Are Emergent Abilities of Large Language Models a Mirage?},
author={Rylan Schaeffer and Brando Miranda and Sanmi Koyejo},
year={2023},
eprint={2304.15004},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
@misc{contamination,
title={Investigating Data Contamination for Pre-training Language Models},
author={Minhao Jiang and Ken Ziyu Liu and Ming Zhong and Rylan Schaeffer and Siru Ouyang and Jiawei Han and Sanmi Koyejo},
year={2024},
eprint={2401.06059},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{dsir,
title={Data Selection for Language Models via Importance Resampling},
author={Sang Michael Xie and Shibani Santurkar and Tengyu Ma and Percy Liang},
year={2023},
eprint={2302.03169},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inbook{Bengtsson_2008,
title={Curse-of-dimensionality revisited: Collapse of the particle filter in very large scale systems},
url={http://dx.doi.org/10.1214/193940307000000518},
DOI={10.1214/193940307000000518},
booktitle={Probability and Statistics: Essays in Honor of David A. Freedman},
publisher={Institute of Mathematical Statistics},
author={Bengtsson, Thomas and Bickel, Peter and Li, Bo},
year={2008},
pages={316--334}
}
@book{GelmanMeng2004,
title={Applied Bayesian Modeling and Causal Inference from Incomplete-Data Perspectives},
editor={Gelman, Andrew and Meng, Xiao-Li},
year={2004},
publisher={Wiley},
series={Wiley Series in Probability and Statistics}
}
@inproceedings{SnyderEtAl2008,
title={Obstacles to High-Dimensional Particle Filtering},
author={Snyder, Chris and Bengtsson, Thomas and Bickel, Peter and Anderson, Jeff},
booktitle={Mathematical Advances in Data Assimilation (MADA)},
year={2008}
}
@inproceedings{AchilleMbengSoatto2018,
title={The Dynamic Distance Between Learning Tasks: From Kolmogorov Complexity to Transfer Learning via Quantum Physics and the Information Bottleneck of the Weights of Deep Networks},
author={Achille, Alessandro and Mbeng, Glen Bigan and Soatto, Stefano},
booktitle={NeurIPS Workshop on Integration of Deep Learning Theories},
year={2018},
month={Dec},
day={8}
}
@misc{achille2019dynamics,
title={Dynamics and Reachability of Learning Tasks},
author={Alessandro Achille and Glen Mbeng and Stefano Soatto},
year={2019},
eprint={1810.02440},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@article{AchillePaoliniMbengSoatto2021,
title={The Information Complexity of Learning Tasks, Their Structure and Their Distance},
author={Achille, Alessandro and Paolini, Giovanni and Mbeng, Glen and Soatto, Stefano},
journal={Information and Inference: A Journal of the IMA},
volume={10},
pages={51--72},
year={2021},
doi={10.1093/imaiai/iaaa033},
}
@misc{llamav2,
title={Llama 2: Open Foundation and Fine-Tuned Chat Models},
author={Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic and Sergey Edunov and Thomas Scialom},
year={2023},
eprint={2307.09288},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{abbas2024effective,
title={Effective pruning of web-scale datasets based on complexity of concept clusters},
author={Amro Abbas and Evgenia Rusak and Kushal Tirumala and Wieland Brendel and Kamalika Chaudhuri and Ari S. Morcos},
year={2024},
eprint={2401.04578},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{abbas2023semdedup,
title={SemDeDup: Data-efficient learning at web-scale through semantic deduplication},
author={Amro Abbas and Kushal Tirumala and Dániel Simig and Surya Ganguli and Ari S. Morcos},
year={2023},
eprint={2303.09540},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{sorscher2023neural,
title={Beyond neural scaling laws: beating power law scaling via data pruning},
author={Ben Sorscher and Robert Geirhos and Shashank Shekhar and Surya Ganguli and Ari S. Morcos},
year={2023},
eprint={2206.14486},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{mitchell2023measuring,
title={Measuring Data},
author={Margaret Mitchell and Alexandra Sasha Luccioni and Nathan Lambert and Marissa Gerchick and Angelina McMillan-Major and Ezinwanne Ozoani and Nazneen Rajani and Tristan Thrush and Yacine Jernite and Douwe Kiela},
year={2023},
eprint={2212.05129},
archivePrefix={arXiv},
primaryClass={cs.AI}
}