Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research
Authors
Venue
arXiv preprint arXiv:2402.00159
Tags
Links
BibTeX
Local Entry
@comment{Local record of the arXiv preprint. The arXiv identifier is stored in
the eprint/archiveprefix fields rather than embedded in a journal string, so
styles can format and link it themselves.}
@misc{soldaini2024dolma,
  title         = {{Dolma}: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research},
  author        = {Soldaini, Luca and Kinney, Rodney and Bhagia, Akshita and
                   Schwenk, Dustin and Atkinson, David and Authur, Russell and
                   Bogin, Ben and Chandu, Khyathi and Dumas, Jennifer and
                   Elazar, Yanai and Hofmann, Valentin and Jha, Ananya Harsh and
                   Kumar, Sachin and Lucy, Li and Lyu, Xinxi and
                   Lambert, Nathan and Magnusson, Ian and Morrison, Jacob and
                   Muennighoff, Niklas and Naik, Aakanksha and Nam, Crystal and
                   Peters, Matthew E. and Ravichander, Abhilasha and
                   Richardson, Kyle and Shen, Zejiang and Strubell, Emma and
                   Subramani, Nishant and Tafjord, Oyvind and Walsh, Pete and
                   Zettlemoyer, Luke and Smith, Noah A. and
                   Hajishirzi, Hannaneh and Beltagy, Iz and Groeneveld, Dirk and
                   Dodge, Jesse and Lo, Kyle},
  year          = {2024},
  eprint        = {2402.00159},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2402.00159},
}
From OPENALEX
@comment{OpenAlex record of the published version. The DOI below is an ACL
Anthology DOI (2024.acl-long.840); booktitle and publisher are derived from
that identifier and should be confirmed against the Anthology record.}
@inproceedings{soldaini2024dolma,
  title     = {{Dolma}: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research},
  author    = {Soldaini, Luca and Kinney, Rodney and Bhagia, Akshita and
               Schwenk, Dustin and Atkinson, David and Authur, Russell and
               Bogin, Ben and Chandu, Khyathi Raghavi and Dumas, Jennifer and
               Elazar, Yanai and Hofmann, Valentin and Jha, Ananya and
               Kumar, Sachin and Lucy, Li and Lyu, Xinxi and
               Lambert, Nathan and Magnusson, Ian and Morrison, Jacob and
               Muennighoff, Niklas and Naik, Aakanksha and Nam, Crystal and
               Peters, Matthew E. and Ravichander, Abhilasha and
               Richardson, Kyle and Shen, Zejiang and Strubell, Emma and
               Subramani, Nishant and Tafjord, Oyvind and Walsh, Evan and
               Zettlemoyer, Luke and Smith, Noah A. and
               Hajishirzi, Hannaneh and Beltagy, Iz and Groeneveld, Dirk and
               Dodge, Jesse and Lo, Kyle},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  publisher = {Association for Computational Linguistics},
  year      = {2024},
  doi       = {10.18653/v1/2024.acl-long.840},
}