Training Compute-Optimal Large Language Models
Authors
Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, Tom Hennigan, Eric Noland, Katie Millican, George van den Driessche, Bogdan Damoc, Aurelia Guy, Simon Osindero, Karen Simonyan, Erich Elsen, Jack W. Rae, Oriol Vinyals, Laurent Sifre
Venue
NeurIPS 2022
Abstract
Shows that current large language models are significantly undertrained. For compute-optimal training, model size and the number of training tokens should be scaled in equal proportion. Introduces Chinchilla (70B parameters, 1.4T training tokens), which, on the same compute budget, outperforms much larger models such as Gopher (280B parameters) that were trained on far fewer tokens.
Tags
Links
https://arxiv.org/abs/2203.15556
BibTeX
Local Entry
@inproceedings{hoffmann2022chinchilla,
title = {Training Compute-Optimal Large Language Models},
author = {Jordan Hoffmann and Sebastian Borgeaud and Arthur Mensch and Elena Buchatskaya and Trevor Cai and Eliza Rutherford and Diego de Las Casas and Lisa Anne Hendricks and Johannes Welbl and Aidan Clark and Tom Hennigan and Eric Noland and Katie Millican and George van den Driessche and Bogdan Damoc and Aurelia Guy and Simon Osindero and Karen Simonyan and Erich Elsen and Jack W. Rae and Oriol Vinyals and Laurent Sifre},
year = {2022},
booktitle = {Advances in Neural Information Processing Systems 35 (NeurIPS 2022)},
url = {https://arxiv.org/abs/2203.15556},
abstract = {Shows that current large language models are significantly undertrained. For compute-optimal training, model size and the number of training tokens should be scaled in equal proportion. Introduces Chinchilla (70B parameters, 1.4T training tokens), which, on the same compute budget, outperforms much larger models such as Gopher (280B parameters) that were trained on far fewer tokens.}
}
From OpenAlex
@inproceedings{hoffmann2022chinchilla,
title = {Training Compute-Optimal Large Language Models},
author = {Jordan Hoffmann and Sebastian Borgeaud and Arthur Mensch and Elena Buchatskaya and Trevor Cai and Eliza Rutherford and Diego de Las Casas and Lisa Anne Hendricks and Johannes Welbl and Aidan Clark and Tom Hennigan and Eric Noland and Katie Millican and George van den Driessche and Bogdan Damoc and Aurelia Guy and Simon Osindero and Karen Simonyan and Erich Elsen and Jack W. Rae and Oriol Vinyals and Laurent Sifre},
year = {2022},
booktitle = {arXiv (Cornell University)},
doi = {10.48550/arxiv.2203.15556}
}
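Notes
A minimal back-of-the-envelope sketch (not from the paper's code) of what "scale model size and training tokens equally" implies in practice. It assumes the common C ≈ 6·N·D approximation for training FLOPs and a tokens-per-parameter ratio of about 20, which matches the Chinchilla numbers quoted in the abstract (1.4T / 70B ≈ 20); the function name and constants are illustrative, not the paper's fitted values.

def compute_optimal(c_flops, tokens_per_param=20.0):
    """Roughly size a compute-optimal model for a FLOP budget c_flops,
    assuming C ≈ 6 * N * D and D = tokens_per_param * N, so that N and D
    each scale as the square root of the compute budget."""
    n_params = (c_flops / (6.0 * tokens_per_param)) ** 0.5
    n_tokens = tokens_per_param * n_params
    return n_params, n_tokens

if __name__ == "__main__":
    # Chinchilla's approximate budget: 6 * 70e9 params * 1.4e12 tokens ≈ 5.9e23 FLOPs
    budget = 6 * 70e9 * 1.4e12
    n, d = compute_optimal(budget)
    print(f"params ≈ {n:.2e}, tokens ≈ {d:.2e}")  # ≈ 7.0e10 params, 1.4e12 tokens

Under these assumptions, doubling the compute budget grows both the parameter count and the token count by a factor of about √2, which is the "scale equally" claim in the abstract.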