Train DeBERTa V3 with JAX/Flax
% DeBERTaV3 arXiv preprint. Model names in the title are brace-protected so
% that sentence-casing bibliography styles keep their capitalisation.
@misc{he2021debertav3,
  author        = {He, Pengcheng and Gao, Jianfeng and Chen, Weizhu},
  title         = {{DeBERTaV3}: Improving {DeBERTa} using {ELECTRA}-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
  year          = {2021},
  eprint        = {2111.09543},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% ELECTRA conference paper (ICLR 2020). The booktitle stores the full
% proceedings name; styles that want the acronym can abbreviate it.
@inproceedings{clark2020electra,
  author    = {Clark, Kevin and Luong, Minh-Thang and Le, Quoc V. and Manning, Christopher D.},
  title     = {{ELECTRA}: Pre-training Text Encoders as Discriminators Rather Than Generators},
  booktitle = {International Conference on Learning Representations ({ICLR})},
  year      = {2020},
  url       = {https://openreview.net/pdf?id=r1xMH1BtvB},
}
% GitHub repository: PyTorch re-implementation of ELECTRA. Proper nouns in the
% title are brace-protected against sentence-casing styles.
@misc{electra_pytorch,
  author       = {Wang, Richard},
  title        = {{PyTorch} implementation of {ELECTRA}},
  year         = {2020},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/richarddwang/electra_pytorch}},
}
% GitHub repository for sahajBERT. The title is a single mixed-case project
% name, so it is brace-protected; the author is a handle kept as one literal name.
@misc{sahajBERT,
  author       = {{tanmoyio}},
  title        = {{sahajBERT}},
  year         = {2021},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/tanmoyio/sahajbert}},
}