# config.ini

# This configuration file follows the INI File Structure
# https://docs.python.org/3/library/configparser.html#supported-ini-file-structure

[DATA]
# Directory containing the spaCy vocabulary files
Spacy Vocab Dir = ex_data/vocab
# spaCy KnowledgeBase file
Spacy KB = ex_data/kb
# CoNLL file annotated with Wikidata QIDs
Conll Annotated = ex_data/conll/conll-wikidata-iob-annotations
# File mapping Wikidata QIDs to Wikipedia abstracts
Wikipedia Abstracts = ex_data/yi-chun/wikidata-wikipedia.tsv
# File generated by the candidate generator
# (available if you have run these scripts before)
Candidate Info = data/docs_entities_info.json

[INPUT VECTORS]
# Whether to use the balanced dataset; when True, the
# "Balanced Dataset Dir" and "N Negative Samples" settings below apply
Use Balanced Dataset = False
# Directory to read or write generated input vectors
Input Vectors Dir = data/vectors
# The next two settings are only relevant if Use Balanced Dataset = True.
# Directory to read or write balanced dataset input vectors
Balanced Dataset Dir = data/balanced_dataset
# Number of negative samples in the balanced dataset (determines the pos:neg ratio)
N Negative Samples = 1

[BERT]
# Name of the model if fetched from Hugging Face
Model ID = bert-base-uncased
# NOTE(review): presumably the maximum input length in tokens;
# confirm against the code that reads this setting
Max Sequence Length = 512
# Directory to read the BERT model from
Bert Model Dir = models/bert_ned
# Directory to write trained models to
Save Model Dir = models/trained

[TRAINING]
# Number of training epochs
Epochs = 5
# Number of encoder transformers to freeze before training (max 12)
Freeze N Transformers = 8
# NOTE(review): presumably the number of examples per batch; confirm
Batch Size = 24
# Use the default document split of the CoNLL dataset
Use Default Split = True
# The three sizes below are only relevant if Use Default Split = False.
# NOTE(review): the values sum to 0.3, not 1.0; presumably each is a
# fraction of the full dataset, confirm that this is intended
Training Set Size = 0.1
Validation Set Size = 0.1
Test Set Size = 0.1

[VERBOSITY]
# Frequency of progress updates during training, in number of steps
Training Update Frequency = 100
# NOTE(review): presumably the same kind of frequency (in steps) for
# validation and test runs; confirm
Validation Update Frequency = 50
Test Update Frequency = 50