-
Notifications
You must be signed in to change notification settings - Fork 32
/
create_manifest.py
112 lines (98 loc) · 5.04 KB
/
create_manifest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
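Script to create the manifest.txt and manifest.md files. The manifest.txt
file lists which (public) files should be copied from the source bucket
to the account-specific FMBench bucket; manifest.md is a Markdown listing
of the available configuration files for the documentation website.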
"""
import os
import glob
from typing import List
from pathlib import Path
# Output file names; both are written relative to the current working
# directory at the time the corresponding function runs.
MANIFEST_FILE: str = "manifest.txt"
MANIFEST_MD_FILE: str = "manifest.md"

# Entries that are always part of the manifest, in addition to the config
# .yml files discovered at runtime by create_manifest_file().
BASE_FILE_LIST: List[str] = [
    # placeholder files
    "prompt_template/.keep",
    "tokenizer/.keep",
    "llama2_tokenizer/.keep",
    "llama3_tokenizer/.keep",
    "llama3_1_tokenizer/.keep",
    "llama3_2_tokenizer/.keep",
    "mistral_tokenizer/.keep",
    "phi_tokenizer/.keep",
    "scripts/.keep",
    # pricing files that live directly in the configs directory
    "configs/pricing.yml",
    "configs/pricing_fallback.yml",
    # add new datasets here
    "source_data/2wikimqa_e.jsonl",
    "source_data/2wikimqa.jsonl",
    "source_data/hotpotqa_e.jsonl",
    "source_data/hotpotqa.jsonl",
    "source_data/narrativeqa.jsonl",
    "source_data/triviaqa_e.jsonl",
    "source_data/triviaqa.jsonl",
    "source_data/just_text.jsonl",
    "source_data/500_token_prompts_synthetic_data.jsonl",
    "source_data/synthetic_data_large_prompts.jsonl",
    # license files shipped alongside the datasets
    "source_data/LICENSE.txt",
    "source_data/THIRD_PARTY_LICENSES.txt",
]
import subprocess
import re
def get_tree_output(directory='.'):
    """
    Run the external `tree` utility on `directory` and return what it printed.

    `tree` is invoked with the `-f` flag. Only the captured standard output
    is returned (as text); the exit status and standard error are not
    inspected here.
    """
    tree_cmd = ['tree', '-f', directory]
    completed = subprocess.run(tree_cmd, capture_output=True, text=True)
    return completed.stdout
def convert_to_markdown_links(tree_output, directory):
    """
    Convert the output of the `tree -f` command into Markdown, one entry per line.

    Every non-empty line of `tree_output` becomes one Markdown line:
    entries whose name has a file suffix become `[text](path)` links, and
    entries without a suffix are rendered in bold with no link. The suffix
    test is only a heuristic for "is a directory", so a file without an
    extension is also rendered in bold.

    The first and the last generated entries are dropped before joining:
    they correspond to the root directory name and to the
    "N directories, M files" summary that `tree` prints.

    Args:
        tree_output: text printed by `tree -f` for `directory`.
        directory: the directory that was listed; its leading
            "<directory>/" prefix is removed from the displayed text.

    Returns:
        The Markdown lines joined with newlines (an empty string when there
        are fewer than three entries).
    """
    # Skips leading whitespace and at most ONE leading tree-drawing token;
    # markers of deeper nesting levels stay in the captured text.
    path_pattern = re.compile(r'\s*(?:├──|└──|─|│)?\s*(.*)')
    dir_prefix = f"{directory}/"
    markdown_links = []
    for line in tree_output.splitlines():
        match = path_pattern.match(line)
        path = match.group(1).strip() if match else ''
        if not path:
            continue
        # Link target: the path with any remaining tree-drawing characters removed
        path_link = path.replace('├── ', '')\
            .replace('└──', '')\
            .replace('│ ', '')\
            .strip()
        # Displayed text: drop only the leading "<directory>/" prefix. Removing
        # every occurrence would corrupt nested paths that repeat the
        # directory name (e.g. configs/a/configs/x.yml).
        path_readable = path.replace(dir_prefix, '', 1)
        # if the path is a directory then we don't want to put a link for it
        # because we don't support directory listing, it would just return a
        # broken page
        if Path(path).suffix == '':
            markdown_link = f"**{path_readable}** "
        else:
            markdown_link = f'[{path_readable}]({path_link}) '
        markdown_links.append(markdown_link)
    # remove the first line, it is just the name of the listed directory
    # remove the last line, it is the number of files and directories
    # like: 14 directories, 69 files
    return '\n'.join(markdown_links[1:-1])
def create_dir_listing_as_markdown(directory):
    """
    Write a Markdown listing of `directory` to the MANIFEST_MD_FILE file.

    The listing is produced by running `tree` on `directory` (via
    get_tree_output) and converting its output with
    convert_to_markdown_links. A fixed introductory paragraph is placed in
    front of it. The file is written relative to the current working
    directory and replaces any existing content.
    """
    preamble = (
        "Here is a listing of the various configuration files available "
        "out-of-the-box with `FMBench`. Click on any link to view a file. "
        "You can use these files as-is or use them as templates to create "
        "a custom configuration file for your use-case of interest.\n\n"
    )
    listing = convert_to_markdown_links(get_tree_output(directory), directory)
    Path(MANIFEST_MD_FILE).write_text(preamble + listing)
def create_manifest_file(config_yml_dir):
    """
    Write the MANIFEST_FILE file: discovered config files plus BASE_FILE_LIST.

    Config files are the `.yml` files found recursively under
    `config_yml_dir`. The glob pattern requires at least one intermediate
    directory, so `.yml` files located directly in `config_yml_dir` are not
    picked up by the search. The "src/fmbench/" part is removed from each
    discovered path, the combined list is sorted, and the entries are
    written one per line (no trailing newline).
    """
    search_pattern = os.path.join(config_yml_dir, "**/*", "*.yml")
    package_prefix = os.path.join("src", "fmbench") + "/"
    config_yml_files = [
        yml_path.replace(package_prefix, "")
        for yml_path in glob.glob(search_pattern, recursive=True)
    ]
    print(f"there are {len(config_yml_files)} config yml files")

    # combine with the base list; sorted so that the diff between versions
    # is easier to understand
    all_manifest_files = sorted(config_yml_files + BASE_FILE_LIST)

    # and write to manifest.txt
    manifest_text = "\n".join(all_manifest_files)
    written: int = Path(MANIFEST_FILE).write_text(manifest_text)
    print(f"written {written} bytes to {MANIFEST_FILE}")
# all .yml files in the config directory need to be appended to the base
# file list when writing manifest.txt
config_yml_dir = os.path.join("src", "fmbench", "configs")
create_manifest_file(config_yml_dir)

# create the directory listing to put on the website; the listing is
# generated from inside the docs directory, so manifest.md is written there
# and the listed paths are relative to it
DOCS_DIR: str = "docs"
CONFIG_DIR_FOR_LISTING: str = os.path.join("configs")
cwd = os.getcwd()
os.chdir(DOCS_DIR)
try:
    create_dir_listing_as_markdown(CONFIG_DIR_FOR_LISTING)
finally:
    # always go back to the original working directory, even when the
    # listing step fails
    os.chdir(cwd)