hooks : setting up flake8 and pre-commit hooks (ggerganov#1681)
Small, non-functional changes were made to non-compliant files.
These include breaking up long lines, whitespace sanitization, and
removal of unused imports.

The maximum line length in Python files was set to a generous 125
characters to minimize the number of changes needed in scripts and
the general annoyance. The "txt" prompts directory is excluded from
the checks, as it may contain oddly formatted files and strings for
a good reason.

Signed-off-by: Jiri Podivin <[email protected]>
jpodivin authored Jun 17, 2023
1 parent bac1992 commit 5ddf7ea
Showing 5 changed files with 42 additions and 12 deletions.
2 changes: 2 additions & 0 deletions .flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 125
15 changes: 15 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,15 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+exclude: prompts/.*.txt
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: check-yaml
+    -   id: check-added-large-files
+-   repo: https://github.com/PyCQA/flake8
+    rev: 6.0.0
+    hooks:
+    -   id: flake8
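With both configs in place, a contributor would typically enable the checks locally along these lines (a sketch; it assumes the pre-commit package from PyPI, which provisions flake8 into its own hook environment):

    pip install pre-commit
    pre-commit install          # register the git hook so checks run on every commit
    pre-commit run --all-files  # one-off run against the entire tree, not just staged files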
26 changes: 18 additions & 8 deletions convert.py
@@ -512,7 +512,11 @@ def validate_conversion_to(self, data_type: DataType) -> None:
         if not isinstance(self.data_type, QuantizedDataType):
             raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
         if self.data_type.have_g_idx:
-            sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML. For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n")
+            sys.stderr.write(
+                "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
+                "which is not yet natively supported by GGML. "
+                "For now you can still convert this model by passing `--outtype f16` to dequantize, "
+                "but that will result in a much larger output file for no quality benefit.\n")
             sys.exit(1)
         assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends

@@ -694,8 +698,9 @@ def load(offset: int, elm_count: int) -> NDArray:
             description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
             return LazyStorage(load=load, kind=pid[1], description=description)

-    # @staticmethod
-    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
+    # @staticmethod
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
+                               # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)

@@ -812,7 +817,7 @@ def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
     # Use mmap for the actual data to avoid race conditions with the file offset.
     off = fp.raw.tell()
     mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    fp.raw.seek(off) # needed on Windows
+    fp.raw.seek(off)  # needed on Windows

     def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
         shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
@@ -1054,7 +1059,7 @@ def load_some_model(path: Path) -> ModelPlus:
         files = list(path.glob("model-00001-of-*.safetensors"))
         if not files:
             # Try the PyTorch patterns too, with lower priority
-            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ]
+            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
             files = [file for glob in globs for file in path.glob(glob)]
         if not files:
             # Try GGML too, but with lower priority, since if both a non-GGML
@@ -1094,7 +1099,9 @@ def load_vocab(path: Path) -> SentencePieceVocab:
     elif path3.exists():
         path = path3
     else:
-        raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir")
+        raise FileNotFoundError(
+            f"Could not find tokenizer.model in {path} or its parent; "
+            "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
@@ -1110,7 +1117,9 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
     }[params.file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
     if ret in model_paths:
-        sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input. Please explicitly specify a path using --outfile.\n")
+        sys.stderr.write(
+            f"Error: Default output path ({ret}) would overwrite the input. "
+            "Please explicitly specify a path using --outfile.\n")
         sys.exit(1)
     return ret

@@ -1131,7 +1140,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
-    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("model", type=Path,
+                        help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
     args = parser.parse_args(args_in)

     vocab: Vocab
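The convert.py wrapping above relies on Python's implicit concatenation of adjacent string literals, so splitting a long message across lines leaves the resulting string unchanged. A minimal, self-contained illustration (the message text here is invented):

    # Adjacent string literals inside parentheses are joined at compile
    # time, so the wrapped form produces exactly one string.
    msg = (
        "Error: something went wrong, "
        "and here is more detail.\n")
    assert msg == "Error: something went wrong, and here is more detail.\n"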
7 changes: 4 additions & 3 deletions examples/jeopardy/graph.py
@@ -1,5 +1,5 @@
 import matplotlib.pyplot as plt
-import sys, os
+import os
 import csv

 labels = []
@@ -8,6 +8,7 @@

 rows = []

+
 def bar_chart(numbers, labels, pos):
     plt.bar(pos, numbers, color='blue')
     plt.xticks(ticks=pos, labels=labels)
@@ -16,6 +17,7 @@ def bar_chart(numbers, labels, pos):
     plt.ylabel("Questions Correct")
     plt.show()

+
 def calculatecorrect():
     directory = os.fsencode("./examples/jeopardy/results/")
     csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
@@ -38,14 +40,13 @@ def calculatecorrect():
                 print(line)
             else:
                 print("Correct answer: " + rows[i][2] + "\n")
-            i+=1
+            i += 1
             print("Did the AI get the question right? (y/n)")
             if input() == "y":
                 totalcorrect += 1
     numbers.append(totalcorrect)


-
 if __name__ == '__main__':
     calculatecorrect()
     pos = list(range(numEntries))
4 changes: 3 additions & 1 deletion scripts/verify-checksum-models.py
@@ -1,9 +1,10 @@
 import os
 import hashlib

+
 def sha256sum(file):
     block_size = 16 * 1024 * 1024  # 16 MB block size
-    b = bytearray(block_size) 
+    b = bytearray(block_size)
     file_hash = hashlib.sha256()
     mv = memoryview(b)
     with open(file, 'rb', buffering=0) as f:
@@ -15,6 +16,7 @@ def sha256sum(file):

     return file_hash.hexdigest()

+
 # Define the path to the llama directory (parent folder of script directory)
 llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

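For context, the sha256sum helper touched above hashes large model files in fixed-size chunks, reusing one buffer through a memoryview so the whole file never has to sit in memory. A self-contained sketch of the same pattern (not the repository's exact code; the function name and 16 MB block size mirror the script):

    import hashlib

    def sha256sum(path: str) -> str:
        block_size = 16 * 1024 * 1024  # 16 MB block size, as in the script
        buf = bytearray(block_size)    # one reusable buffer, no per-read allocation
        view = memoryview(buf)         # lets us feed only the bytes actually read
        digest = hashlib.sha256()
        with open(path, 'rb', buffering=0) as f:
            while True:
                n = f.readinto(buf)    # fill buf in place; returns number of bytes read
                if n == 0:             # 0 means end of file
                    break
                digest.update(view[:n])
        return digest.hexdigest()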
