-
Notifications
You must be signed in to change notification settings - Fork 9
/
done.py
110 lines (86 loc) · 2.73 KB
/
done.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from pathlib import Path
from glob import glob
import csv
import sys
from multiprocessing import Pool
from typing import List
from tqdm.auto import tqdm
import argtyped
from helpers import save_json, load_json
# Raise the csv module's per-field size cap as high as the platform allows
# (presumably because some TSV columns hold very large values -- confirm).
# ``sys.maxsize`` does not fit in a C long on every platform (e.g. Windows),
# where passing it raises OverflowError, so shrink the value until accepted.
_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_size_limit)
        break
    except OverflowError:
        _field_size_limit //= 10

# Column names, in order, handed to ``csv.DictReader`` when reading the TSV
# files; because they are supplied explicitly, the first row of each file is
# treated as data rather than as a header.
TSV_FIELDNAMES = [
    "listing_id",
    "photo_id",
    "image_w",
    "image_h",
    "vfov",
    "features",
    "boxes",
    "cls_prob",
    "attr_prob",
    "featureViewIndex",
    "featureHeading",
    "featureElevation",
]
def search_locations(image_folder: Path) -> List[Path]:
    """Return the immediate sub-directories of ``image_folder``.

    Entries that are not directories are skipped. The result keeps the
    order in which ``Path.iterdir`` yields the entries.
    """
    return list(filter(lambda entry: entry.is_dir(), image_folder.iterdir()))
def load_photo_paths(cache: Path) -> List[str]:
    """Read photo paths from ``cache``, one per line, stripped of whitespace.

    NOTE(review): ``cache_photo_paths`` in this file writes its cache with
    ``save_json``, so this line-based reader presumably targets a different,
    plain-text cache -- confirm before using it on that file.

    Raises:
        RuntimeError: if ``cache`` is not an existing file.
    """
    if cache.is_file():
        with open(cache, "r") as handle:
            return [line.strip() for line in handle]
    raise RuntimeError("Please cache paths first")
def cache_photo_paths(image_folder: Path, cache: Path):
    """Build the photo-id -> image-path mapping and store it in ``cache``.

    Does nothing when ``cache`` already exists as a file. Otherwise every
    ``*.jpg`` directly inside each sub-directory of ``image_folder`` is
    collected, and the second dash-separated token of the file stem is
    parsed as the integer photo id. The mapping is written with
    ``save_json``.
    """
    if cache.is_file():
        return

    jpg_paths = []
    for folder in tqdm(search_locations(image_folder)):
        jpg_paths.extend(str(jpg) for jpg in folder.glob("*.jpg"))

    mapping = {}
    for jpg_path in tqdm(jpg_paths):
        photo_id = int(Path(jpg_path.strip()).stem.split("-")[1])
        mapping[photo_id] = jpg_path

    save_json(mapping, cache)
class Arguments(argtyped.Arguments):
    # Command-line options of this script, declared as typed class attributes.

    # Passed to extraction() as `start` (offset into the TSV list and suffix
    # of the output file); with a pool, worker ids begin at this value.
    start: int = 0
    # Size of the multiprocessing pool; 0 runs extraction() in this process.
    num_workers: int = 1
    # Stride used by extraction() when slicing the TSV file list.
    num_splits: int = 1
    # Root folder scanned for image sub-folders and for the TSV files.
    images: Path = Path("images")
    # JSON file holding the photo-id -> image-path mapping.
    cache: Path = Path(".photo_id_to_path.json")
def extraction(args: Arguments, start: int):
    """Write the image path of every photo listed in this worker's TSV shard.

    The photo-id -> path mapping is loaded from ``args.cache``. All files
    matching ``*.tsv.*`` under ``args.images`` are listed, and this worker
    handles every ``args.num_splits``-th file beginning at index ``start``.
    For each row, the path registered for its ``photo_id`` is written as one
    line of ``.done.<start>.txt`` in the current directory; that file is
    overwritten on every call.

    Args:
        args: parsed command-line options (``cache``, ``images``,
            ``num_splits`` are read here).
        start: shard index; only shard 0 prints the summary messages.

    Raises:
        KeyError: if a row's ``photo_id`` is missing from the loaded mapping
            (assuming ``load_json`` returns a plain dict -- confirm).
    """
    is_first_shard = start == 0

    if is_first_shard:
        print("Extracting photo id")
    photo_id_to_path = load_json(args.cache)
    if is_first_shard:
        print(f"Found {len(photo_id_to_path)} images")

    # glob() does not guarantee any ordering, and every worker lists the
    # files on its own; sorting gives all workers the same list so that the
    # strided shards below are disjoint and together cover every file.
    tsv_files = sorted(glob(f"{args.images}/**/*.tsv.*", recursive=True))
    if is_first_shard:
        print(f"Found {len(tsv_files)}")
    tsv_files = tsv_files[start :: args.num_splits]

    output = f".done.{start}.txt"
    # Opening once in "w" mode both empties any previous result and keeps a
    # single handle for the whole run instead of reopening per TSV file.
    with open(output, "w") as fout:
        print(f"Creating {output}")
        for tsv_file in tqdm(tsv_files):
            # newline="" lets the csv module handle line endings itself.
            with open(tsv_file, newline="") as fid:
                reader = csv.DictReader(fid, TSV_FIELDNAMES, delimiter="\t")
                for item in reader:
                    done = photo_id_to_path[item["photo_id"]]
                    fout.write(f"{done}\n")
if __name__ == "__main__":
    # Script entry point: parse options, make sure the photo-id cache exists,
    # then run the extraction either in-process or across a worker pool.
    args = Arguments()
    print(args.to_string(width=80))

    cache_photo_paths(args.images, args.cache)

    if args.num_workers <= 1:
        # A single worker needs no pool. This also covers the default
        # num_workers=1, which previously matched neither branch and made
        # the script exit without extracting anything.
        extraction(args, args.start)
    else:
        worker_ids = range(args.start, args.start + args.num_workers)
        # The context manager releases the pool's processes once the
        # blocking starmap call has returned.
        with Pool(args.num_workers) as pool:
            pool.starmap(
                extraction,
                [(args, proc_id) for proc_id in worker_ids],
            )