forked from klory/CookGAN
-
Notifications
You must be signed in to change notification settings - Fork 2
/
clean_recipes_with_canonical_ingrs.py
executable file
·112 lines (98 loc) · 3.37 KB
/
clean_recipes_with_canonical_ingrs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
from tqdm import tqdm
import json
import argparse
import pickle
import re
from common import root, tok, remove_numbers
import common
import numpy as np
# Command-line interface: the Recipe1M data folder plus the integer 0/1
# switches `--lower` and `--remove_numbers` (both default to 0).
parser = argparse.ArgumentParser(description='clean recipes')
parser.add_argument(
    '--data_dir',
    default=f'{root}/data/Recipe1M',
    help='the folder which contains Recipe1M text files',
)
# Both switches are declared identically, so register them in one pass.
for switch in ("--lower", "--remove_numbers"):
    parser.add_argument(switch, type=int, default=0, choices=[0, 1])
args = parser.parse_args()
data_dir = args.data_dir
print('load recipes (20 seconds)')
# Merge the L1, L2 and INGRS layers read from <data_dir>/texts.
layers = [common.Layer.L1, common.Layer.L2, common.Layer.INGRS]
texts_dir = os.path.join(data_dir, 'texts')
recipes_original = common.Layer.merge(layers, texts_dir)

# Every instruction / ingredient entry is indexed by 'text'; keep only that
# value so both fields become plain lists.
for rcp in recipes_original:
    for field in ('instructions', 'ingredients'):
        rcp[field] = [entry['text'] for entry in rcp[field]]

# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# this is only safe while replacement_dict.pkl is a trusted, project-owned file.
replacement_path = f'{root}/manual_files/replacement_dict.pkl'
with open(replacement_path, 'rb') as f:
    replace_dict = pickle.load(f)
print('start processing')
cvgs = []  # per kept recipe: matched ingredients / listed ingredients
recipes = []  # every recipe that survives cleaning
recipes_withImage = []  # the kept recipes that carry a non-empty 'images' list
for rcp in tqdm(recipes_original):
    # Run every instruction through tok() and collapse whitespace runs to a
    # single space. The steps are joined with '\n' so the replacements below
    # work on one string and can be split back into steps at the end.
    insts = '\n'.join(
        ' '.join(tok(inst).split()) for inst in rcp['instructions']
    )
    if not insts:
        # Nothing left after tokenization: skip the recipe silently.
        continue

    title = ' '.join(tok(rcp['title']).split())

    if args.lower:
        insts = insts.lower()
        title = title.lower()
    if args.remove_numbers:
        insts = remove_numbers(insts)
        title = remove_numbers(title)

    ingrs = []
    N = len(rcp['ingredients'])
    for ingr_name in rcp['ingredients']:
        # Lookup key for replace_dict: tok() output (per the original note it
        # pads punctuation with spaces -- TODO confirm in common.tok), runs of
        # spaces collapsed, then spaces turned into underscores.
        name = re.sub(' +', ' ', tok(ingr_name)).replace(' ', '_')
        if name not in replace_dict:
            # Ingredients without a replacement entry are dropped.
            continue

        final_name = replace_dict[name]
        ingrs.append(final_name)
        # Space-separated spelling of the replacement name.
        name1 = final_name.replace('_', ' ')
        if args.lower:
            # The texts were lower-cased above, so the search strings must be too.
            ingr_name = ingr_name.lower()
            name1 = name1.lower()

        # Substitute both the raw ingredient text and the spaced replacement
        # name with the underscore-joined replacement name.
        for old in (ingr_name, name1):
            insts = insts.replace(old, final_name)
            title = title.replace(old, final_name)

    n = len(ingrs)
    if n == 0:
        # tqdm.write prints the message without breaking the progress bar.
        tqdm.write('no ingredients, discard')
        continue

    cvgs.append(n / N)
    rcp['title'] = title
    rcp['ingredients'] = ingrs
    rcp['instructions'] = insts.split('\n')
    recipes.append(rcp)
    if 'images' in rcp and len(rcp['images']) > 0:
        recipes_withImage.append(rcp)
# Report mean and standard deviation of the per-recipe coverage values, then
# the number of kept recipes and how many of them have images.
cvgs = np.array(cvgs)
cvg_mean, cvg_std = cvgs.mean(), cvgs.std()
print('cvg = {:.2f} -- {:.2f}'.format(cvg_mean, cvg_std))
print(len(recipes), len(recipes_withImage))

print('saving...')
# Output file stem, keyed by (lower, remove_numbers).
stems = {
    (False, False): 'recipes',
    (True, False): 'recipes_lower',
    (False, True): 'recipes_noNumbers',
    (True, True): 'recipes_lower_noNumbers',
}
filename = stems[bool(args.lower), bool(args.remove_numbers)]

# Write the full set first, then the image-bearing subset, both into data_dir.
for template, payload in (
    ('{}.json', recipes),
    ('{}_withImage.json', recipes_withImage),
):
    with open(os.path.join(data_dir, template.format(filename)), 'w') as f:
        json.dump(payload, f, indent=2)