-
Notifications
You must be signed in to change notification settings - Fork 3
/
INPUT_SCHEMA.json
78 lines (78 loc) · 2.96 KB
/
INPUT_SCHEMA.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
{
"title": "Rust crawler",
"type": "object",
"schemaVersion": 1,
"required": ["urls", "extract"],
"properties": {
"urls": {
"title": "Start URLs",
"type": "array",
"description": "URLs that will be scraped. Must be an array of objects with \"url\" property.",
"prefill": [
{ "url": "http://example.com" }
],
"editor": "requestListSources"
},
"extract": {
"title": "Extraction config",
"type": "array",
"description": "Array that defines what and how should be scraped from a page HTML. See readme for more info.",
"editor": "json",
"prefill": [
{
"field_name": "title",
"selector": "h1",
"extract_type": {
"type": "Text"
}
},
{
"field_name": "description",
"selector": "p",
"extract_type": {
"type": "Text"
}
}
]
},
"proxy_settings": {
"title": "Proxy configuration",
"type": "object",
"description": "Select proxies to be used by your crawler. For most use cases we recommend the default Apify automatic proxy.",
"prefill": { "useApifyProxy": true },
"editor": "proxy"
},
"max_concurrency": {
"title": "Max concurrency",
"type": "integer",
"description": "Sets the maximum concurrency (parallelism) for the crawl. Keep this is reasonable level because this scraper can go really fast.",
"default": 50,
"minimum": 1
},
"max_request_retries": {
"title": "Max request retries",
"type": "integer",
"description": "Sets the maximum number of retries for each request(URL).",
"default": 3,
"minimum": 1
},
"debug_log": {
"title": "Debug log",
"type": "boolean",
"description": "Shows when each URL starts and ends scraping with timings. Don't use for larger runs as the log gets filled quickly.",
"default": false
},
"push_data_size": {
"title": "Push data buffer size",
"type": "integer",
"description": "Buffers results into vector (array) before pushing to a dataset. This prevents overwhelming Apify API. The default number is usually a good choice.",
"default": 500
},
"force_cloud": {
"title": "Force cloud",
"type": "boolean",
"description": "This allows local runs to use cloud storage, mainly for testing. On Apify platform this has no effect.",
"default": false
}
}
}