# newscrawler.cfg
# IMPORTANT
# All variables are parsed into the correct Python types (unless declared otherwise)!
# So booleans have to be True or False (capitalized),
# floats need a decimal point . (not a comma),
# ints are just normal integers,
# dicts have to look like this: { key: value }
# and arrays have to look like this: [ value1, value2, value3 ]
# All values inside dicts and arrays are parsed as well.
# Everything that does not match any of the above criteria is parsed as a string.
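# An illustrative example of these parsing rules (hypothetical keys, not read by the crawler):
#   some_bool = True                 -> bool
#   some_float = 0.65                -> float
#   some_int = 5                     -> int
#   some_dict = {"key": "value"}     -> dict
#   some_array = [1, 2.5, "three"]   -> array (each value parsed as well)
#   some_text = anything else        -> string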
[General]
# Set here how you call Python 2.7 on your command line or terminal.
# According to PEP 394, 'python' should invoke the same version of Python as 'python2' (which does not need to exist). Some systems do not follow this standard, which is why this setting exists.
# Examples: 'python', 'python2', 'python2.7'
# default: python
python_command = python
[Crawler]
# GENERAL
# -------
# Crawling heuristics
# Default crawler:
# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (see the ./newscrawler/crawler/spiders/ directory)
# default: SitemapCrawler
default = SitemapCrawler
# default:
# fallbacks = {
# "RssCrawler": None,
# "RecursiveSitemapCrawler": "RecursiveCrawler",
# "SitemapCrawler": "RecursiveCrawler",
# "RecursiveCrawler": None,
# "Download": None
# }
fallbacks = {
"RssCrawler": None,
"RecursiveSitemapCrawler": "RecursiveCrawler",
"SitemapCrawler": "RecursiveCrawler",
"RecursiveCrawler": None,
"Download": None
}
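# e.g. with the mapping above, a site on which the SitemapCrawler cannot run (presumably because no usable
# sitemap is found) is handled by the RecursiveCrawler instead; crawlers mapped to None have no fallback.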
# Determines how many hours must pass since the last download of a webpage
# before the RssCrawler downloads it again
# default: 6
hours_to_pass_for_redownload_by_rss_crawler = 6
# PROCESSES
# ---------
# Number of crawlers that should crawl in parallel,
# not counting daemonized crawlers
# default: 5
number_of_parallel_crawlers = 5
# Number of daemonized crawlers that should run in parallel, in addition to the crawlers above.
# default: 10
number_of_parallel_daemons = 10
# SPECIAL CASES
# -------------
# URLs that end in any of the following file extensions are ignored during recursive crawling
# default: "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"
ignore_file_extensions = "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"
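# e.g. with the default value above, a link such as the hypothetical https://example.com/files/report.pdf
# or https://example.com/img/logo.png would be skipped, while ordinary .html article pages are still followed.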
# URLs that match the following regex are ignored during recursive crawling
# default: ""
ignore_regex = ""
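# e.g. a value such as "(.*)/login(.*)" would (presumably) skip any URL containing '/login';
# the empty default disables this filter.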
# Crawl the sitemaps of subdomains (if a sitemap-based crawler is used)
# If True, any SitemapCrawler will try to crawl the sitemap of the given domain including its subdomains instead of the domain's main sitemap.
# e.g. if True, a SitemapCrawler started on https://blog.zeit.de will try to crawl the sitemap listed in http://blog.zeit.de/robots.txt. If none is found, it falls back to the False behaviour.
# If False, a SitemapCrawler started on https://blog.zeit.de will try to crawl the sitemap listed in http://zeit.de/robots.txt
# default: True
sitemap_allow_subdomains = True
[Heuristics]
# Enabled heuristics,
# Currently:
# - og_type
# - linked_headlines
# - self_linked_headlines
# - is_not_from_subdomain (when enabled, this ensures that only pages that are not from a subdomain are downloaded)
# - meta_contains_article_keyword
# - crawler_contains_only_article_alikes
# (maybe not up-to-date, see ./newscrawler/helper_classes/heuristics.py:
# every method not starting with __ should be a heuristic, except is_article)
# These heuristics can be overwritten by input_data.json for every site
# default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
enabled_heuristics = {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56", "crawler_contains_only_article_alikes": True}
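# Note: a heuristic value of True simply requires the boolean heuristic to hold, while a comparison string
# such as "<=0.65" presumably requires the heuristic's computed ratio to satisfy that comparison.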
# Heuristics can be combined with others
# The heuristics need to have the same name as in enabled_heuristics
# Possible condition-characters / literals are: (, ), not, and, or
# All heuristics used here need to be enabled in enabled_heuristics as well!
# Examples:
# "og_type and (self_linked_headlines or linked_headlines)"
# "og_type"
# default: "og_type and (self_linked_headlines or linked_headlines)"
pass_heuristics_condition = "og_type and (self_linked_headlines or linked_headlines) or crawler_contains_only_article_alikes"
# The linked_headlines heuristic uses the ratio of headlines to linked headlines in a file.
# This setting is the minimum number of headlines a file needs for that ratio to be checked.
# If fewer than this number are in the file, the file passes the test.
# default: 5
min_headlines_for_linked_test = 5
[Files]
# GENERAL:
# -------
# Paths:
# Toggles whether relative paths are resolved relative to the start_processes.py script (True) or relative to this config file (False)
# This does not apply to this config's 'Scrapy' section, whose paths are always relative to the directory from which start_processes.py is called
# Default: True
relative_to_start_processes_file = True
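# e.g. with True, a relative path such as ./input_data.hjson below is resolved from the directory
# containing start_processes.py; with False, it is resolved from the directory of this config file.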
# INPUT:
# -----
# Here you can specify the input JSON file,
# i.e. the file containing the base URLs to crawl.
# Absolute and relative file paths are allowed.
# default: ./input_data.hjson
url_input = ./input_data.hjson
# OUTPUT:
# ------
# The following strings in local_data_directory will be replaced (md5 hashes have a standard length of 32 chars):
#
# %time_download(<code>) = current time at download; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/
# %time_execution(<code>) = current time at execution; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/
# %timestamp_download = current time at download; unix-timestamp
# %timestamp_execution = current time at execution; unix-timestamp
# %domain(<size>) = first <size> chars of the domain of the crawled file (e.g. zeit.de)
# %appendmd5_domain(<size>) = appends the md5 to %domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if domain is longer than <size>
# %md5_domain(<size>) = first <size> chars of md5 hash of %domain
# %full_domain(<size>) = first <size> chars of the domain including subdomains (e.g. panamapapers.sueddeutsche.de)
# %appendmd5_full_domain(<size>) = appends the md5 to %full_domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if full_domain is longer than <size>
# %md5_full_domain(<size>) = first <size> chars of md5 hash of %full_domain
# %subdomains(<size>) = first <size> chars of the domain's subdomains
# %appendmd5_subdomains(<size>) = appends the md5 to %subdomains(<<size> - 32 (md5 length) - 1 (_ as separator)>) if subdomains is longer than <size>
# %md5_subdomains(<size>) = first <size> chars of md5 hash of %subdomains
# %url_directory_string(<size>) = first <size> chars of the directories on the server (e.g. http://panamapapers.sueddeutsche.de/articles/56f2c00da1bb8d3c3495aa0a/ would evaluate to articles_56f2c00da1bb8d3c3495aa0a), no filename
# %appendmd5_url_directory_string(<size>) = appends the md5 to %url_directory_string(<<size> - 32 (md5 length) - 1 (_ as separator)>) if url_directory_string is longer than <size>
# %md5_url_directory_string(<size>) = first <size> chars of md5 hash of %url_directory_string(<size>)
# %url_file_name(<size>) = first <size> chars of the file name (without type) on the server (e.g. http://www.spiegel.de/wirtschaft/soziales/ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466.html would evaluate to ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466); URLs without a filename (indexes) evaluate to index
# %md5_url_file_name(<size>) = first <size> chars of md5 hash of %url_file_name
# %max_url_file_name = first x chars of %url_file_name, so that the entire save path does not exceed the maximum path length of a Windows file system (260 characters - 1 <NUL>)
# %appendmd5_max_url_file_name = appends the md5 to the first x - 32 (md5 length) - 1 (_ as separator) chars of %url_file_name if the entire save path would exceed the maximum path length of a Windows file system (260 characters - 1 <NUL>)
#
# This path can be relative or absolute, though to be able to easily merge multiple data sets, it should be kept relative and consistent on all datasets.
# To be able to use cleanup commands, it should also start with a static folder name like 'data'
#
# default: ./data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html
local_data_directory = ./data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html
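# An illustrative expansion (hypothetical timestamp): for the panamapapers URL from the examples above,
# downloaded during an execution started on 2016-05-04, the default pattern would evaluate to something like
# ./data/2016/05/04/panamapapers.sueddeutsche.de/articles_56f2c00da1bb8d3c3495aa0a_index_1462357467.html
# (no md5 is appended here because none of the parts exceed their <size> limits).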
# Toggles whether a leading './' or '.\' in the local_data_directory above should be removed when saving the path into the database
# True: ./data would become data
# default: True
format_relative_path = True
[Database]
# MySQL connection, required for saving meta-information
host = db.dbvis.de
port = 3306
db = ccolon
username = ccolon
password = b3eY7Tep2F7Pg559Vg0W
[Scrapy]
# Possible levels (must be uppercase): CRITICAL, ERROR, WARNING, INFO, DEBUG
# default: WARNING
LOG_LEVEL = WARNING
# Log format, see https://docs.python.org/2/library/logging.html#logrecord-attributes
# default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s
LOG_FORMAT = [%(name)s:%(lineno)d|%(levelname)s] %(message)s
# Can be a filename or None
# default: None
LOG_FILE = None
LOG_DATEFORMAT = %Y-%m-%d %H:%M:%S
LOG_STDOUT = False
LOG_ENCODING = utf-8
BOT_NAME = 'newscrawler'
SPIDER_MODULES = ['newscrawler.crawler.spiders']
NEWSPIDER_MODULE = 'newscrawler.crawler.spiders'
# Resume/Pause functionality activation
# default: ./.resume_jobdir
JOBDIR = ./.resume_jobdir
# Respect robots.txt activation
# default: False
ROBOTSTXT_OBEY = True
# Maximum number of concurrent requests across all domains
# default: 16
# IMPORTANT: This setting has no cross-crawler effect since each crawler runs its own Scrapy instance, but it may limit CONCURRENT_REQUESTS_PER_DOMAIN if that setting is set to a higher value than this one.
CONCURRENT_REQUESTS = 16
# Maximum number of active requests per domain
# default: 4
CONCURRENT_REQUESTS_PER_DOMAIN = 4
# User-agent activation
# default: 'ccolon_newscrawler (+http://www.uni-konstanz.de)'
USER_AGENT = 'ccolon_newscrawler (+http://www.uni-konstanz.de)'
# Pipeline activation
# Syntax: '<relative location>.<Pipeline name>': <Order of execution from 0-1000>
# default: {'newscrawler.crawler.pipelines.RSSCrawlCompare':300, 'newscrawler.crawler.pipelines.LocalStorage':400, 'newscrawler.crawler.pipelines.DatabaseStorage': 500}
ITEM_PIPELINES = {'newscrawler.crawler.pipelines.RSSCrawlCompare':300, 'newscrawler.crawler.pipelines.LocalStorage':400, 'newscrawler.crawler.pipelines.DatabaseStorage': 500}
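# Scrapy runs these pipelines in ascending order of the numbers above; e.g. to run without writing
# meta-information to the database, the DatabaseStorage entry could simply be removed from this dict.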