# newscrawler.cfg
# IMPORTANT
# All variables are parsed into the correct Python types (unless declared otherwise)!
# So booleans have to be True or False (capitalized),
# floats need a decimal point . (not a comma),
# ints are just normal integers,
# dicts have to look like this: { key: value }
# and arrays have to look like this: [ value1, value2, value3 ]
# All values inside dicts and arrays are parsed as well.
# Everything that does not match any of the above criteria is parsed as a string.
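# An illustrative example of these parsing rules (hypothetical keys, not read by the crawler):
#   some_bool = True                 -> bool
#   some_float = 0.65                -> float
#   some_int = 5                     -> int
#   some_dict = {"key": "value"}     -> dict
#   some_array = [1, 2.5, "three"]   -> array (each value parsed as well)
#   some_text = anything else        -> string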
[General]
# Set here how you call Python 2.7 on your command line or terminal.
# According to PEP 394, 'python' should invoke the same version of Python as 'python2' (which does not need to exist). Some systems do not follow this standard, which is why this setting exists.
# Examples: 'python', 'python2', 'python2.7'
# default: python
python_command = python
[Crawler]
# GENERAL
# -------
# Crawling heuristics
# Default crawler:
# Possibilities: RecursiveCrawler, RecursiveSitemapCrawler, RssCrawler, SitemapCrawler, Download (see the ./newscrawler/crawler/spiders/ directory)
# default: SitemapCrawler
default = SitemapCrawler
# default:
# fallbacks = {
# "RssCrawler": None,
# "RecursiveSitemapCrawler": "RecursiveCrawler",
# "SitemapCrawler": "RecursiveCrawler",
# "RecursiveCrawler": None,
# "Download": None
# }
fallbacks = {
"RssCrawler": None,
"RecursiveSitemapCrawler": "RecursiveCrawler",
"SitemapCrawler": "RecursiveCrawler",
"RecursiveCrawler": None,
"Download": None
}
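# e.g. with the mapping above, a site on which the SitemapCrawler cannot run (presumably because no usable
# sitemap is found) is handled by the RecursiveCrawler instead; crawlers mapped to None have no fallback.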
# Determines how many hours must pass since the last download of a webpage
# before the RssCrawler downloads it again
# default: 6
hours_to_pass_for_redownload_by_rss_crawler = 6
# PROCESSES
# ---------
# Number of crawlers that should crawl in parallel,
# not counting daemonized crawlers
# default: 5
number_of_parallel_crawlers = 5
# Number of daemonized crawlers that should run in parallel, in addition to the crawlers above.
# default: 10
number_of_parallel_daemons = 10
# SPECIAL CASES
# -------------
# URLs that end in any of the following file extensions are ignored during recursive crawling
# default: "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"
ignore_file_extensions = "(pdf)|(docx?)|(xlsx?)|(pptx?)|(epub)|(jpe?g)|(png)|(bmp)|(gif)|(tiff)|(webp)|(avi)|(mpe?g)|(mov)|(qt)|(webm)|(ogg)|(midi)|(mid)|(mp3)|(wav)|(zip)|(rar)|(exe)|(apk)|(css)"
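# e.g. with the default value above, a link such as the hypothetical https://example.com/files/report.pdf
# or https://example.com/img/logo.png would be skipped, while ordinary .html article pages are still followed.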
# URLs that match the following regex are ignored during recursive crawling
# default: ""
ignore_regex = ""
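# e.g. a value such as "(.*)/login(.*)" would (presumably) skip any URL containing '/login';
# the empty default disables this filter.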
# Crawl the sitemaps of subdomains (if a sitemap-based crawler is used)
# If True, any SitemapCrawler will try to crawl the sitemap of the given domain including its subdomains instead of the domain's main sitemap.
# e.g. if True, a SitemapCrawler started on https://blog.zeit.de will try to crawl the sitemap listed in http://blog.zeit.de/robots.txt. If none is found, it falls back to the False behaviour.
# If False, a SitemapCrawler started on https://blog.zeit.de will try to crawl the sitemap listed in http://zeit.de/robots.txt
# default: True
sitemap_allow_subdomains = True
[Heuristics]
# Enabled heuristics,
# Currently:
# - og_type
# - linked_headlines
# - self_linked_headlines
# - is_not_from_subdomain (when enabled, this ensures that only pages that are not from a subdomain are downloaded)
# - meta_contains_article_keyword
# - crawler_contains_only_article_alikes
# (maybe not up-to-date, see ./newscrawler/helper_classes/heuristics.py:
# every method not starting with __ should be a heuristic, except is_article)
# These heuristics can be overwritten by input_data.json for every site
# default: {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56"}
enabled_heuristics = {"og_type": True, "linked_headlines": "<=0.65", "self_linked_headlines": "<=0.56", "crawler_contains_only_article_alikes": True}
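# Note: a heuristic value of True simply requires the boolean heuristic to hold, while a comparison string
# such as "<=0.65" presumably requires the heuristic's computed ratio to satisfy that comparison.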
# Heuristics can be combined with others
# The heuristics need to have the same name as in enabled_heuristics
# Possible condition-characters / literals are: (, ), not, and, or
# All heuristics used here need to be enabled in enabled_heuristics as well!
# Examples:
# "og_type and (self_linked_headlines or linked_headlines)"
# "og_type"
# default: "og_type and (self_linked_headlines or linked_headlines)"
pass_heuristics_condition = "og_type and (self_linked_headlines or linked_headlines) or crawler_contains_only_article_alikes"
# The linked_headlines heuristic uses the ratio of headlines to linked headlines in a file.
# This setting is the minimum number of headlines a file needs for that ratio to be checked.
# If fewer than this number are in the file, the file passes the test.
# default: 5
min_headlines_for_linked_test = 5
[Files]
# GENERAL:
# -------
# Paths:
# Toggles whether relative paths are resolved relative to the start_processes.py script (True) or relative to this config file (False)
# This does not apply to this config's 'Scrapy' section, whose paths are always relative to the directory from which start_processes.py is called
# Default: True
relative_to_start_processes_file = True
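# e.g. with True, a relative path such as ./input_data.hjson below is resolved from the directory
# containing start_processes.py; with False, it is resolved from the directory of this config file.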
# INPUT:
# -----
# Here you can specify the input JSON file,
# i.e. the file containing the base URLs to crawl.
# Absolute and relative file paths are allowed.
# default: ./input_data.hjson
url_input = ./input_data.hjson
# OUTPUT:
# ------
# The following strings in local_data_directory will be replaced (md5 hashes have a standard length of 32 chars):
#
# %time_download(<code>) = current time at download; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/
# %time_execution(<code>) = current time at execution; will be replaced with strftime(<code>) where <code> is a string, explained further here: http://strftime.org/
# %timestamp_download = current time at download; unix-timestamp
# %timestamp_execution = current time at execution; unix-timestamp
# %domain(<size>) = first <size> chars of the domain of the crawled file (e.g. zeit.de)
# %appendmd5_domain(<size>) = appends the md5 to %domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if domain is longer than <size>
# %md5_domain(<size>) = first <size> chars of md5 hash of %domain
# %full_domain(<size>) = first <size> chars of the domain including subdomains (e.g. panamapapers.sueddeutsche.de)
# %appendmd5_full_domain(<size>) = appends the md5 to %full_domain(<<size> - 32 (md5 length) - 1 (_ as separator)>) if full_domain is longer than <size>
# %md5_full_domain(<size>) = first <size> chars of md5 hash of %full_domain
# %subdomains(<size>) = first <size> chars of the domain's subdomains
# %appendmd5_subdomains(<size>) = appends the md5 to %subdomains(<<size> - 32 (md5 length) - 1 (_ as separator)>) if subdomains is longer than <size>
# %md5_subdomains(<size>) = first <size> chars of md5 hash of %subdomains
# %url_directory_string(<size>) = first <size> chars of the directories on the server (e.g. http://panamapapers.sueddeutsche.de/articles/56f2c00da1bb8d3c3495aa0a/ would evaluate to articles_56f2c00da1bb8d3c3495aa0a), no filename
# %appendmd5_url_directory_string(<size>) = appends the md5 to %url_directory_string(<<size> - 32 (md5 length) - 1 (_ as separator)>) if url_directory_string is longer than <size>
# %md5_url_directory_string(<size>) = first <size> chars of md5 hash of %url_directory_string(<size>)
# %url_file_name(<size>) = first <size> chars of the file name (without type) on the server (e.g. http://www.spiegel.de/wirtschaft/soziales/ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466.html would evaluate to ttip-dokumente-leak-koennte-ende-der-geheimhaltung-markieren-a-1090466); URLs without a filename (indexes) evaluate to index
# %md5_url_file_name(<size>) = first <size> chars of md5 hash of %url_file_name
# %max_url_file_name = first x chars of %url_file_name, so that the entire save path does not exceed the maximum path length of a Windows file system (260 characters - 1 <NUL>)
# %appendmd5_max_url_file_name = appends the md5 to the first x - 32 (md5 length) - 1 (_ as separator) chars of %url_file_name if the entire save path would exceed the maximum path length of a Windows file system (260 characters - 1 <NUL>)
#
# This path can be relative or absolute, though to be able to easily merge multiple data sets, it should be kept relative and consistent on all datasets.
# To be able to use cleanup commands, it should also start with a static folder name like 'data'
#
# default: ./data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html
local_data_directory = ./data/%time_execution(%Y)/%time_execution(%m)/%time_execution(%d)/%appendmd5_full_domain(32)/%appendmd5_url_directory_string(60)_%appendmd5_max_url_file_name_%timestamp_download.html
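# An illustrative expansion (hypothetical timestamp): for the panamapapers URL from the examples above,
# downloaded during an execution started on 2016-05-04, the default pattern would evaluate to something like
# ./data/2016/05/04/panamapapers.sueddeutsche.de/articles_56f2c00da1bb8d3c3495aa0a_index_1462357467.html
# (no md5 is appended here because none of the parts exceed their <size> limits).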
# Toggles whether a leading './' or '.\' in the local_data_directory above should be removed when saving the path into the database
# True: ./data would become data
# default: True
format_relative_path = True
[Database]
# MySQL connection, required for saving meta-information
host = db.dbvis.de
port = 3306
db = ccolon
username = ccolon
password = b3eY7Tep2F7Pg559Vg0W
[Scrapy]
# Possible levels (must be uppercase): CRITICAL, ERROR, WARNING, INFO, DEBUG
# default: WARNING
LOG_LEVEL = WARNING
# Log format, see https://docs.python.org/2/library/logging.html#logrecord-attributes
# default: [%(name)s:%(lineno)d|%(levelname)s] %(message)s
LOG_FORMAT = [%(name)s:%(lineno)d|%(levelname)s] %(message)s
# Can be a filename or None
# default: None
LOG_FILE = None
LOG_DATEFORMAT = %Y-%m-%d %H:%M:%S
LOG_STDOUT = False
LOG_ENCODING = utf-8
BOT_NAME = 'newscrawler'
SPIDER_MODULES = ['newscrawler.crawler.spiders']
NEWSPIDER_MODULE = 'newscrawler.crawler.spiders'
# Resume/Pause functionality activation
# default: ./.resume_jobdir
JOBDIR = ./.resume_jobdir
# Respect robots.txt activation
# default: False
ROBOTSTXT_OBEY = True
# Maximum number of concurrent requests across all domains
# default: 16
# IMPORTANT: This setting has no cross-crawler effect since each crawler runs its own Scrapy instance, but it may limit CONCURRENT_REQUESTS_PER_DOMAIN if that setting is set to a higher value than this one.
CONCURRENT_REQUESTS = 16
# Maximum number of active requests per domain
# default: 4
CONCURRENT_REQUESTS_PER_DOMAIN = 4
# User-agent activation
# default: 'ccolon_newscrawler (+http://www.uni-konstanz.de)'
USER_AGENT = 'ccolon_newscrawler (+http://www.uni-konstanz.de)'
# Pipeline activation
# Syntax: '<relative location>.<Pipeline name>': <Order of execution from 0-1000>
# default: {'newscrawler.crawler.pipelines.RSSCrawlCompare':300, 'newscrawler.crawler.pipelines.LocalStorage':400, 'newscrawler.crawler.pipelines.DatabaseStorage': 500}
ITEM_PIPELINES = {'newscrawler.crawler.pipelines.RSSCrawlCompare':300, 'newscrawler.crawler.pipelines.LocalStorage':400, 'newscrawler.crawler.pipelines.DatabaseStorage': 500}
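# Scrapy runs these pipelines in ascending order of the numbers above; e.g. to run without writing
# meta-information to the database, the DatabaseStorage entry could simply be removed from this dict.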