-
Notifications
You must be signed in to change notification settings - Fork 0
/
webfictionscraper.py
246 lines (198 loc) · 9.56 KB
/
webfictionscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import os
import click
import questionary
from questionary import Choice, Separator
from click import echo, progressbar
from scraper import FictionScraperClient
# Single scraper client instance shared by every command in this module.
client = FictionScraperClient()
# Module-wide switch: set to True by the `cli` group callback when the
# --headless flag is passed or SCRAPER_HEADLESS=true is set in the environment.
# Commands check it before showing interactive prompts.
HEADLESS = False
@click.group()
@click.option("--headless", is_flag=True, help="Do not prompt for any user input")
def cli(headless):
    # Root command group. Headless mode is enabled either by the --headless
    # flag or by SCRAPER_HEADLESS=true in the environment, and is recorded in
    # the module-level HEADLESS flag that the sub-commands consult.
    # (Documented with a comment rather than a docstring because click would
    # display a docstring as the group's --help text.)
    global HEADLESS
    env_requests_headless = os.environ.get("SCRAPER_HEADLESS") == "true"
    if headless or env_requests_headless:
        HEADLESS = True
@cli.command()
@click.pass_context
def interactive(ctx):
    """Start the scraper interactively."""
    # NOTE: click prints the docstring above as this command's --help text, so
    # implementation notes live in comments.
    #
    # Flow: load the local configs, show a menu (download / generate / run one
    # of the configs), perform the selected action, then offer to return to the
    # menu. Cancelling any prompt ends the command.
    #
    # Menu values for configs are tagged with this prefix so they can be told
    # apart from the fixed "download" / "generate" actions.
    prefix = "config:"

    if HEADLESS:
        echo("The interactive CLI is not available because headless mode is active.")
        echo("Don't pass the --headless argument and set the SCRAPER_HEADLESS env variable to anything but 'true'.")
        return

    configs = client.list_fiction_configs()
    choices = []
    if configs:
        with progressbar(configs, label="Loading configs") as bar:
            for config_name in bar:
                # Best effort: a broken config must not take down the menu.
                try:
                    config = client.load_fiction_config(config_name)
                except Exception as error:
                    echo('Error loading config \'%s\': %s' % (config_name, error))
                    input('Press enter to continue...')
                    continue
                if config:
                    choices.append(Choice(title=config.metadata.title, value='config:%s' % config_name))
                else:
                    # Nothing was loaded; pause so the user can read whatever
                    # was printed before the screen is cleared below.
                    input('Press enter to continue...')
        choices.sort(key=lambda c: c.title)
    else:
        choices.append(Choice("None", disabled="No configs found"))

    choices = [
        Choice(title="Download config(s)", value="download"),
        Choice(title="Generate new config", value="generate"),
        Separator()
    ] + choices

    click.clear()
    selected = questionary.select(
        "What do you want to do or which config do you want to run?",
        choices=choices
    ).ask()
    if not selected:
        return

    if selected == "download":
        remote_configs = client.list_fiction_configs(remote=True)
        if not remote_configs:
            # Same handling as the list-configs command: a falsy remote listing
            # means the repository contents could not be retrieved.
            echo("Could not get repository contents.")
        else:
            download_choices = [
                Choice("All (%s) configs" % len(remote_configs), value="all"),
                Separator()
            ] + [Choice(c, value="config:%s" % c) for c in remote_configs]
            configs_to_download = questionary.checkbox(
                "Which configs do you want to download?",
                choices=download_choices,
            ).ask()
            if not configs_to_download:
                return

            if "all" in configs_to_download:
                # prompt=False: there is no single config that a follow-up
                # "run now?" question could refer to.
                ctx.invoke(download_config, all=True, prompt=False)
            else:
                # Strip only the leading tag; str.replace would also remove
                # later occurrences inside the config name itself.
                names = [
                    c[len(prefix):] if c.startswith(prefix) else c
                    for c in configs_to_download
                ]
                with progressbar(names, label="Downloading configs") as bar:
                    for c in bar:
                        ctx.invoke(download_config, config_name=c, prompt=False)
    elif selected == "generate":
        answers = questionary.form(
            url=questionary.text("What's the URL of the web fiction or the first chapter?"),
            name=questionary.text("What do you want to name the config? (Optional, press Enter for default)")
        ).ask()
        if not answers:
            return
        # An empty answer means "use the default name"; pass None, which is
        # what the generate-config command receives when --name is omitted.
        ctx.invoke(generate_config, url=answers.get("url"), name=answers.get("name") or None)
    elif selected.startswith(prefix):
        config_name = selected[len(prefix):]
        tasks = questionary.checkbox(
            "Which tasks do you want to run?",
            choices=[
                Choice(title="Download chapters", value="download", checked=True),
                Choice(title="Clean download chapters", value="clean_download", checked=False),
                Choice(title="Convert chapters", value="convert", checked=True),
                Choice(title="Clean convert chapters", value="clean_convert", checked=False),
                Choice(title="Bind chapters into eBook", value="bind", checked=True),
                Choice(title="Create eBook formats specified in the config", value="format", checked=False)
            ]
        ).ask()
        if not tasks:
            return
        client.run(
            config_name,
            "download" in tasks,
            "clean_download" in tasks,
            "convert" in tasks,
            "clean_convert" in tasks,
            "bind" in tasks,
            "format" in tasks
        )

    if questionary.confirm("Do you want to return to the main menu?").ask():
        click.clear()
        # Re-enter the command so the config list is reloaded from scratch.
        ctx.invoke(interactive)
@cli.command()
@click.argument("config_name")
@click.option("--download/--no-download", default=True, help="Enable/disable chapter download")
@click.option("--clean-download", is_flag=True, help="Clear existing downloaded chapters")
@click.option("--convert/--no-convert", default=True, help="Enable/disable chapter conversion")
@click.option("--clean-convert", is_flag=True, help="Clear existing converted chapters")
@click.option("--bind/--no-bind", default=True, help="Enable/disable eBook creation")
@click.option("--ebook-convert/--no-ebook-convert", default=True, help="Create eBook formats specified in the config")
def run(config_name, download, clean_download, convert, clean_convert, bind, ebook_convert):
    """Run the scraper with the provided CONFIG_NAME.
    CONFIG_NAME can be a path to a YAML config file, the name of a built-in config or the name of a config inside
    the users configs/ directory. To list all automatically detected config files, use the list-configs command.
    """
    # The docstring above doubles as the command's --help text.
    # Stage switches, in the positional order client.run() is called with.
    stage_flags = (download, clean_download, convert, clean_convert, bind, ebook_convert)
    client.run(config_name, *stage_flags)
@cli.command()
@click.option("--remote", "-r", is_flag=True, help="List all configs in the remote repository")
def list_configs(remote: bool):
    """List all detected configs."""
    # Prints one config name per line, or a short notice when there is
    # nothing to show.
    available = client.list_fiction_configs(remote)

    # A falsy result for a remote listing is reported as a retrieval failure.
    if remote and not available:
        echo("Could not get repository contents.")
        return

    if len(available) == 0:
        echo("No configs available.")
        return

    echo("Available configs:")
    for name in available:
        echo(" %s" % name)
@cli.command()
@click.argument("config_name", required=False)
@click.option("--all", "-a", is_flag=True, help="Download all available remote configs")
@click.option("--overwrite", "-o", is_flag=True, help="Overwrite configs when they already exist")
def download_config(config_name: str, all: bool, overwrite: bool, prompt=True):
    """Download a config from the remote repository."""
    # NOTE: click prints the docstring above as --help text, so details are
    # kept in comments.
    #
    # config_name: name of the remote config to fetch (ignored with --all).
    # all:         fetch every config listed in the remote repository. The
    #              name shadows the builtin but is fixed by the --all option
    #              and by callers that invoke this command with all=True.
    # overwrite:   forwarded to the client for configs that already exist.
    # prompt:      not a CLI parameter; callers that invoke this command
    #              programmatically pass False to suppress the "run now?"
    #              question.
    if not config_name and not all:
        echo("Either the config_name argument or the --all option is required.")
        return

    if all:
        configs = client.list_fiction_configs(remote=True)
        if not configs:
            # Mirrors list-configs: a falsy remote listing means the
            # repository contents could not be retrieved. Bail out instead of
            # handing an unusable value to progressbar().
            echo("Could not get repository contents.")
            return

        downloaded = []
        with progressbar(configs, label="Downloading configs") as bar:
            for c in bar:
                if client.download_fiction_config(c, overwrite):
                    downloaded.append(c)
                else:
                    echo("Downloading config %s was not successful." % c)
        echo("Downloaded %s configs!" % len(downloaded))
        # No "run now?" question here: it only makes sense for one named
        # config, and config_name is normally empty in --all mode.
        return

    if not client.download_fiction_config(config_name, overwrite):
        echo("Download was not successful.")
        return

    echo("Config %s was downloaded!" % config_name)
    if not HEADLESS and prompt and questionary.confirm("Do you want to run the config now?", default=False).ask():
        # Download, convert and bind; no clean steps, no extra eBook formats.
        client.run(config_name, True, False, True, False, True, False)
@cli.command()
@click.argument("url")
@click.option("--name", "-n", type=str, help="Name of the config (file)")
def generate_config(url, name=None):
    """Generate a config file for a fiction from a supported site.

    URL can be either the fiction's overview page or a chapter (which will be
    used as the startUrl config entry).

    \b
    Currently supported sites:
    - Royal Road
    - FictionPress
    """
    # The docstring is the command's --help text. The line holding only the
    # "\b" escape tells click not to re-wrap the following paragraph, so the
    # bullet list keeps its line breaks.
    config_name = client.generate_fiction_config(url, name)
    if not config_name:
        # NOTE(review): assumes a falsy return value means no config was
        # generated - confirm against FictionScraperClient. With no config
        # name there is nothing that could be run.
        return

    if not HEADLESS and questionary.confirm("Do you want to run the config now?", default=False).ask():
        # Download, convert and bind; no clean steps, no extra eBook formats.
        client.run(config_name, True, False, True, False, True, False)
@cli.command()
def print_paths():
    """Print all paths used by the scraper."""
    # Pure pass-through; the shared client does the actual printing.
    client.print_paths()
@cli.command()
@click.option("--all-folders", is_flag=True, help="Select all config folders")
@click.option("--config", "-c", multiple=True, help="Select specific config folders")
@click.option("--orphan-folders", is_flag=True, help="Select config folders without a corresponding config file")
@click.option("--everything", is_flag=True, help="Delete the entire folder")
@click.option("--downloads", is_flag=True, help="Delete downloaded chapters AND converted chapters!")
@click.option("--converted", is_flag=True, help="Delete converted chapters")
@click.option("--books", is_flag=True, help="Delete books")
@click.option("--misc", is_flag=True, help="Delete files that do not match other filters, e.g. downloaded covers")
@click.option("--dry-run", is_flag=True, help="Do not perform any file system operations")
def clean_space(all_folders: bool, orphan_folders: bool, config: tuple, everything: bool, downloads: bool, converted: bool, books: bool, misc: bool, dry_run: bool):
    """Delete files in the data directory to clean up disk space.

    Specify which folders should be affected, either --all-folders,
    --config [NAME], or --orphan-folders.

    Specify what should be deleted from these selected folders with
    --everything, --downloads, --converted, --books, and/or --misc.

    If the folders are empty afterwards, they will be deleted as well.
    If nothing is selected, nothing will happen.
    """
    # The docstring is the command's --help text. It names the long options
    # because none of the content filters has a short form, and -c is the
    # short form of --config, not of --converted.
    # `config` arrives as a tuple because the option is declared with
    # multiple=True.
    client.clean_space(all_folders, orphan_folders, config, everything, downloads, converted, books, misc, dry_run)
# Script entry point: hand control to the click command group when the file
# is executed directly rather than imported.
if __name__ == "__main__":
    cli()