-
Notifications
You must be signed in to change notification settings - Fork 1
/
ao3-download.pl
181 lines (147 loc) · 6 KB
/
ao3-download.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/perl
# Keep it clean
use strict 'subs';
use warnings;
use threads; # NOTE We could probably have threading be
# fully disabled if $processes == 0
# All modules are threadsafe and in the standard distribution
use Getopt::Long;
use Pod::Usage;
use File::Fetch;
use URI::Escape;
use Thread::Queue;
use Thread::Semaphore;
# Setup options for modules
Getopt::Long::Configure (
'auto_abbrev', # Allows truncation of options
'gnu_compat' # Allows --opt=BLA syntax and --opt "BLA"
);
$File::Fetch::BLACKLIST = [qw|lwp|]; # Breaks the Archive for an unknown reason
$File::Fetch::WARN = 0; # We can handle the errors
# Initialize default values for configurable sections of the program
my $uid = undef; # Required (can't make base URL without it)
my $procs = 1; # Number of worker threads
my $format = "epub"; # Extension of downloaded works
my $directory = "."; # Directory to drop downloads in
my $section = "bookmarks"; # Section of user's profile to get
my $help = 0; # Flag to print help and quit out
my $retries = 30; # Number of times to retry downloads
GetOptions (
'uid=s' => \$uid,
'processes:+' => \$procs,
'format=s' => \$format,
'directory=s' => \$directory,
'section=s' => \$section,
'help' => \$help,
)
and (
$uid # $uid is mandatory
&& $procs > 0 # Can't have less than one download thread
&& $format =~ # $format must be
m{^(?:epub|pdf|html|mobi)$} # epub, pdf, html, or mobi
)
or die pod2usage( # Print documentation and quit if bad opts
-exitval => $help, # With return value 0 if $help was not set
-verbose => 2 # Print all the sections
);
# Threading initialization section
print "Starting download threads...";
my $mutex = Thread::Semaphore->new(); # When $mutex is up, then the thread has
STDOUT->autoflush(); # exclusive STDOUT control
my $queue = Thread::Queue->new(); # Queue feeds URLs to download to workers
threads->create(\&worker) # Create $procs download threads
for 1 .. $procs;
print "Done!\r\n";
# Scrape all pages of work links
print "Downloading list of works...";
my $sectionContents; # Total section contents aggregator
# Initial URI to scrape
my $uri = 'https://archiveofourown.org/users/'.$uid.'/'.$section;
do {
my $fetchy = ""; # Create blank variable to fetch page to
# Initialize fetcher with $uri as the target
my $fetcher = File::Fetch->new(uri => $uri);
my $where = $fetcher->fetch(to => \$fetchy);
my $err = $fetcher->error(0);
die <<ERROR
Error in retrieving $section for $uid from $uri: $err.
ERROR
if $err;
undef $fetcher; # Garbage collect the File::Fetch object
$sectionContents .= $fetchy;
# Check for next page to scrape and scrape it or quit
$fetchy =~ /rel="next" href="([^"]+)"/;
$uri = 'https://archiveofourown.org'.($1?$1:'');
} while ($uri && $uri ne 'https://archiveofourown.org');
print "Done!\r\n";
# Queue the work links found (feedback is from the threads)
# Split on newlines
my @lines = split /$/m, $sectionContents;
undef $sectionContents; # Garbage collect the string section
my $workCount = 0; # A counter for works queued for download
# TODO make this reflect works actually downloaded somehow
while (defined(my $line = shift @lines)) {
# If a line with a work link
if ($line =~ m{<a href="/works/[0-9]+"}) {
$workCount++;
# Ignore the next three lines
shift @lines; shift @lines; shift @lines;
# The fourth line is important though
$line .= shift @lines;
# Split the line into parts
my @parts = $line =~ m{works/(\d*)[^>]*>([^<]*).*/users/([^/]*?)/pseuds/([^"]*)}s;
$parts[5] = $parts[1];
$parts[5] =~ s/[^\w _-]+//g;
$parts[5] = "Work by " . $parts[2] if $parts[5] eq "";
$parts[5] =~ s/ +/ /g;
$parts[5] = uri_escape substr $parts[5], 0, 24;
# Queue the download for the current work URL
$queue->enqueue([$parts[1], join('/', 'https://archiveofourown.org/downloads', @parts[0,5]) . '.' . $format]);
}
}
undef @lines; # Garbage collect the lines
$queue->end;
$_->join() foreach threads->list;
print 'Fetched ', $workCount, " works.\r\n";
# Worker thread subroutine (download url to a file in a directory)
sub worker {
while (my $t = $queue->dequeue) {
$mutex->down();
print 'Fetching ', @$t[0], ' ';
my $fetcher;
my $tries = 0;
do {
$fetcher = File::Fetch->new(uri => @$t[1]);
$fetcher->fetch(to => $directory);
print "." if ($fetcher->error); # Show a retry bar
$tries++;
} while ($tries < $retries && $fetcher->error);
if ($tries == $retries) {
print 'Failed to fetch ', @$t[1], " :(\r\n";
} else {
print "... Done!\r\n";
}
$mutex->up();
}
}
__END__
=head1 NAME
AO3 Downloader
=head1 SYNOPSIS
ao3-download --uid UID [options]
=head1 OPTIONS
=over 12
=item B<--uid>
User ID on AO3. [required]
=item B<--processes>
Processes to run at once.
=item B<--format>
Format to download works in. Valid values are epub (default), mobi, pdf, and html.
=item B<--directory>
Where to download files (default current directory).
=item B<--section>
Section to download. Valid values are bookmarks (default), and works. (Collections and Serieses are not supported at this time).
=back
=head1 DESCRIPTION
B<This program> will download the works found for a section of a given user page.
=cut