#!/usr/bin/env python
""" This script requires a modified version of Hanzo Archives' warc-tools:
http://code.hanzoarchives.com/warc-tools/src/tip/hanzo/warctools
This script is loosely based on warcindex.py:
http://code.hanzoarchives.com/warc-tools/src/1897e2bc9d29/warcindex.py
The functions that start with "get_" (as opposed to "parse_") are called by the
dispatch loop in make_cdx using getattr().
"""
from warctools import ArchiveRecord #from https://bitbucket.org/rajbot/warc-tools
from surt import surt #from https://github.com/rajbot/surt
import re
import sys
import base64
import hashlib
import urllib
import urlparse
import lxml.html
from urlparse import urlsplit
from datetime import datetime
from optparse import OptionParser

class CDX_Writer(object):

    #___________________________________________________________________________
    def __init__(self, file, format):

        self.field_map = {'M': 'AIF meta tags',
                          'N': 'massaged url',
                          'S': 'compressed record size',
                          'V': 'compressed arc file offset',
                          'a': 'original url',
                          'b': 'date',
                          'g': 'file name',
                          'k': 'new style checksum',
                          'm': 'mime type',
                          'r': 'redirect',
                          's': 'response code',
                         }

        self.file   = file
        self.format = format
        self.crlf_pattern = re.compile('\r?\n\r?\n')

        #this is what wayback uses:
        self.fake_build_version = "archive-commons.0.0.1-SNAPSHOT-20120112102659"

        #these fields are set for each record in the warc
        self.offset        = 0
        self.mime_type     = None
        self.headers       = None
        self.content       = None
        self.meta_tags     = None
        self.response_code = None

    # parse_http_header()
    #___________________________________________________________________________
    def parse_http_header(self, header_name):
        if self.headers is None:
            return None

        pattern = re.compile(header_name + r':\s*(.+)', re.I)
        for line in iter(self.headers):
            m = pattern.match(line)
            if m:
                return m.group(1)
        return None

    # parse_http_content_type_header()
    #___________________________________________________________________________
    def parse_http_content_type_header(self, record):
        content_type = self.parse_http_header('content-type')
        if content_type is None:
            return 'unk'

        m = re.match('(.+?);', content_type)
        if m:
            return m.group(1)
        else:
            return content_type

    # parse_meta_tags
    #___________________________________________________________________________
    def parse_meta_tags(self, record):
        """We want to parse meta tags in <head>, even if they are not direct
        children of <head>, e.g. <head><noscript><meta .../></noscript></head>

        What should we do about multiple meta tags with the same name?
        Currently, we join the content attributes together with a comma separator.

        We use either the 'name' or 'http-equiv' attribute as the meta_tags dict key.
        """
        if not ('response' == record.type and 'text/html' == self.mime_type):
            return None

        meta_tags = {}

        #lxml.html can't parse blank documents
        html_str = self.content.strip()
        if '' == html_str:
            return meta_tags

        ###TODO: is there a faster way than actually parsing the html?
        ###maybe use a regex, or maybe just parse the <head>.
        try:
            html = lxml.html.document_fromstring(html_str)
        except lxml.etree.ParserError:
            return meta_tags

        try:
            head = html.head
        except IndexError:
            #this might have been an xml response
            return meta_tags

        metas = head.xpath("//meta")

        for meta in metas:
            name = meta.get('name')
            if name is None:
                name = meta.get('http-equiv')

            if name is not None:
                name    = name.lower()
                content = meta.get('content')
                if content is not None:
                    if name not in meta_tags:
                        meta_tags[name] = content
                    else:
                        meta_tags[name] += ',' + content

        return meta_tags
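
    # For example, a document containing:
    #   <meta name="robots" content="noindex">
    #   <meta name="robots" content="nofollow">
    # would, per the joining rule above, yield meta_tags['robots'] == 'noindex,nofollow'.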

    # get_AIF_meta_tags() //field "M"
    #___________________________________________________________________________
    def get_AIF_meta_tags(self, record):
        """Robot meta tags, if present, should be in this order: A, F, I.
        """
        if not self.meta_tags:
            return '-'
        if 'robots' not in self.meta_tags:
            return '-'

        robot_tags = self.meta_tags['robots'].split(',')
        robot_tags = [x.strip().lower() for x in robot_tags]
        s = ''
        if 'noarchive' in robot_tags:
            s += 'A'
        if 'nofollow' in robot_tags:
            s += 'F'
        if 'noindex' in robot_tags:
            s += 'I'

        if s:
            return s
        else:
            return '-'
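
    # Example: meta_tags == {'robots': 'noarchive, nofollow'} yields 'AF';
    # a page with no robots meta tag yields '-'.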

    # get_massaged_url() //field "N"
    #___________________________________________________________________________
    def get_massaged_url(self, record):
        if 'warcinfo' == record.type:
            return self.get_original_url(record)
        else:
            return surt(record.url)
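
    # surt() produces a Sort-friendly URI Reordering Transform key; e.g.
    # http://www.example.com/path?q=1 becomes something like
    # com,example)/path?q=1 (exact output depends on the surt library version).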

    # get_compressed_record_size() //field "S"
    #___________________________________________________________________________
    def get_compressed_record_size(self, record):
        return str(record.compressed_record_size)

    # get_compressed_arc_file_offset() //field "V"
    #___________________________________________________________________________
    def get_compressed_arc_file_offset(self, record):
        return str(self.offset)

    # get_original_url() //field "a"
    #___________________________________________________________________________
    def get_original_url(self, record):
        if 'warcinfo' == record.type:
            url = 'warcinfo:/%s/%s' % (self.file, self.fake_build_version)
            return url

        return record.url

    # get_date() //field "b"
    #___________________________________________________________________________
    def get_date(self, record):
        date = datetime.strptime(record.date, "%Y-%m-%dT%H:%M:%SZ")
        return date.strftime("%Y%m%d%H%M%S")

    # get_file_name() //field "g"
    #___________________________________________________________________________
    def get_file_name(self, record):
        return self.file

    # get_new_style_checksum() //field "k"
    #___________________________________________________________________________
    def get_new_style_checksum(self, record):
        """Return a base32-encoded sha1.
        For revisit records, return the original sha1.
        """
        if 'revisit' == record.type:
            digest = record.get_header('WARC-Payload-Digest')
            return digest.replace('sha1:', '')
        elif 'response' == record.type and 'application/http; msgtype=response' == record.content_type:
            # Where does this WARC-Payload-Digest header come from?
            # It does not match sha1(record.content[1]), which might
            # have something to do with the different content-type headers
            # in the warc header and the actual http response
            digest = record.get_header('WARC-Payload-Digest')
            return digest.replace('sha1:', '')
        else:
            h = hashlib.sha1(record.content[1])
            return base64.b32encode(h.digest())
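
    # A WARC-Payload-Digest header typically looks like
    #   WARC-Payload-Digest: sha1:FKXGYNOJJ7H3IFO35FPUBC445EPOQRXN
    # so stripping the 'sha1:' prefix leaves the same base32 form that
    # base64.b32encode(hashlib.sha1(...).digest()) produces.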

    # get_mime_type() //field "m"
    #___________________________________________________________________________
    def get_mime_type(self, record, use_precalculated_value=True):
        """See the WARC spec for more info on 'application/http; msgtype=response':
        http://archive-access.sourceforge.net/warc/warc_file_format-0.16.html#anchor7
        """
        if use_precalculated_value:
            return self.mime_type

        if 'response' == record.type and 'application/http; msgtype=response' == record.content_type:
            return self.parse_http_content_type_header(record)
        elif 'response' == record.type:
            return record.content_type
        elif 'warcinfo' == record.type:
            return 'warc-info' #why special case this?
        else:
            return 'warc/' + record.type

    # urljoin_with_fragments()
    #___________________________________________________________________________
    def urljoin_with_fragments(self, base, url):
        """urlparse.urljoin removes blank fragments (trailing #),
        even if allow_fragments is set to True, so do this manually.
        """
        if url.lower().startswith('http'):
            return url
        else:
            if not url.startswith('/'):
                url = '/' + url
            s = urlsplit(base)
            return s.scheme + '://' + s.netloc + url
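
    # For example (hypothetical values):
    #   urljoin_with_fragments('http://example.com/a/b', 'c#')  -> 'http://example.com/c#'
    #   urljoin_with_fragments('http://example.com/a/b', '/c#') -> 'http://example.com/c#'
    # Note this is cruder than urlparse.urljoin: relative paths are resolved
    # against the host root, not against the base path.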

    # get_redirect() //field "r"
    #___________________________________________________________________________
    def get_redirect(self, record):
        response_code = self.response_code

        ## it turns out that the refresh tag is being used in both
        ## 2xx and 3xx responses.
        #only deal with 2xx and 3xx responses:
        #if 3 != len(response_code):
        #    return '-'

        #if response_code.startswith('3'):
        location = self.parse_http_header('location')
        if location:
            return self.urljoin_with_fragments(record.url, location)

        #elif response_code.startswith('2'):
        if self.meta_tags and 'refresh' in self.meta_tags:
            redir_loc = self.meta_tags['refresh']
            m = re.search(r'\d+;\s*url=(.+)', redir_loc, re.I) #url might be capitalized
            if m:
                return self.urljoin_with_fragments(record.url, m.group(1))

        return '-'
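
    # Example of a meta refresh this catches (illustrative markup):
    #   <meta http-equiv="refresh" content="5; url=http://example.com/next">
    # parse_meta_tags stores it under the 'refresh' key, and the regex above
    # extracts http://example.com/next as the redirect target.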

    # get_response_code() //field "s"
    #___________________________________________________________________________
    def get_response_code(self, record, use_precalculated_value=True):
        if use_precalculated_value:
            return self.response_code

        if 'response' != record.type:
            return '-'

        m = re.match(r'HTTP/\d\.\d (\d+)', record.content[1])
        if m:
            return m.group(1)
        else:
            return '-'

    # parse_headers_and_content()
    #___________________________________________________________________________
    def parse_headers_and_content(self, record):
        """Returns a list of header lines, split with splitlines(), and the content.
        We call splitlines() here so we only split once, and so \r\n and \n are
        split in the same way.
        """
        if 'response' == record.type and record.content[1].startswith('HTTP'):
            headers, content = self.crlf_pattern.split(record.content[1], 1)
            headers = headers.splitlines()
        else:
            headers = None
            content = None

        return headers, content

    # make_cdx()
    #___________________________________________________________________________
    def make_cdx(self):
        print ' CDX ' + self.format #print header

        fh = ArchiveRecord.open_archive(self.file, gzip="auto", mode="r")
        for (offset, record, errors) in fh.read_records(limit=None, offsets=True):
            self.offset = offset

            if record:
                ### precalculated data that is used multiple times
                self.headers, self.content = self.parse_headers_and_content(record)
                self.mime_type     = self.get_mime_type(record, use_precalculated_value=False)
                self.response_code = self.get_response_code(record, use_precalculated_value=False)
                self.meta_tags     = self.parse_meta_tags(record)

                s = ''
                for field in self.format.split():
                    if field not in self.field_map:
                        sys.exit('Unknown field: ' + field)

                    endpoint = self.field_map[field].replace(' ', '_')
                    response = getattr(self, 'get_' + endpoint)(record)
                    s += response + ' '
                print s.rstrip()
                #record.dump()
            elif errors:
                pass # ignore
            else:
                pass # tail

        fh.close()

# main()
#_______________________________________________________________________________
if __name__ == '__main__':

    parser = OptionParser(usage="%prog [options] warc")
    parser.add_option("-f", "--format", dest="format")
    parser.set_defaults(format="N b a m s k r M S V g")

    (options, input_files) = parser.parse_args(args=sys.argv[1:])

    if not 1 == len(input_files):
        parser.print_help()
        exit(-1)

    cdx_writer = CDX_Writer(input_files[0], options.format)
    cdx_writer.make_cdx()
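
# Example invocation (assuming a local file named crawl.warc.gz):
#
#   ./cdx_writer.py crawl.warc.gz > crawl.cdx
#
# or with an explicit subset of fields:
#
#   ./cdx_writer.py --format "N b a m s" crawl.warc.gz > crawl.cdx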