forked from leopku/apachelog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
apachelog.py
executable file
·691 lines (598 loc) · 26.5 KB
/
apachelog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
#!/usr/bin/env python
"""Apache Log Parser
Parser for Apache log files. This is a port to python of Peter Hickman's
Apache::LogEntry Perl module:
<http://cpan.uwinnipeg.ca/~peterhi/Apache-LogRegex>
Takes the Apache logging format defined in your httpd.conf and generates
a regular expression which is used to parse a line from the log file and
return it as a dictionary with keys corresponding to the fields defined
in the log format.
Example:
import apachelog, sys
# Format copied and pasted from Apache conf - use raw string + single quotes
format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
p = apachelog.parser(format)
for line in open('/var/apache/access.log'):
try:
data = p.parse(line)
except:
sys.stderr.write("Unable to parse %s" % line)
The return dictionary from the parse method depends on the input format.
For the above example, the returned dictionary would look like;
{
'%>s': '200',
'%b': '2607',
'%h': '212.74.15.68',
'%l': '-',
'%r': 'GET /images/previous.png HTTP/1.1',
'%t': '[23/Jan/2004:11:36:20 +0000]',
'%u': '-',
'%{Referer}i': 'http://peterhi.dyndns.org/bandwidth/index.html',
'%{User-Agent}i': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202'
}
...given an access log entry like (split across lines for formatting);
212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] "GET /images/previous.png HTTP/1.1"
200 2607 "http://peterhi.dyndns.org/bandwidth/index.html"
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202"
You can also re-map the field names by subclassing (or re-pointing) the
alias method.
Generally you should be able to copy and paste the format string from
your Apache configuration, but remember to place it in a raw string
using single-quotes, so that backslashes are handled correctly.
This module provides three of the most common log formats in the
formats dictionary;
# Common Log Format (CLF)
p = apachelog.parser(apachelog.formats['common'])
# Common Log Format with Virtual Host
p = apachelog.parser(apachelog.formats['vhcommon'])
# NCSA extended/combined log format
p = apachelog.parser(apachelog.formats['extended'])
For notes regarding performance while reading lines from a file
in Python, see <http://effbot.org/zone/readline-performance.htm>.
Further performance boost can be gained by using psyco
<http://psyco.sourceforge.net/>
On my system, using a loop like;
for line in open('access.log'):
p.parse(line)
...was able to parse ~60,000 lines / second. Adding psyco to the mix,
ups that to ~75,000 lines / second.
The parse_date function is intended as a fast way to convert a log
date into something useful, without incurring a significant date
parsing overhead - good enough for basic stuff but will be a problem
if you need to deal with logs from multiple servers in different
timezones.
"""
__version__ = "1.1"
__license__ = """Released under the same terms as Perl.
See: http://dev.perl.org/licenses/
"""
__author__ = "Harry Fuecks <[email protected]>"
__contributors__ = [
"Peter Hickman <[email protected]>",
"Loic Dachary <[email protected]>"
]
import re
class ApacheLogParserError(Exception):
    """
    Error raised by this module when a log format cannot be compiled
    into a regular expression or a log line does not match it.
    """
class _MissingFieldError(AttributeError, KeyError):
    """
    Raised by AttrDict when an attribute has no matching key.

    It is an AttributeError so that hasattr(), getattr() with a default
    and other attribute-protocol users behave normally, and a KeyError
    so that callers which already catch KeyError keep working.
    """


class AttrDict(dict):
    """
    Allows dicts to be accessed via dot notation as well as subscripts
    Makes using the friendly names nicer
    """
    def __getattr__(self, name):
        """
        Returns self[name] for attribute lookups that were not
        resolved the normal way.

        Raises an error that is both an AttributeError and a KeyError
        if there is no such key.
        """
        try:
            return self[name]
        except KeyError:
            # A bare KeyError escaping from __getattr__ breaks hasattr()
            # and getattr(obj, name, default), which only expect
            # AttributeError.
            raise _MissingFieldError(name)
class parser:
    """
    Turns an Apache LogFormat string into a regular expression and uses
    it to split log lines into named fields.

    The format is split on spaces; every element becomes one capturing
    group and one field name. Field names are either the raw format
    elements (e.g. '%h') or, when use_friendly_names is set, whatever
    the alias method returns for them.
    """

    # Friendly field name for each supported format element. Elements of
    # the form %{Foobar}X are looked up under the generic key '%{}X'.
    format_to_name = {
        # Explanatory comments copied from
        # http://httpd.apache.org/docs/2.2/mod/mod_log_config.html
        # Remote IP-address
        '%a':'remote_ip',
        # Local IP-address
        '%A':'local_ip',
        # Size of response in bytes, excluding HTTP headers.
        '%B':'response_bytes',
        # Size of response in bytes, excluding HTTP headers. In CLF
        # format, i.e. a "-" rather than a 0 when no bytes are sent.
        '%b':'response_bytes_clf',
        # The contents of cookie Foobar in the request sent to the server.
        # Only version 0 cookies are fully supported.
        #'%{Foobar}C':'',
        '%{}C':'cookie',
        # The time taken to serve the request, in microseconds.
        '%D':'response_time_us',
        # The contents of the environment variable FOOBAR
        #'%{FOOBAR}e':'',
        '%{}e':'env',
        # Filename
        '%f':'filename',
        # Remote host
        '%h':'remote_host',
        # The request protocol
        '%H':'request_protocol',
        # The contents of Foobar: header line(s) in the request sent to
        # the server. Changes made by other modules (e.g. mod_headers)
        # affect this.
        #'%{Foobar}i':'',
        '%{}i':'header',
        # Number of keepalive requests handled on this connection.
        # Interesting if KeepAlive is being used, so that, for example,
        # a "1" means the first keepalive request after the initial one,
        # "2" the second, etc...; otherwise this is always 0 (indicating
        # the initial request). Available in versions 2.2.11 and later.
        '%k':'keepalive_num',
        # Remote logname (from identd, if supplied). This will return a
        # dash unless mod_ident is present and IdentityCheck is set On.
        '%l':'remote_logname',
        # The request method
        '%m':'request_method',
        # The contents of note Foobar from another module.
        #'%{Foobar}n':'',
        '%{}n':'note',
        # The contents of Foobar: header line(s) in the reply.
        #'%{Foobar}o':'',
        '%{}o':'reply_header',
        # The canonical port of the server serving the request
        '%p':'server_port',
        # The canonical port of the server serving the request or the
        # server's actual port or the client's actual port. Valid
        # formats are canonical, local, or remote.
        #'%{format}p':"",
        '%{}p':'port',
        # The process ID of the child that serviced the request.
        '%P':'process_id',
        # The process ID or thread id of the child that serviced the
        # request. Valid formats are pid, tid, and hextid. hextid requires
        # APR 1.2.0 or higher.
        #'%{format}P':'',
        '%{}P':'pid',
        # The query string (prepended with a ? if a query string exists,
        # otherwise an empty string)
        '%q':'query_string',
        # First line of request
        # e.g., what you'd see in the logs as 'GET / HTTP/1.1'
        '%r':'first_line',
        # The handler generating the response (if any).
        '%R':'response_handler',
        # Status. For requests that got internally redirected, this is
        # the status of the *original* request --- %>s for the last.
        '%s':'status',
        '%>s':'last_status',
        # Time the request was received (standard english format)
        '%t':'time',
        # The time, in the form given by format, which should be in
        # strftime(3) format. (potentially localized)
        #'%{format}t':'TODO',
        # The time taken to serve the request, in seconds.
        '%T':'response_time_sec',
        # Remote user (from auth; may be bogus if return status (%s) is 401)
        '%u':'remote_user',
        # The URL path requested, not including any query string.
        '%U':'url_path',
        # The canonical ServerName of the server serving the request.
        '%v':'canonical_server_name',
        # The server name according to the UseCanonicalName setting.
        '%V':'server_name_config', #TODO: Needs better name
        # Connection status when response is completed:
        # X = connection aborted before the response completed.
        # + = connection may be kept alive after the response is sent.
        # - = connection will be closed after the response is sent.
        '%X':'completed_connection_status',
        # Bytes received, including request and headers, cannot be zero.
        # You need to enable mod_logio to use this.
        '%I':'bytes_received',
        # Bytes sent, including headers, cannot be zero. You need to
        # enable mod_logio to use this
        '%O':'bytes_sent',
    }

    def __init__(self, format, use_friendly_names=False):
        """
        Takes the log format from an Apache configuration file.

        Best just copy and paste directly from the .conf file
        and pass using a Python raw string e.g.

        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
        p = apachelog.parser(format)

        If use_friendly_names is true, field names are passed through
        the alias method instead of being the raw format elements.

        Raises ApacheLogParserError if the generated regular expression
        does not compile.
        """
        self._names = []
        self._regex = None
        self._pattern = ''
        self._use_friendly_names = use_friendly_names
        self._parse_format(format)

    def _parse_format(self, format):
        """
        Converts the input format to a regular
        expression, as well as extracting fields

        Fills in self._names, self._pattern and self._regex.

        Raises an exception if it couldn't compile
        the generated regex.
        """
        format = format.strip()
        # Runs of spaces/tabs collapse to one space, which is also the
        # separator expected between fields in a log line.
        format = re.sub(r'[ \t]+', ' ', format)

        subpatterns = []

        # A quoted element is written \"...\" in the Apache format
        leadingquote = re.compile(r'^\\"')
        trailingquote = re.compile(r'\\"$')
        findreferreragent = re.compile('Referer|User-Agent', re.I)
        findpercent = re.compile('^%.*t$')

        self._names = []

        for element in format.split(' '):

            hasquotes = bool(leadingquote.search(element))

            if hasquotes:
                element = leadingquote.sub('', element)
                element = trailingquote.sub('', element)

            if self._use_friendly_names:
                self._names.append(self.alias(element))
            else:
                self._names.append(element)

            # Default: a run of non-whitespace characters
            subpattern = r'(\S*)'

            if hasquotes:
                if element == '%r' or findreferreragent.search(element):
                    # These values may contain backslash-escaped quotes
                    subpattern = r'\"([^"\\]*(?:\\.[^"\\]*)*)\"'
                else:
                    subpattern = r'\"([^\"]*)\"'

            elif findpercent.search(element):
                # Time fields are logged inside square brackets
                subpattern = r'(\[[^\]]+\])'

            elif element == '%U':
                subpattern = '(.+?)'

            subpatterns.append(subpattern)

        self._pattern = '^' + ' '.join(subpatterns) + '$'
        try:
            self._regex = re.compile(self._pattern)
        except Exception as e:
            raise ApacheLogParserError(e)

    def parse(self, line):
        """
        Parses a single line from the log file and returns
        a dictionary of its contents, keyed by the names
        returned from the names method.

        Leading and trailing whitespace is stripped from the
        line before matching.

        Raises an exception if it couldn't parse the line
        """
        line = line.strip()
        match = self._regex.match(line)

        if match:
            return AttrDict(zip(self._names, match.groups()))

        raise ApacheLogParserError("Unable to parse: %s with the %s regular expression" % ( line, self._pattern ) )

    def alias(self, name):
        """
        Override / replace this method if you want to map format
        field names to something else. This method is called
        when the parser is constructed, not when actually parsing
        a log file

        For custom format names, such as %{Foobar}C, 'Foobar' is referred to
        (in this function) as the custom_format and '%{}C' as the name

        If the custom_format has a '-' in it (and is not a time format), then the
        '-' is replaced with a '_' so the name remains a valid identifier.

        A name with no entry in format_to_name is returned unchanged.

        Takes and returns a string fieldname
        """
        if not name.startswith('%{'):
            return self.format_to_name.get(name, name)

        custom_format = '_' + name[2:-2]
        generic_name = '%{}' + name[-1]

        if generic_name != '%{}t':
            custom_format = custom_format.replace('-', '_')

        try:
            return self.format_to_name[generic_name] + custom_format
        except KeyError:
            # Hand back the caller's own name, not the generic lookup
            # key; otherwise every unmapped %{...}X field would share
            # one name and overwrite each other in the parsed result.
            return name

    def pattern(self):
        """
        Returns the compound regular expression the parser extracted
        from the input format (a string)
        """
        return self._pattern

    def names(self):
        """
        Returns the field names the parser extracted from the
        input format (a list)
        """
        return self._names
# Three-letter English month abbreviation -> zero-padded month number
months = dict(
    (abbreviation, '%02d' % number)
    for number, abbreviation in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        1
    )
)


def parse_date(date):
    """
    Splits a log date such as [05/Dec/2006:10:51:44 +0000] (square
    brackets included) into a two element tuple: a timestamp string
    of the form YYYYMMDDHH24IISS and the timezone offset exactly as
    it appeared, e.g.;

    parse_date('[05/Dec/2006:10:51:44 +0000]')
    >> ('20061205105144', '+0000')

    The fields are picked out by fixed character position and the
    timestamp is not adjusted for the timezone - that is left to
    the caller.
    """
    # Drop the first and last character (the square brackets)
    inner = date[1:-1]

    day = inner[0:2]
    month = months[inner[3:6]]
    year = inner[7:11]
    hour = inner[12:14]
    minute = inner[15:17]
    second = inner[18:20]
    offset = inner[21:]

    timestamp = year + month + day + hour + minute + second
    return (timestamp, offset)
"""
Frequently used log formats stored here
"""
# Ready-made format strings, keyed by a short name, that can be passed
# straight to parser()
formats = dict(
    # Common Log Format (CLF)
    common=r'%h %l %u %t \"%r\" %>s %b',

    # Common Log Format with Virtual Host
    vhcommon=r'%v %h %l %u %t \"%r\" %>s %b',

    # NCSA extended/combined log format
    extended=r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"',
)
if __name__ == '__main__':

    import unittest

    class _ParserFixtureMixin(object):
        """
        Fixtures and checks shared by both parser test cases.

        Deliberately not a TestCase itself, so unittest does not try to
        run it on its own. Concrete test cases set the two class
        attributes below.
        """

        # Value expected from parser.names() for self.format
        fields = None
        # Passed to parser() as use_friendly_names
        use_friendly_names = False

        def setUp(self):
            self.format = r'%h %l %u %t \"%r\" %>s '\
                          r'%b \"%{Referer}i\" \"%{User-Agent}i\"'
            self.pattern = '^(\\S*) (\\S*) (\\S*) (\\[[^\\]]+\\]) '\
                           '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
                           '(\\S*) (\\S*) \\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\" '\
                           '\\\"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)\\\"$'
            # A plain entry
            self.line1 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
                         r'"GET /images/previous.png HTTP/1.1" 200 2607 '\
                         r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
                         r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
                         r'Gecko/20021202"'
            # An escaped quote inside the request line
            self.line2 = r'212.74.15.68 - - [23/Jan/2004:11:36:20 +0000] '\
                         r'"GET /images/previous.png=\" HTTP/1.1" 200 2607 '\
                         r'"http://peterhi.dyndns.org/bandwidth/index.html" '\
                         r'"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) '\
                         r'Gecko/20021202"'
            # Escaped quotes inside the referer, and a user agent that is
            # itself wrapped in escaped quotes (which must survive parsing)
            self.line3 = r'4.224.234.46 - - [20/Jul/2004:13:18:55 -0700] '\
                         r'"GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked'\
                         r'_boats=1176818&slim=broker&&hosturl=giffordmarine&&ywo='\
                         r'giffordmarine& HTTP/1.1" 200 2888 "http://search.yahoo.com/'\
                         r'bin/search?p=\"grady%20white%20306%20bimini\"" '\
                         r'"\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; '\
                         r'YPC 3.0.3; yplus 4.0.00d)\""'
            self.p = parser(self.format, self.use_friendly_names)

        def testpattern(self):
            self.assertEqual(self.pattern, self.p.pattern())

        def testnames(self):
            self.assertEqual(self.fields, self.p.names())

        def testjunkline(self):
            self.assertRaises(ApacheLogParserError,self.p.parse,'foobar')

        def testhasquotesaltn(self):
            p = parser(r'%a \"%b\" %c')
            line = r'foo "xyz" bar'
            data = p.parse(line)
            self.assertEqual(data['%a'],'foo', '%a')
            self.assertEqual(data['%b'],'xyz', '%b')
            self.assertEqual(data['%c'],'bar', '%c')

        def testparsedate(self):
            date = '[05/Dec/2006:10:51:44 +0000]'
            self.assertEqual(('20061205105144','+0000'),parse_date(date))

    class TestApacheLogParser(_ParserFixtureMixin, unittest.TestCase):
        """Fields are keyed by the raw Apache format elements."""

        fields = ('%h %l %u %t %r %>s %b %{Referer}i '
                  '%{User-Agent}i').split(' ')

        def testline1(self):
            data = self.p.parse(self.line1)
            self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 1 %h')
            self.assertEqual(data['%l'], '-', msg = 'Line 1 %l')
            self.assertEqual(data['%u'], '-', msg = 'Line 1 %u')
            self.assertEqual(data['%t'], '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 %t')
            self.assertEqual(
                data['%r'],
                'GET /images/previous.png HTTP/1.1',
                msg = 'Line 1 %r'
                )
            self.assertEqual(data['%>s'], '200', msg = 'Line 1 %>s')
            self.assertEqual(data['%b'], '2607', msg = 'Line 1 %b')
            self.assertEqual(
                data['%{Referer}i'],
                'http://peterhi.dyndns.org/bandwidth/index.html',
                msg = 'Line 1 %{Referer}i'
                )
            self.assertEqual(
                data['%{User-Agent}i'],
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
                msg = 'Line 1 %{User-Agent}i'
                )

        def testline2(self):
            data = self.p.parse(self.line2)
            self.assertEqual(data['%h'], '212.74.15.68', msg = 'Line 2 %h')
            self.assertEqual(data['%l'], '-', msg = 'Line 2 %l')
            self.assertEqual(data['%u'], '-', msg = 'Line 2 %u')
            self.assertEqual(
                data['%t'],
                '[23/Jan/2004:11:36:20 +0000]',
                msg = 'Line 2 %t'
                )
            self.assertEqual(
                data['%r'],
                r'GET /images/previous.png=\" HTTP/1.1',
                msg = 'Line 2 %r'
                )
            self.assertEqual(data['%>s'], '200', msg = 'Line 2 %>s')
            self.assertEqual(data['%b'], '2607', msg = 'Line 2 %b')
            self.assertEqual(
                data['%{Referer}i'],
                'http://peterhi.dyndns.org/bandwidth/index.html',
                msg = 'Line 2 %{Referer}i'
                )
            self.assertEqual(
                data['%{User-Agent}i'],
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
                msg = 'Line 2 %{User-Agent}i'
                )

        def testline3(self):
            data = self.p.parse(self.line3)
            self.assertEqual(data['%h'], '4.224.234.46', msg = 'Line 3 %h')
            self.assertEqual(data['%l'], '-', msg = 'Line 3 %l')
            self.assertEqual(data['%u'], '-', msg = 'Line 3 %u')
            self.assertEqual(
                data['%t'],
                '[20/Jul/2004:13:18:55 -0700]',
                msg = 'Line 3 %t'
                )
            self.assertEqual(
                data['%r'],
                r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
                r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
                r'HTTP/1.1',
                msg = 'Line 3 %r'
                )
            self.assertEqual(data['%>s'], '200', msg = 'Line 3 %>s')
            self.assertEqual(data['%b'], '2888', msg = 'Line 3 %b')
            self.assertEqual(
                data['%{Referer}i'],
                r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
                r'%20bimini\"',
                msg = 'Line 3 %{Referer}i'
                )
            self.assertEqual(
                data['%{User-Agent}i'],
                '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
                'yplus 4.0.00d)\\"',
                msg = 'Line 3 %{User-Agent}i'
                )

    class TestApacheLogParserFriendlyNames(_ParserFixtureMixin, unittest.TestCase):
        """Fields are keyed by friendly names and read via dot notation."""

        use_friendly_names = True
        fields = ('remote_host remote_logname remote_user time '
                  'first_line last_status response_bytes_clf '
                  'header_Referer header_User_Agent').split(' ')

        def testline1(self):
            data = self.p.parse(self.line1)
            self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 1 remote_host')
            self.assertEqual(data.remote_logname, '-', msg = 'Line 1 remote_logname')
            self.assertEqual(data.remote_user, '-', msg = 'Line 1 remote_user')
            self.assertEqual(data.time, '[23/Jan/2004:11:36:20 +0000]', msg = 'Line 1 time')
            self.assertEqual(
                data.first_line,
                'GET /images/previous.png HTTP/1.1',
                msg = 'Line 1 first_line'
                )
            self.assertEqual(data.last_status, '200', msg = 'Line 1 last_status')
            self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 1 response_bytes_clf')
            self.assertEqual(
                data.header_Referer,
                'http://peterhi.dyndns.org/bandwidth/index.html',
                msg = 'Line 1 %{Referer}i'
                )
            self.assertEqual(
                data.header_User_Agent,
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
                msg = 'Line 1 %{User-Agent}i'
                )

        def testline2(self):
            data = self.p.parse(self.line2)
            self.assertEqual(data.remote_host, '212.74.15.68', msg = 'Line 2 remote_host')
            self.assertEqual(data.remote_logname, '-', msg = 'Line 2 remote_logname')
            self.assertEqual(data.remote_user, '-', msg = 'Line 2 remote_user')
            self.assertEqual(
                data.time,
                '[23/Jan/2004:11:36:20 +0000]',
                msg = 'Line 2 time'
                )
            self.assertEqual(
                data.first_line,
                r'GET /images/previous.png=\" HTTP/1.1',
                msg = 'Line 2 first_line'
                )
            self.assertEqual(data.last_status, '200', msg = 'Line 2 last_status')
            self.assertEqual(data.response_bytes_clf, '2607', msg = 'Line 2 response_bytes_clf')
            self.assertEqual(
                data.header_Referer,
                'http://peterhi.dyndns.org/bandwidth/index.html',
                msg = 'Line 2 %{Referer}i'
                )
            self.assertEqual(
                data.header_User_Agent,
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2) Gecko/20021202',
                msg = 'Line 2 %{User-Agent}i'
                )

        def testline3(self):
            data = self.p.parse(self.line3)
            self.assertEqual(data.remote_host, '4.224.234.46', msg = 'Line 3 remote_host')
            self.assertEqual(data.remote_logname, '-', msg = 'Line 3 remote_logname')
            self.assertEqual(data.remote_user, '-', msg = 'Line 3 remote_user')
            self.assertEqual(
                data.time,
                '[20/Jul/2004:13:18:55 -0700]',
                msg = 'Line 3 time'
                )
            self.assertEqual(
                data.first_line,
                r'GET /core/listing/pl_boat_detail.jsp?&units=Feet&checked_boats='\
                r'1176818&slim=broker&&hosturl=giffordmarine&&ywo=giffordmarine& '\
                r'HTTP/1.1',
                msg = 'Line 3 first_line'
                )
            self.assertEqual(data.last_status, '200', msg = 'Line 3 last_status')
            self.assertEqual(data.response_bytes_clf, '2888', msg = 'Line 3 response_bytes_clf')
            self.assertEqual(
                data.header_Referer,
                r'http://search.yahoo.com/bin/search?p=\"grady%20white%20306'\
                r'%20bimini\"',
                msg = 'Line 3 %{Referer}i'
                )
            self.assertEqual(
                data.header_User_Agent,
                '\\"Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; YPC 3.0.3; '\
                'yplus 4.0.00d)\\"',
                msg = 'Line 3 %{User-Agent}i'
                )

    unittest.main()