JK.py
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
import sys, os
import re
from datetime import datetime, date
from rdflib import Namespace, URIRef, Literal, RDF
from rdflib.Graph import Graph
from DocumentRepository import DocumentRepository
from DataObjects import UnicodeStructure, CompoundStructure, \
     MapStructure, IntStructure, DateStructure, PredicateType, \
     UnicodeSubject, Stycke, Sektion, \
     serialize
import Util
import LegalURI
from LegalRef import LegalRef, Link
__version__ = (1,6)
__author__ = u"Staffan Malmgren <[email protected]>"
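
# JK harvests and parses decisions ("beslut") from www.jk.se, the site of the
# Swedish Chancellor of Justice (Justitiekanslern). The class only overrides
# the crawling and parsing steps; storage, HTML generation and the rest of the
# pipeline are assumed to be handled by the DocumentRepository base class.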
class JK(DocumentRepository):
    module_dir = "jk"
    start_url = "http://www.jk.se/beslut/default.asp"
    document_url = "http://www.jk.se/beslut/XmlToHtml.asp?XML=Files/%s.xml&XSL=../xsl/JK_Beslut.xsl"

    def download_everything(self, cache=False):
        self.browser.open(self.start_url)
        for avd in self.browser.links(url_regex=r'Default.asp\?Type=\d+'):
            self.log.info(u"Retrieving section '%s'" % avd.text.decode('iso-8859-1'))
            self.browser.follow_link(avd)
            url = None
            for dok in self.browser.links(url_regex=r'XmlToHtml.asp\?XML=Files/\d+\w*-\d+-\d+'):
                m = re.search(r"(\d+\w*-\d+-\d+)", dok.url)
                if m.group(1) != url:
                    url = m.group(1)
                    self.download_single(url, cache)
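
    # parse_from_soup receives a BeautifulSoup tree for a single decision page
    # (presumably prepared by the DocumentRepository base class) and returns a
    # dict containing an RDF metadata graph, a structured body, the language
    # and the canonical URI of the document.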
    def parse_from_soup(self, soup):
        # Step 1: Find out basic metadata
        rubrik = soup.first("title").string
        beslutsdatum = soup.first("meta", {'name': 'SG_Beslutsdatum'})['content']
        # Converting this into a proper date object makes the RDFa
        # statement use a typed literal (xsd:date), which is nice, but
        # the currently released pyRdfa package doesn't support this
        beslutsdatum = datetime.strptime(beslutsdatum, "%Y-%m-%d").date()
        diarienummer = soup.first("meta", {'name': 'SG_Dokumentbet'})['content']
        arendetyp = soup.first("meta", {'name': 'Subject'})['content']
        # The keywords for a document are contained in a meta tag
        # formatted like:
        # <meta name="Keywords" content="hets_mot_folkgrupp\nmeddelarfrihet\nåklagare">
        #
        # Transform this into an array like:
        # [u'http://lagen.nu/concept/Hets_mot_folkgrupp',
        #  u'http://lagen.nu/concept/Meddelarfrihet',
        #  u'http://lagen.nu/concept/Åklagare']
        nyckelord = soup.first("meta", {'name': 'Keywords'})['content']
        begrepp = [u'http://lagen.nu/concept/%s' % Util.ucfirst(x).strip().replace(" ", "_")
                   for x in nyckelord.split("\n")]

        # Step 2: Using the metadata, construct the canonical URI for this document
        uri = LegalURI.construct({'type': LegalRef.MYNDIGHETSBESLUT,
                                  'myndighet': 'jk',
                                  'dnr': diarienummer})
        # self.log.debug("URI: %s" % uri)

        # Step 3: Create an RDF graph of all our metadata (so far)
        g = Graph()
        g.bind('dct', self.ns['dct'])
        g.bind('rinfo', self.ns['rinfo'])
        g.bind('rinfoex', self.ns['rinfoex'])
        g.bind('xsd', Util.ns['xsd'])
        g.add((URIRef(uri), self.ns['dct']['title'], Literal(rubrik, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfo']['beslutsdatum'], Literal(beslutsdatum, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfo']['diarienummer'], Literal(diarienummer, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfoex']['arendetyp'], Literal(arendetyp, lang="sv")))
        for s in begrepp:
            g.add((URIRef(uri), self.ns['dct']['subject'], URIRef(s)))
        g.add((URIRef(uri), self.ns['dct']['identifier'], Literal("JK %s" % diarienummer, lang="sv")))
        g.add((URIRef(uri), RDF.type, self.rdf_type))

        # Step 4: Process the actual text of the document
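        # The LegalRef parser is configured to recognise statute references
        # (LAGRUM), short-form statute names (KORTLAGRUM), case law (RATTSFALL)
        # and preparatory works (FORARBETEN); make_sektion below uses it to
        # turn paragraph text into a mix of plain nodes and Link objects.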
        self.parser = LegalRef(LegalRef.LAGRUM,
                               LegalRef.KORTLAGRUM,
                               LegalRef.RATTSFALL,
                               LegalRef.FORARBETEN)
        # Newer documents have a semantic structure with h1 and h2
        # elements. Older ones have elements like <p class="Rubrik_1">.
        # Try to determine which one we're dealing with.
        tag = soup.find('a', {'name': "Start"})
        if tag:
            # self.log.debug("Using new-style document structure")
            elements = tag.parent.findAllNext()
        else:
            # self.log.debug("Using old-style document structure")
            elements = soup.findAll("p")
        # self.log.debug("Found %d elements" % len(elements))
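        # A deque lets make_sektion consume elements from the front and push an
        # element back (appendleft) when it belongs to an enclosing section.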
        from collections import deque
        elements = deque(elements)
        body = self.make_sektion(elements, u"Referat av beslut")

        # Step 5: Combine the metadata and the document, and return it
        doc = {'meta': g,
               'body': body,
               'lang': 'sv',
               'uri': uri}
        return doc
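
    # make_sektion recursively builds a Sektion from the element queue: headings
    # (h1-h3 or <p class="Rubrik_N">) open nested sections at the corresponding
    # level, and ordinary paragraphs become Stycke objects whose text has been
    # run through the LegalRef parser.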
    def make_sektion(self, elements, heading, level=0):
        sekt = Sektion(**{"rubrik": heading,
                          "niva": level})
        self.log.debug("%sCreated sektion(%d): '%s'" % (" " * level, level, heading))
        baseuri = None
        while True:
            try:
                p = elements.popleft()
            except IndexError:
                return sekt
            text = Util.elementText(p)
            # self.log.debug("%sp.name: %s, p['class']: %s, 'class' in p.attrs: %s" % (" " * level, p.name, p['class'], (u'class' in p.attrs[0])))
            new_level = None
            if p.name == "h1":
                new_level = 1
            elif p.name == "h2":
                new_level = 2
            elif p.name == "h3":
                new_level = 3
            elif ((p.name == "p") and
                  (len(p.attrs) > 0) and
                  ('class' in p.attrs[0]) and
                  (p['class'].startswith("Rubrik_"))):
                # self.log.debug("%sp.class: %s" % (" " * level, p['class']))
                new_level = int(p['class'][7:])

            if new_level:
                if new_level > level:
                    sekt.append(self.make_sektion(elements, text, new_level))
                else:
                    elements.appendleft(p)
                    return sekt
            else:
                if text:
                    nodes = self.parser.parse(text,
                                              baseuri=baseuri,
                                              predicate="dct:references")
                    for node in nodes:
                        # Use possible SFS references as the
                        # baseuri for subsequent paragraphs
                        if isinstance(node, Link) and node.uri.startswith("http://rinfo.lagrummet.se/publ/sfs/"):
                            baseuri = node.uri
                    stycke = Stycke(nodes)
                    # self.log.debug("%sCreated stycke: '%s'" % (" " * level, stycke))
                    sekt.append(stycke)
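
# Running the module directly hands control to JK.run(), which is not defined
# here and is assumed to be inherited from DocumentRepository.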
if __name__ == "__main__":
    JK.run()