Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get simple tables working #3

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
90 changes: 81 additions & 9 deletions html2docx/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,8 @@ def xml(self):
return cElementTree.tostring(self.tree)


class ParagraphParser(object):
html_to_ooxml_tag_conversions = {
'strong': 'bold',
'em': 'italics',
}
class BaseParser(object):
abstract = True

def __init__(self, element):
self.element = element
Expand Down Expand Up @@ -39,17 +36,20 @@ def _parse(self, element, styles):
if element.tail:
yield element.tail, styles[-1]


class ParagraphParser(BaseParser):
html_to_ooxml_tag_conversions = {
'strong': 'bold',
'em': 'italics',
}

def build_runs(self):
for text, styles in self.parse(self.element):
run = Run(text)
for style in styles:
ooxml_style = self.html_to_ooxml_tag_conversions.get(style)
if ooxml_style:
setattr(run.properties, ooxml_style, True)
if 'strong' in styles:
run.properties.bold = True
if 'em' in styles:
run.properties.italics = True
yield run

@property
Expand Down Expand Up @@ -130,3 +130,75 @@ def italics(self, value):
self._italics = True
else:
self._italics = False


class TableParser(BaseParser):
    """Parse an HTML ``<table>`` element into a :class:`Table` tag.

    Rows are collected in document order from the table's direct ``<tr>``
    children and from ``<tr>`` children of the ``<thead>``, ``<tbody>`` and
    ``<tfoot>`` section wrappers. Looking only at direct children would
    silently drop every row of a table that uses section wrappers.
    """

    # HTML elements that group rows inside a table.
    row_section_tags = ('thead', 'tbody', 'tfoot')

    @property
    def tag(self):
        """Return a ``Table`` holding one ``TableRowParser`` per row."""
        table_rows = []
        for child in self.element:
            if child.tag == 'tr':
                table_rows.append(TableRowParser(child))
            elif child.tag in self.row_section_tags:
                # Only direct rows of the section: rows of a nested table
                # belong to that table, not to this one.
                table_rows.extend(
                    TableRowParser(row) for row in child.findall('tr')
                )
        return Table(table_rows)


class Table(BaseTag):
    """OOXML table tag, rendered as a ``w:tbl`` element.

    ``table_rows`` is an optional iterable of row parsers; each one must
    expose ``.tag.tree`` (the element for that row). When it is ``None``
    the table renders as an empty ``w:tbl``.
    """

    tag_name = 'w:tbl'

    def __init__(self, table_rows=None):
        self.table_rows = table_rows

    @property
    def tree(self):
        """Build the ``w:tbl`` element with one child element per row."""
        root = cElementTree.Element(self.tag_name)
        if self.table_rows is not None:
            for row_parser in self.table_rows:
                root.append(row_parser.tag.tree)
        return root


class TableRowParser(BaseParser):
    """Parse an HTML ``<tr>`` element into a :class:`TableRow` tag.

    Both ``<td>`` and ``<th>`` direct children become cells, in document
    order. Matching ``<td>`` alone would silently drop header cells and
    shift the remaining columns.
    """

    # HTML elements that represent a cell of a row.
    cell_tags = ('td', 'th')

    @property
    def tag(self):
        """Return a ``TableRow`` holding one ``TableCellParser`` per cell."""
        table_cells = [
            TableCellParser(child)
            for child in self.element
            if child.tag in self.cell_tags
        ]
        return TableRow(table_cells)


class TableRow(BaseTag):
    """OOXML table row tag, rendered as a ``w:tr`` element.

    ``table_cells`` is an optional iterable of cell parsers; each one must
    expose ``.tag.tree`` (the element for that cell). When it is ``None``
    the row renders as an empty ``w:tr``.
    """

    tag_name = 'w:tr'

    def __init__(self, table_cells=None):
        self.table_cells = table_cells

    @property
    def tree(self):
        """Build the ``w:tr`` element with one child element per cell."""
        row = cElementTree.Element(self.tag_name)
        cells = self.table_cells
        if cells is None:
            return row
        for cell_parser in cells:
            row.append(cell_parser.tag.tree)
        return row


class TableCellParser(BaseParser):
    """Parse an HTML table cell element into a :class:`TableCell` tag."""

    @property
    def tag(self):
        """Return a ``TableCell`` wrapping the cell parsed as a paragraph."""
        # The cell element itself is handed to ParagraphParser, so the
        # cell's content is parsed as if it were a paragraph.
        return TableCell(ParagraphParser(self.element))


class TableCell(BaseTag):
    """OOXML table cell tag, rendered as a ``w:tc`` element.

    ``element`` is an optional parser object exposing ``.tag.tree`` (in
    this file a ``ParagraphParser``). When it is ``None`` the cell renders
    as an empty ``w:tc``.
    """

    tag_name = 'w:tc'

    def __init__(self, element=None):
        self.element = element

    @property
    def tree(self):
        """Build the ``w:tc`` element, appending the content if present."""
        cell = cElementTree.Element(self.tag_name)
        if self.element is not None:
            cell.append(self.element.tag.tree)
        return cell
13 changes: 10 additions & 3 deletions html2docx/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
from jinja2 import Environment, PackageLoader

from html2docx.utils import ZipFile
from html2docx.builder import ParagraphParser
from html2docx.builder import ParagraphParser, TableParser


# Maps an HTML tag name to the parser class that converts that element.
# Tags without an entry have no parser and are looked up with ``.get``.
tag_to_parser_conversions = {
    'p': ParagraphParser,
    'table': TableParser,
}


class HTML2Docx(object):
Expand Down Expand Up @@ -47,8 +53,9 @@ def _convert(self):
if el in self.visited:
continue
self.visited.update([el])
if el.tag == 'p':
parser = ParagraphParser(el)
Parser = tag_to_parser_conversions.get(el.tag)
if Parser:
parser = Parser(el)
self.document_state.append(parser.tag)
self.visited.update(el.getiterator())

Expand Down
3 changes: 3 additions & 0 deletions html2docx/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ class TestDocx2Html(Docx2Html):
def style(*args, **kwargs):
return ''

def table(self, text):
return '<table>%s</table>' % text


def build_run(test_name, html):
boiler_plate = '<html><head></head><body>%s</body></html>'
Expand Down
109 changes: 108 additions & 1 deletion html2docx/tests/test_builder.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
from xml.etree import cElementTree
from unittest import TestCase

from html2docx.builder import RunProperties, ParagraphParser, Paragraph
from html2docx.builder import (
Paragraph,
ParagraphParser,
RunProperties,
Table,
TableCell,
TableCellParser,
TableParser,
TableRow,
TableRowParser,
)


class RunPropertiesTestCase(TestCase):
Expand Down Expand Up @@ -75,3 +85,100 @@ def test_empty(self):

xml = paragraph.xml
self.assertEqual(xml, expected_xml)


class TableCellParserTestCase(TestCase):
    """Check the XML that ``TableCellParser`` produces for ``<td>`` input."""

    def _render(self, html):
        # Parse the HTML snippet and serialize the resulting cell tag.
        return TableCellParser(cElementTree.fromstring(html)).tag.xml

    def test_simple(self):
        expected = '<w:tc><w:p><w:r><w:rPr /><w:t>AAA</w:t></w:r></w:p></w:tc>'  # noqa
        actual = self._render('<td>AAA</td>')

        self.assertEqual(actual, expected)

    def test_with_style(self):
        expected = '<w:tc><w:p><w:r><w:rPr><w:b /></w:rPr><w:t>AAA</w:t></w:r></w:p></w:tc>'  # noqa
        actual = self._render('<td><strong>AAA</strong></td>')

        self.assertEqual(actual, expected)


class TableCellTestCase(TestCase):
    """Check how ``TableCell`` serializes on its own."""

    def test_empty(self):
        # A cell built without content serializes as a self-closed tag.
        self.assertEqual(TableCell().xml, '<w:tc />')


class TableRowParserTestCase(TestCase):
    """Check the XML that ``TableRowParser`` produces for ``<tr>`` input."""

    def _render(self, html):
        # Parse the HTML snippet and serialize the resulting row tag.
        return TableRowParser(cElementTree.fromstring(html)).tag.xml

    def test_simple(self):
        expected = '<w:tr><w:tc><w:p><w:r><w:rPr /><w:t>AAA</w:t></w:r></w:p></w:tc></w:tr>'  # noqa
        actual = self._render('<tr><td>AAA</td></tr>')

        self.assertEqual(actual, expected)

    def test_with_style(self):
        expected = '<w:tr><w:tc><w:p><w:r><w:rPr><w:b /></w:rPr><w:t>AAA</w:t></w:r></w:p></w:tc></w:tr>'  # noqa
        actual = self._render('<tr><td><strong>AAA</strong></td></tr>')  # noqa

        self.assertEqual(actual, expected)

    def test_multiple_cells(self):
        expected = '<w:tr><w:tc><w:p><w:r><w:rPr /><w:t>AAA</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:rPr /><w:t>BBB</w:t></w:r></w:p></w:tc></w:tr>'  # noqa
        actual = self._render('<tr><td>AAA</td><td>BBB</td></tr>')

        self.assertEqual(actual, expected)


class TableRowTestCase(TestCase):
    """Check how ``TableRow`` serializes on its own."""

    def test_empty(self):
        # A row built without cells serializes as a self-closed tag.
        self.assertEqual(TableRow().xml, '<w:tr />')


class TableParserTestCase(TestCase):
    """Check the XML that ``TableParser`` produces for ``<table>`` input."""

    def _render(self, html):
        # Parse the HTML snippet and serialize the resulting table tag.
        return TableParser(cElementTree.fromstring(html)).tag.xml

    def test_simple(self):
        expected = '<w:tbl><w:tr><w:tc><w:p><w:r><w:rPr /><w:t>AAA</w:t></w:r></w:p></w:tc></w:tr></w:tbl>'  # noqa
        actual = self._render('<table><tr><td>AAA</td></tr></table>')  # noqa

        self.assertEqual(actual, expected)

    def test_with_style(self):
        expected = '<w:tbl><w:tr><w:tc><w:p><w:r><w:rPr><w:b /></w:rPr><w:t>AAA</w:t></w:r></w:p></w:tc></w:tr></w:tbl>'  # noqa
        actual = self._render('<table><tr><td><strong>AAA</strong></td></tr></table>')  # noqa

        self.assertEqual(actual, expected)

    def test_multiple_cells(self):
        expected = '<w:tbl><w:tr><w:tc><w:p><w:r><w:rPr /><w:t>AAA</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:rPr /><w:t>BBB</w:t></w:r></w:p></w:tc></w:tr><w:tr><w:tc><w:p><w:r><w:rPr /><w:t>CCC</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:rPr /><w:t>DDD</w:t></w:r></w:p></w:tc></w:tr></w:tbl>'  # noqa
        actual = self._render('<table><tr><td>AAA</td><td>BBB</td></tr><tr><td>CCC</td><td>DDD</td></tr></table>')  # noqa

        self.assertEqual(actual, expected)


class TableTestCase(TestCase):
    """Check how ``Table`` serializes on its own."""

    def test_empty(self):
        # A table built without rows serializes as a self-closed tag.
        self.assertEqual(Table().xml, '<w:tbl />')
24 changes: 24 additions & 0 deletions html2docx/tests/test_complex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from html2docx.tests import build_run


# Each entry is a (test name, HTML body fragment) pair mixing paragraphs
# and tables in a single document.
test_cases = [
    (
        'Test paragraph, table, paragraph.',
        '<p>AAA</p><table><tr><td>BBB</td></tr></table><p>CCC</p>',
    ),
    (
        'Test table, table, paragraph',
        '<table><tr><td>AAA</td></tr></table><table><tr><td>BBB</td></tr></table><p>CCC</p>',  # noqa
    ),
    # Known limitation: a table nested inside a cell is not handled yet,
    # so this case stays disabled.
    # (
    #     'Test Nested Table',
    #     '<table><tr><td>AAA</td><td><table><tr><td>BBB</td></tr></table></td></tr></table>',  # noqa
    # ),
]


def test():
    """Yield the ``build_run`` result for every entry in ``test_cases``.

    Written as a generator so the runner sees one yielded item per case
    (presumably a nose test generator; confirm against ``build_run``).
    """
    for case in test_cases:
        name, markup = case
        yield build_run(name, markup)
27 changes: 27 additions & 0 deletions html2docx/tests/test_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from html2docx.tests import build_run


# Each entry is a (test name, HTML body fragment) pair covering one table
# layout: single cell, several rows, several cells, and rows with cells.
test_cases = [
    (
        'Test simple table.',
        '<table><tr><td>AAA</td></tr></table>',
    ),
    (
        'Test multiple rows.',
        '<table><tr><td>AAA</td></tr><tr><td>BBB</td></tr></table>',
    ),
    (
        'Test multiple cells.',
        '<table><tr><td>AAA</td><td>BBB</td></tr></table>',
    ),
    (
        'Test multiple rows and cells.',
        '<table><tr><td>AAA</td><td>BBB</td></tr><tr><td>CCC</td><td>DDD</td></tr></table>',  # noqa
    ),
]


def test():
    """Yield the ``build_run`` result for every entry in ``test_cases``.

    Written as a generator so the runner sees one yielded item per case
    (presumably a nose test generator; confirm against ``build_run``).
    """
    for case in test_cases:
        name, markup = case
        yield build_run(name, markup)
4 changes: 1 addition & 3 deletions run_tests.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#! /bin/sh

RUN_TESTS='nosetests -v -v --with-coverage --cover-erase --cover-package=. html2docx'
echo $RUN_TESTS
$RUN_TESTS
nosetests -v -v --with-coverage --cover-erase --cover-package=html2docx html2docx && find -name '*.py' | xargs flake8