Skip to content

Commit

Permalink
update_tmserver to truncate IDs to elasticsearch's 512 byte limit
Browse files Browse the repository at this point in the history
and allow for utf8 in description
and allow for nonexisting submitted_by__email
  • Loading branch information
cloph committed Jan 24, 2018
1 parent 66ced4d commit 8e52d92
Showing 1 changed file with 21 additions and 3 deletions.
24 changes: 21 additions & 3 deletions pootle/apps/pootle_app/management/commands/update_tmserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def get_unit_data(self, unit):
unit['change__submitted_by__username'])

email_md5 = None
if unit['submitted_by__email']:
if unit.get('submitted_by__email', False):
email_md5 = md5(
force_bytes(unit['change__submitted_by__email'])).hexdigest()

Expand Down Expand Up @@ -166,16 +166,34 @@ def get_units(self):

return units, len(units)

# truncate to bytesize adapted from https://stackoverflow.com/a/13738452
def utf8_lead_byte(self, b):
'''A UTF-8 intermediate byte starts with the bits 10xxxxxx.'''
return (ord(b) & 0xC0) != 0x80

def utf8_byte_truncate(self, text, max_bytes):
'''If text[max_bytes] is not a lead byte, back up until a lead byte is
found and truncate before that character.'''
utf8 = text.encode('utf8')
if len(utf8) <= max_bytes:
return text
i = max_bytes
while i > 0 and not self.utf8_lead_byte(utf8[i]):
i -= 1
return utf8[:i].decode('utf8')

def get_unit_data(self, unit):
"""Return dict with data to import for a single unit."""
target_language = unit.gettargetlanguage()
if target_language is None:
target_language = self.target_language

# truncate to 512 since elasticsearch ids are limited to that size...
# simple [:512] won't do, as utf-8 encoding can create more bytes
return {
'_index': self.INDEX_NAME,
'_type': target_language,
'_id': unit.getid(),
'_id': self.utf8_byte_truncate(unit.getid(), 512),
'revision': 0,
'project': self.project,
'path': self.filename,
Expand Down Expand Up @@ -321,7 +339,7 @@ def _initialize(self, **options):
self.parser = FileParser(stdout=self.stdout, index=self.INDEX_NAME,
filenames=options['files'],
language=options['target_language'],
project=options['project'])
project=options['project'].decode('utf8'))
elif not self.is_local_tm:
raise CommandError('You cannot add translations from database to '
'an external TM.')
Expand Down

0 comments on commit 8e52d92

Please sign in to comment.