Skip to content

Commit

Permalink
update_tmserver to truncate IDs to elasticsearch's 512 byte limit
Browse files Browse the repository at this point in the history
and allow for utf8 in description
  • Loading branch information
cloph committed Jan 24, 2018
1 parent e48d640 commit e551c79
Showing 1 changed file with 20 additions and 2 deletions.
22 changes: 20 additions & 2 deletions pootle/apps/pootle_app/management/commands/update_tmserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,16 +166,34 @@ def get_units(self):

return units, len(units)

# Truncate text to a maximum byte size; adapted from https://stackoverflow.com/a/13738452
def utf8_lead_byte(self, b):
    """Return True if ``b`` is not a UTF-8 continuation byte.

    A UTF-8 intermediate (continuation) byte starts with the bits
    10xxxxxx; every other byte value begins a character.

    ``b`` may be either an integer byte value (what indexing ``bytes`` on
    Python 3, or a ``bytearray`` on any version, yields) or a
    length-one byte string (what indexing ``str`` on Python 2 yields).
    """
    # ord() rejects integers, so only call it for the one-byte-string form.
    value = b if isinstance(b, int) else ord(b)
    return (value & 0xC0) != 0x80

def utf8_byte_truncate(self, text, max_bytes):
    """Truncate ``text`` so its UTF-8 encoding fits in ``max_bytes`` bytes.

    If the byte at offset ``max_bytes`` is not a lead byte, back up until a
    lead byte is found and truncate before that character, so a multi-byte
    character is never split.

    :param text: text string to truncate.
    :param max_bytes: maximum size of the UTF-8 encoded result, in bytes.
        Negative values are treated as 0.
    :return: ``text`` itself when it already fits, otherwise the longest
        prefix of ``text`` whose UTF-8 encoding is at most ``max_bytes``
        bytes long.
    """
    utf8 = text.encode('utf8')
    if len(utf8) <= max_bytes:
        return text

    # Indexing a bytearray yields integers on both Python 2 and Python 3,
    # whereas indexing the encoded string yields a str on 2 and an int on 3.
    raw = bytearray(utf8)

    # A negative limit would otherwise slice bytes off the *end* of the
    # encoded text and could cut a character in half.
    i = max(max_bytes, 0)

    # Continuation bytes have the form 10xxxxxx; step back to a lead byte.
    while i > 0 and (raw[i] & 0xC0) == 0x80:
        i -= 1
    return utf8[:i].decode('utf8')

def get_unit_data(self, unit):
"""Return dict with data to import for a single unit."""
target_language = unit.gettargetlanguage()
if target_language is None:
target_language = self.target_language

# truncate to 512 since elasticsearch ids are limited to that size...
# simple [:512] won't do, as utf-8 encoding can create more bytes
return {
'_index': self.INDEX_NAME,
'_type': target_language,
'_id': unit.getid(),
'_id': self.utf8_byte_truncate(unit.getid(), 512),
'revision': 0,
'project': self.project,
'path': self.filename,
Expand Down Expand Up @@ -321,7 +339,7 @@ def _initialize(self, **options):
self.parser = FileParser(stdout=self.stdout, index=self.INDEX_NAME,
filenames=options['files'],
language=options['target_language'],
project=options['project'])
project=options['project'].decode('utf8'))
elif not self.is_local_tm:
raise CommandError('You cannot add translations from database to '
'an external TM.')
Expand Down

0 comments on commit e551c79

Please sign in to comment.