Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gsub_diff: Compare LookupTypes 2, 3, 4, 7 #455

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 129 additions & 26 deletions nototools/gsub_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,41 +44,144 @@ def __init__(self, file_a, file_b, output_lines=20):

def find_gsub_diffs(self):
"""Report differences in substitution rules."""

rules_a = self._get_gsub_rules(self.text_a, self.file_a)
rules_b = self._get_gsub_rules(self.text_b, self.file_b)

diffs = []
report = [''] # first line replaced by difference count
for rule in rules_a:
if rule not in rules_b:
diffs.append(('-',) + rule)
for rule in rules_b:
if rule not in rules_a:
diffs.append(('+',) + rule)
# ('+', 'smcp', 'Q', 'Q.sc')
new = [self._format_rule(r, '+') for r in self.find_new_rules()]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that refactoring those two for loops into new find_new_rules and find_missing_rules methods make the code more readable, but you are calling the same _get_gsub_rules method twice now, which may be an expensive operation. Perhaps you could compute rules_a and rules_b once and pass them as arguments to the previous methods.. but then again the methods would just become a label for a list comprehension (readable enough from POV). Well, up to you.

missing = [self._format_rule(r, '-') for r in self.find_missing_rules()]
diffs = missing + new
# ('+', 'smcp', 'Q', 'by', Q.sc')
# Sort order:
# 1. Feature tag
# 2. Glyph name before substitution
# 3. Glyph name after substitution
diffs.sort(key=lambda t:(t[1], t[2], t[3]))
diffs.sort(key=lambda t:(t[1], t[2], t[4]))
report = ['%d differences in GSUB rules' % len(diffs)]
report.extend(' '.join(diff) for diff in diffs)
return '\n'.join(report[:self.output_lines + 1])

def _get_gsub_rules(self, text, filename):
"""Get substitution rules in this ttxn output."""
def find_new_rules(self):
rules_a = self._get_gsub_rules(self.text_a, self.file_a)
rules_b = self._get_gsub_rules(self.text_b, self.file_b)
return [r for r in rules_b if r not in rules_a]

def find_missing_rules(self):
rules_a = self._get_gsub_rules(self.text_a, self.file_a)
rules_b = self._get_gsub_rules(self.text_b, self.file_b)
return [r for r in rules_a if r not in rules_b]

def _get_gsub_rules(self, text, file):
"""
Parse the ttxn GSUB table in the following manner:

1. Get features
2. Get feature content
3. Extract lookup rules from feature content

Following substitutions are currently implemented:
- Type 1: Single substitutions
- Type 2: Multiple substitutions
- Type 3: Alternate substitutions
- Type 4: Ligature substitutionss

TODO: LookupTypes 5, 6, 8 still need implementing
"""
rules = []
features = self._get_gsub_features(text)
for feature in features:
content = self._get_feature_content(text, feature)
lookups_rules = self._get_lookups_rules(text, content[0], feature)
rules += lookups_rules
return rules

def _get_gsub_features(self, text):
features = set()
feature_name_rx = r'feature (\w+) {'
contents_rx = r'feature %s {(.*?)} %s;'
rule_rx = r'sub ([\w.]+) by ([\w.]+);'

rules = set()
for name in re.findall(feature_name_rx, text):
contents = re.findall(contents_rx % (name, name), text, re.S)
assert len(contents) == 1, 'Multiple %s features in %s' % (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the previous code would assert that only one feature block with a given name exists (which might not be true, at least for handwritten, non-ttxn-generated, feature files, since the spec allows multiple feature blocks with same name).
In your _get_feature_content below you have removed the assert and simply returning the result of re.findall, which may contain more than one match. But you are only retaining the first one, since you do contents[0] later on.
Was this intentional?
If we only support reading lookups from the first feature block that matches, then we need to assert that there's only one. Probably, it would be better to collect lookup rules for all the feature blocks with that name.

name, filename)
contents = contents[0]
for lhs, rhs in re.findall(rule_rx, contents):
rules.add((name, lhs, rhs))
return rules
features.add(name)
return list(features)

def _get_feature_content(self, text, feature):
contents_rx = r'feature %s {(.*?)} %s;'
contents = re.findall(contents_rx % (feature, feature), text, re.S)
return contents

def _get_lookups_rules(self, text, content, feature):
"""Ignore rules which use "'". These are contextual and not in
lookups 1-4"""
rule_rx = r"[^C] sub (.*[^\']) (by|from) (.*);"
rules = re.findall(rule_rx, content)
parsed_rules = self._parse_gsub_rules(rules, feature)
return parsed_rules

def _parse_gsub_rules(self, rules, feature):
"""
Parse GSUB sub LookupTypes 1, 2, 3, 4, 7. Return list of tuples with
the following tuple sequence.

(feature, [input glyphs], operator, [output glyphs])

Type 1 Single Sub:
sub a by a.sc;
sub b by b.sc;
[
(feat, ['a'], 'by' ['a.sc']),
(feat, ['b'], 'by' ['b.cs'])
]


Type 2 Multiple Sub:
sub f_f by f f;
sub f_f_i by f f i;
[
(feat, ['f_f'], 'by', ['f', 'f']),
(feat, ['f_f_i'], 'by', ['f', 'f', 'i'])
]

Type 3 Alternative Sub:
sub ampersand from [ampersand.1 ampersand.2 ampersand.3];
[
(feat, ['ampersand'], 'from', ['ampersand.1']),
(feat, ['ampersand'], 'from', ['ampersand.2']),
(feat, ['ampersand'], 'from', ['ampersand.3'])
]

Type 4 Ligature Sub:
sub f f by f_f;
sub f f i by f_f_i;
[
(feat, ['f', 'f'] 'by' ['f_f]),
(feat, ['f', 'f', 'i'] 'by' ['f_f_i'])
]

http://www.adobe.com/devnet/opentype/afdko/topic_feature_file_syntax.html#4.e
"""
parsed = []
for idx, (left, op, right) in enumerate(rules):

left_group, right_group = [], []
if left.startswith('[') and left.endswith(']'):
left = self._gsub_rule_group_to_string(left)

if right.startswith('[') and right.endswith(']'):
right = self._gsub_rule_group_to_string(right)

if op == 'by': # parse LookupType 1, 2, 4
parsed.append((feature, left.split(), op, right.split()))
elif op == 'from': # parse LookupType 3
for glyph in right.split(): # 'a.alt a.sc' -> ['a.alt', 'a.sc']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment is redundant

parsed.append((feature, left.split(), op, [glyph]))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

left.split() shouldn't be necessary since in lookuptype 3 alternate substitutions the format is substitute <glyph> from <glyphclass>; so the left slot can contain only one glyph. A left.strip() would do.

return parsed

def _format_rule(self, rule, sign):
"""Unnest the tuple rule sequence to more report friendly format"""
s = [sign]
for item in rule:
if not isinstance(item, str):
for sub_item in item:
s.append(sub_item)
else:
s.append(item)
return s

def _gsub_rule_group_to_string(self, seq):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find defining a method a bit overkill for such a trivial operation as seq[1:-1], but I don't mind.

"""[a a.sc a.sups] --> 'a a.sc a.sups'"""
return seq[1:-1]
174 changes: 174 additions & 0 deletions tests/gsub_diff_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Tests for gsub_diff module. Test examples for each LookupType taken from
the Adobe fea spec:
http://www.adobe.com/devnet/opentype/afdko/topic_feature_file_syntax.html#5.b
"""

import tempfile
import unittest
from nototools.gsub_diff import GsubDiffFinder
from hb_input_test import make_font


class GposDiffFinderText(unittest.TestCase):
def _expect_gsub_diffs(self, source_a, source_b, pairs):
font_a = make_font('feature ccmp {\n%s\n} ccmp;' % source_a)
font_b = make_font('feature ccmp {\n%s\n} ccmp;' % source_b)
file_a = tempfile.NamedTemporaryFile()
file_b = tempfile.NamedTemporaryFile()
font_a.save(file_a.name)
font_b.save(file_b.name)
finder = GsubDiffFinder(file_a.name, file_b.name)

diffs = finder.find_gsub_diffs()
self.assertIn('%d differences in GSUB rules' % len(pairs), diffs)
for pair_diff in pairs:
self.assertIn(pair_diff, diffs)

def test_type1_gsub_1(self):
"""Test LookupType 1 Single substitutions"""
self._expect_gsub_diffs('''
sub A by A.sc;
sub B by B.sc;
''', '''
sub A by A.sc;
''',
[('- ccmp B by B.sc')])

def test_type1_gsub_2(self):
"""Test LookupType 1 Single substitutions on groups"""
self._expect_gsub_diffs('''
sub [A B] by [A.sc B.sc];
''', '''
sub [A] by [A.sc];
''',
[('- ccmp B by B.sc')])

def test_type2_gsub(self):
"""Test LookupType 2 Multiple substitutions"""
self._expect_gsub_diffs('''
sub f_l by f l;
''', '''
sub f_l by f l;
sub c_h by c h;
''',
[('+ ccmp c_h by c h')])

def test_type3_gsub(self):
"""Test LookupType 3 Alternate substitutions"""
self._expect_gsub_diffs('''
sub A from [A.swash A.sc];
''', '''
sub A from [A.swash A.sc];
sub B from [B.swash B.sc];
''',
[('+ ccmp B from B.swash'),
('+ ccmp B from B.sc')])

def test_type4_gsub_1(self):
"""Test LookupType 4 Ligature substitutions"""
self._expect_gsub_diffs('''
sub f l by f_l;
sub c h by c_h;
''', '''
sub f l by f_l;
''',
[('- ccmp c h by c_h')])

def test_type4_gsub_2(self):
"""Test LookupType 4 Ligature substitutions on groups"""
self._expect_gsub_diffs('''
sub [f F.swash] [l L.swash] by f_l;
''', '''
sub [f] [l] by f_l;
''',
[('- ccmp F.swash L.swash by f_l'),
('- ccmp F.swash l by f_l'),
('- ccmp f L.swash by f_l'),
])

def test_type5_and_6_gsub_1(self):
"""LookupType 5 and 6 not implemented, make sure it returns nothing.

This lookupType can use other lookups so include them in the test"""
self._expect_gsub_diffs('''
lookup CNTXT_LIGS {
sub c t by c_t;
} CNTXT_LIGS;

lookup CNTXT_SUB {
sub s by s.end;
} CNTXT_SUB;

# LookupType 6 implementation
lookup test {
sub [ a e i o u] c' lookup CNTXT_LIGS t' s' lookup CNTXT_SUB;
} test;
''','''
lookup CNTXT_LIGS {
sub c t by c_t;
} CNTXT_LIGS;

lookup CNTXT_SUB {
sub s by s.end;
} CNTXT_SUB;
''',
[])

def test_type5_and_6_gsub_2(self):
"""LookupType 5 and 6 not implemented, make sure it returns nothing.
"""
self._expect_gsub_diffs('''
substitute [a e n] d' by d.alt;
''','''
''',
[])

def test_type5_and_6_gsub_3(self):
"""LookupType 5 and 6 not implemented, make sure it returns nothing.
"""
self._expect_gsub_diffs('''
substitute [e e.begin]' t' c by ampersand;
''','''
''',
[])

def test_type7_gsub(self):
"""Test LookupType 7 Extension substitution"""
self._expect_gsub_diffs('''
lookup fracbar useExtension {
sub slash by fraction;
} fracbar;
''','''
lookup fracbar useExtension {
# missing rules
} fracbar;
''',
[('- ccmp slash by fraction')])

def test_type8_gsub(self):
"""LookupType 8 not implemented, make sure it returns nothing"""
self._expect_gsub_diffs('''
reversesub [a e n] d' by d.alt;
''','''

''',
[])


if __name__ == '__main__':
unittest.main()