v0.2.0-alpha
What's Changed
- Improve fuzzymatch based whitelist validator by tracking best candidates by @NISH1001 in #40
- Add better whitelist validator using different matching algorithm by @NISH1001 in #41
- Bump up the version to 0.2.0-alpha by @NISH1001 in #42
Full Changelog: v0.1.2-alpha...v0.2.0-alpha
Usages
Usage examples can be found in the docstrings of the components. Here are some tentative ones:
from openai import OpenAI
from larch.metadata import InstructorBasedOpenAIMetadataExtractor
from larch.metadata.validators import WhitelistBasedMetadataValidatorWithMatcher
from larch.processors import (
    CombinedMatcher,
    ExactMatcher,
    FuzzyMatcher,
    LLMMatcher,
)
from larch.utils import load_whitelist

# --- Matchers: each is called with a text and a list of candidate values ---

matcher = ExactMatcher()
match = matcher(
    text="paradox",
    values=["para", "Paradox"],
)  # Output: [('Paradox', 100.0)]

matcher = FuzzyMatcher()
match = matcher(
    text="parado",
    values=["para", "Paradox"],
)  # Output: [('Paradox', 92.3076923076923), ('para', 90.0)]

matcher = LLMMatcher(
    InstructorBasedOpenAIMetadataExtractor(
        schema=None,
        openai_client=OpenAI(),
        model="gpt-3.5-turbo",
        debug=False,
    ),
    debug=True,
)
match = matcher(
    text="prdx",
    values=["para", "Paradox"],
)  # Output: [('Paradox', 100.0)]

# --- Whitelist-based metadata validation ---

# Either load the whitelist from a file (replace the placeholder with a real path) ...
whitelist_map = load_whitelist("<path_to_excel_file>")
# ... or define it inline. NOTE: no trailing comma after the dict, otherwise
# `whitelist_map` would become a 1-tuple wrapping the dict.
# The mapping presumably goes field -> canonical value -> accepted variants
# (inferred from the example output below; confirm against `load_whitelist`).
whitelist_map = {"address": {"Huntsville": ["Huntsville", "hunsville", "huntsvil"]}}

metadata = dict(address="hunsvllle")
metadata_validated = WhitelistBasedMetadataValidatorWithMatcher(
    whitelists=whitelist_map,
    field_matcher=CombinedMatcher(ExactMatcher(), FuzzyMatcher()),
    fallback_matcher=LLMMatcher(
        InstructorBasedOpenAIMetadataExtractor(
            schema=None,
            openai_client=OpenAI(),
            model="gpt-3.5-turbo",
            debug=True,
        ),
        debug=True,
    ),
    unmatched_value=None,  # If set to `None`, unmatched keys will be removed
)(metadata)  # Output: {"address": "Huntsville"}