Skip to content

v0.2.0-alpha

Compare
Choose a tag to compare
@NISH1001 NISH1001 released this 05 Apr 16:14
· 12 commits to develop since this release
9a54e79

What's Changed

  • Improve fuzzymatch based whitelist validator by tracking best candidates by @NISH1001 in #40
  • Add better whitelist validator using different matching algorithm by @NISH1001 in #41
  • Bump up the version to 0.2.0-alpha by @NISH1001 in #42

Full Changelog: v0.1.2-alpha...v0.2.0-alpha


Usages

Usage is documented in the docstrings of the individual components, but here are some tentative examples:

from openai import OpenAI

from larch.metadata import InstructorBasedOpenAIMetadataExtractor
from larch.metadata.validators import WhitelistBasedMetadataValidatorWithMatcher
from larch.processors import (
    CombinedMatcher,
    ExactMatcher,
    FuzzyMatcher,
    LLMMatcher,
)
from larch.utils import load_whitelist

# Exact matching: call the matcher with the query text and the candidate
# values; the result is a list of (value, score) pairs.
candidates = ["para", "Paradox"]
matcher = ExactMatcher()
match = matcher(text="paradox", values=candidates)  # Output: [('Paradox', 100.0)]



# Fuzzy matching: same call shape as the exact matcher, but per the sample
# output below several candidates can be returned, each with its own score.
candidates = ["para", "Paradox"]
matcher = FuzzyMatcher()
match = matcher(text="parado", values=candidates)
# Output: [('Paradox', 92.3076923076923), ('para', 90.0)]

# LLM-backed matching: the matcher wraps an OpenAI-based metadata extractor.
# The extractor is built first so each of its settings sits on its own line.
llm_extractor = InstructorBasedOpenAIMetadataExtractor(
    schema=None,
    openai_client=OpenAI(),
    model="gpt-3.5-turbo",
    debug=False,
)
matcher = LLMMatcher(llm_extractor, debug=True)
match = matcher(text="prdx", values=["para", "Paradox"])  # Output: [('Paradox', 100.0)]


# Whitelist used by the validator. Judging from this example it maps
# field name -> canonical value -> accepted spellings (confirm against the
# `load_whitelist` docstring). Instead of defining it inline, it can be
# loaded from an Excel file:
#     whitelist_map = load_whitelist("<path_to_excel_file>")
whitelist_map = {"address": {"Huntsville": ["Huntsville", "hunsville", "huntsvil"]}}

# Raw metadata whose values should be normalised against the whitelist.
metadata = dict(address="hunsvllle")

# `field_matcher` combines exact and fuzzy matching; `fallback_matcher` is an
# LLM-backed matcher (presumably consulted when the field matcher finds
# nothing -- confirm against the validator's docstring).
validator = WhitelistBasedMetadataValidatorWithMatcher(
    whitelists=whitelist_map,
    field_matcher=CombinedMatcher(ExactMatcher(), FuzzyMatcher()),
    fallback_matcher=LLMMatcher(
        InstructorBasedOpenAIMetadataExtractor(
            schema=None,
            openai_client=OpenAI(),
            model="gpt-3.5-turbo",
            debug=True,
        ),
        debug=True,
    ),
    unmatched_value=None,  # If set to `None`, unmatched keys will be removed
)
metadata_validated = validator(metadata)  # Output: {"address": "Huntsville"}