Merge pull request #62 from piatrashkakanstantinass/filtration

Filtration support
bee-san · Jun 6, 2021 · 93b3bfa · 93b3bfa
2 parents 4a799df + 95566f0
commit 93b3bfa
Show file tree

Hide file tree

Showing 13 changed files with 406 additions and 35 deletions.
diff --git a/README.md b/README.md
@@ -53,6 +53,8 @@ Or if you come across some piece of text and you don't know what it is, `What` w
 
 **File Opening** You can pass in a file path by `what "this/is/a/file/path"`. What is smart enough to figure out it's a file!
 
+**Filtration** You can filter output by using `what --rarity 0.2:0.8 --include_tags tag1,tag2 TEXT`. Use `what --help` to get more information.
+
 # 🍕 API
 
 PyWhat has an API! Click here [https://github.com/bee-san/pyWhat/wiki/API](https://github.com/bee-san/pyWhat/wiki/API) to read about it.

diff --git a/pywhat/Data/regex.json b/pywhat/Data/regex.json
@@ -424,7 +424,7 @@
        "Regex": "^[0-9]{3}-[0-9]{2}-[0-9]{4}$",
        "plural_name": false,
        "Description": "An [#CAE4F1][link=https://en.wikipedia.org/wiki/Social_Security_number]American Identification Number[/link][/#CAE4F1]",
-       "rarity": 0.2,
+       "Rarity": 0.2,
        "Tags": [
           "Credentials",
           "Password",

diff --git a/pywhat/__init__.py b/pywhat/__init__.py
@@ -0,0 +1,8 @@
+from pywhat.distribution import Distribution
+from pywhat.helper import AvailableTags
+from pywhat.identifier import Identifier
+
+pywhat_tags = AvailableTags().get_tags()
+
+
+__all__ = ["Identifier", "Distribution", "pywhat_tags"]
diff --git a/pywhat/distribution.py b/pywhat/distribution.py
@@ -0,0 +1,86 @@
+from typing import Optional
+
+from pywhat.helper import AvailableTags, CaseInsensitiveSet, InvalidTag, load_regexes
+
+
+class Distribution:
+    """
+    A distribution is an object containing the regex
+    But the regex has gone through a filter process.
+
+    Example filters:
+    * {"Tags": ["Networking"]}
+    * {"Tags": ["Identifiers"], "ExcludeTags": ["Credentials"], "MinRarity": 0.6}
+    """
+
+    def __init__(self, filters_dict: Optional[dict] = None):
+        tags = CaseInsensitiveSet(AvailableTags().get_tags())
+        self._dict = dict()
+        if filters_dict is None:
+            filters_dict = dict()
+
+        self._dict["Tags"] = CaseInsensitiveSet(filters_dict.setdefault("Tags", tags))
+        self._dict["ExcludeTags"] = CaseInsensitiveSet(filters_dict.setdefault("ExcludeTags", set()))
+        self._dict["MinRarity"] = filters_dict.setdefault("MinRarity", 0)
+        self._dict["MaxRarity"] = filters_dict.setdefault("MaxRarity", 1)
+        if not self._dict["Tags"].issubset(tags) or not self._dict["ExcludeTags"].issubset(tags):
+            raise InvalidTag("Passed filter contains tags that are not used by 'what'")
+
+        self._regexes = load_regexes()
+        self._filter()
+
+    def _filter(self):
+        temp_regexes = []
+        min_rarity = self._dict["MinRarity"]
+        max_rarity = self._dict["MaxRarity"]
+        for regex in self._regexes:
+            if (
+                min_rarity <= regex["Rarity"] <= max_rarity
+                and set(regex["Tags"]) & self._dict["Tags"]
+                and not set(regex["Tags"]) & self._dict["ExcludeTags"]
+            ):
+                temp_regexes.append(regex)
+
+        self._regexes = temp_regexes
+
+    def get_regexes(self):
+        return list(self._regexes)
+
+    def get_filter(self):
+        return dict(self._dict)
+
+    def __repr__(self):
+        return f"Distribution({self._dict})"
+
+    def __and__(self, other):
+        if type(self) != type(other):
+            return NotImplemented
+        tags = self._dict["Tags"] & other._dict["Tags"]
+        exclude_tags = self._dict["ExcludeTags"] & other._dict["ExcludeTags"]
+        min_rarity = max(self._dict["MinRarity"], other._dict["MinRarity"])
+        max_rarity = min(self._dict["MaxRarity"], other._dict["MaxRarity"])
+        return Distribution(
+            {"Tags": tags, "ExcludeTags": exclude_tags,
+            "MinRarity": min_rarity, "MaxRarity": max_rarity})
+
+    def __or__(self, other):
+        if type(self) != type(other):
+            return NotImplemented
+        tags = self._dict["Tags"] | other._dict["Tags"]
+        exclude_tags = self._dict["ExcludeTags"] | other._dict["ExcludeTags"]
+        min_rarity = min(self._dict["MinRarity"], other._dict["MinRarity"])
+        max_rarity = max(self._dict["MaxRarity"], other._dict["MaxRarity"])
+        return Distribution(
+            {"Tags": tags, "ExcludeTags": exclude_tags,
+            "MinRarity": min_rarity, "MaxRarity": max_rarity})
+
+
+    def __iand__(self, other):
+        if type(self) != type(other):
+            return NotImplemented
+        return self & other
+
+    def __ior__(self, other):
+        if type(self) != type(other):
+            return NotImplemented
+        return self | other
diff --git a/pywhat/helper.py b/pywhat/helper.py
@@ -0,0 +1,59 @@
+"""Helper utilities"""
+import collections.abc
+import json
+import os.path
+
+
+class AvailableTags:
+    def __init__(self):
+        self.tags = set()
+        regexes = load_regexes()
+        for regex in regexes:
+            self.tags.update(regex["Tags"])
+
+    def get_tags(self):
+        return self.tags
+
+
+class InvalidTag(Exception):
+    """
+    This exception should be raised when Distribution() gets a filter
+    containing non-existent tags.
+    """
+
+    pass
+
+
+def load_regexes() -> list:
+    path = "Data/regex.json"
+    fullpath = os.path.join(os.path.dirname(os.path.abspath(__file__)), path)
+    with open(fullpath, "r", encoding="utf-8") as myfile:
+        return json.load(myfile)
+
+
+class CaseInsensitiveSet(collections.abc.Set):
+    def __init__(self, iterable=None):
+        self._elements = set()
+        if iterable is not None:
+            self._elements = set(map(self._lower, iterable))
+
+    def _lower(self, value):
+        return value.lower() if isinstance(value, str) else value
+
+    def __contains__(self, value):
+        return self._lower(value) in self._elements
+
+    def __iter__(self):
+        return iter(self._elements)
+
+    def __len__(self):
+        return len(self._elements)
+
+    def __repr__(self):
+        return self._elements.__repr__()
+
+    def issubset(self, other):
+        for value in self:
+            if value not in other:
+                return False
+        return True
diff --git a/pywhat/identifier.py b/pywhat/identifier.py
@@ -1,36 +1,47 @@
 import os.path
+from typing import List, Optional
 
+from pywhat.distribution import Distribution
 from pywhat.magic_numbers import FileSignatures
 from pywhat.nameThatHash import Nth
 from pywhat.regex_identifier import RegexIdentifier
 
 
 class Identifier:
-    def __init__(self):
-        self.regex_id = RegexIdentifier()
-        self.file_sig = FileSignatures()
-        self.name_that_hash = Nth()
-
-    def identify(self, text: str, api=False) -> dict:
+    def __init__(self, distribution: Optional[Distribution] = None):
+        if distribution is None:
+            self.distribution = Distribution()
+        else:
+            self.distribution = distribution
+        self._regex_id = RegexIdentifier()
+        self._file_sig = FileSignatures()
+        self._name_that_hash = Nth()
+
+    def identify(self, text: str, dist: Distribution = None,
+                 api=False) -> dict:
+        if dist is None:
+            dist = self.distribution
         identify_obj = {}
 
         magic_numbers = None
-        if not api and self.file_exists(text):
-            magic_numbers = self.file_sig.open_binary_scan_magic_nums(text)
-            text = self.file_sig.open_file_loc(text)
+        if not api and self._file_exists(text):
+            magic_numbers = self._file_sig.open_binary_scan_magic_nums(text)
+            text = self._file_sig.open_file_loc(text)
             identify_obj["File Signatures"] = magic_numbers
         else:
             text = [text]
 
         if not magic_numbers:
             # If file doesn't exist, check to see if the inputted text is
             # a file in hex format
-            identify_obj["File Signatures"] = self.file_sig.check_magic_nums(text)
-        identify_obj["Regexes"] = self.regex_id.check(text)
+            identify_obj["File Signatures"] = self._file_sig.check_magic_nums(text)
+
+        identify_obj["Regexes"] = self._regex_id.check(text, dist)
+
         # get_hashes takes a list of hashes, we split to give it a list
-        # identify_obj["Hashes"] = self.name_that_hash.get_hashes(text.split())
+        # identify_obj["Hashes"] = self._name_that_hash.get_hashes(text.split())
 
         return identify_obj
 
-    def file_exists(self, text):
+    def _file_exists(self, text):
         return os.path.isfile(text)
diff --git a/pywhat/printer.py b/pywhat/printer.py
@@ -10,13 +10,11 @@ def pretty_print(self, text: dict):
 
         to_out = ""
 
-        if text["File Signatures"]:
+        if text["File Signatures"] and text["Regexes"]:
             to_out += "\n"
             to_out += f"[bold #D7Afff]File Identified[/bold #D7Afff] with Magic Numbers {text['File Signatures']['ISO 8859-1']}."
             to_out += f"\n[bold #D7Afff]File Description:[/bold #D7Afff] {text['File Signatures']['Description']}."
             to_out += "\n"
-        if to_out:
-            console.print(to_out)
 
         if text["Regexes"]:
             to_out += "\n[bold #D7Afff]Possible Identification[/bold #D7Afff]"

diff --git a/pywhat/regex_identifier.py b/pywhat/regex_identifier.py
@@ -2,19 +2,21 @@
 import json
 import os
 import re
+from typing import Optional
+
+from pywhat.distribution import Distribution
 
 
 class RegexIdentifier:
     def __init__(self):
-        path = "Data/regex.json"
-        fullpath = os.path.join(os.path.dirname(os.path.abspath(__file__)), path)
-        with open(fullpath, "r", encoding="utf8") as myfile:
-            self.regexes = json.load(myfile)
+        self.distribution = Distribution()
 
-    def check(self, text):
+    def check(self, text, distribution: Optional[Distribution] = None):
+        if distribution is None:
+            distribution = self.distribution
         matches = []
         for txt in text:
-            for reg in self.regexes:
+            for reg in distribution.get_regexes():
                 matched_regex = re.search(reg["Regex"], txt, re.UNICODE)
 
                 if matched_regex:
@@ -28,8 +30,8 @@ def check(self, text):
                         codes_path = "Data/phone_codes.json"
                         codes_fullpath = os.path.join(
                             os.path.dirname(os.path.abspath(__file__)), codes_path)
-                        with open(codes_fullpath) as f:
-                            codes = json.load(f)
+                        with open(codes_fullpath, "r", encoding="utf-8") as myfile:
+                            codes = json.load(myfile)
 
                         locations = []
                         for code in codes:

diff --git a/pywhat/what.py b/pywhat/what.py
@@ -1,5 +1,49 @@
+import sys
+
 import click
+from rich.console import Console
+
 from pywhat import identifier, printer
+from pywhat.distribution import Distribution
+from pywhat.helper import AvailableTags, InvalidTag
+
+
+def print_tags(ctx, opts, value):
+    if value:
+        tags = sorted(AvailableTags().get_tags())
+        console = Console()
+        console.print("[bold #D7Afff]" + "\n".join(tags) + "[/bold #D7Afff]")
+        sys.exit()
+
+
+def parse_options(rarity, include_tags, exclude_tags):
+    filter = dict()
+    if rarity is not None:
+        rarities = rarity.split(":")
+        if len(rarities) != 2:
+            print("Invalid rarity range format ('min:max' expected)")
+            sys.exit(1)
+        try:
+            if not rarities[0].isspace() and rarities[0]:
+                filter["MinRarity"] = float(rarities[0])
+            if not rarities[1].isspace() and rarities[1]:
+                filter["MaxRarity"] = float(rarities[1])
+        except ValueError:
+            print("Invalid rarity argument (float expected)")
+            sys.exit(1)
+    if include_tags is not None:
+        filter["Tags"] = list(map(str.strip, include_tags.split(',')))
+    if exclude_tags is not None:
+        filter["ExcludeTags"] = list(map(str.strip, exclude_tags.split(',')))
+
+    try:
+        distribution = Distribution(filter)
+    except InvalidTag:
+        print("Passed tags are not valid.\n" \
+            "You can check available tags by using: 'pywhat --tags'")
+        sys.exit(1)
+
+    return distribution
 
 
 @click.command(
@@ -8,14 +52,26 @@
     )
 )
 @click.argument("text_input", required=True)
-def main(text_input):
+@click.option("-t", "--tags", is_flag=True, expose_value=False, callback=print_tags, help="Show available tags and exit.")
+@click.option("-r", "--rarity", help="Filter by rarity. This is in the range of 0:1. To filter only items past 0.5, use 0.5: with the colon on the end.")
+@click.option("-i", "--include_tags", help="Only print entries with included tags.")
+@click.option("-e", "--exclude_tags", help="Exclude tags.")
+def main(text_input, rarity, include_tags, exclude_tags):
     """
     What - Identify what something is.\n
 
     Made by Bee https://twitter.com/bee_sec_san\n
 
     https://github.com/bee-san\n
 
+    Filtration:\n
+        --rarity min:max\n
+            Only print entries with rarity in range [min,max]. min and max can be omitted.\n
+        --include_tags list\n
+            Only include entries containing at least one tag in a list. List is a comma separated list.\n
+    --exclude_tags list\n
+            Exclude specified tags. List is a comma separated list.\n
+
     Examples:
 
         * what "HTB{this is a flag}"
@@ -24,22 +80,27 @@ def main(text_input):
 
         * what -- 52.6169586, -1.9779857
 
+        * what --rarity 0.6: myEmail@host.org
+
     Your text must either be in quotation marks, or use the POSIX standard of "--" to mean "anything after -- is textual input".
 
     """
 
-    what_obj = What_Object()
+    what_obj = What_Object(
+        parse_options(rarity, include_tags, exclude_tags)
+    )
     identified_output = what_obj.what_is_this(text_input)
 
     p = printer.Printing()
     p.pretty_print(identified_output)
 
 
 class What_Object:
-    def __init__(self):
-        self.id = identifier.Identifier()
+    def __init__(self, distribution):
+        self.id = identifier.Identifier(distribution)
 
-    def what_is_this(self, text: str) -> dict:
+    def what_is_this(
+        self, text: str) -> dict:
         """
         Returns a Python dictionary of everything that has been identified
         """