Skip to content

Commit

Permalink
Merge pull request #62 from piatrashkakanstantinass/filtration
Browse files Browse the repository at this point in the history
Filtration support
  • Loading branch information
bee-san authored Jun 6, 2021
2 parents 4a799df + 95566f0 commit 93b3bfa
Show file tree
Hide file tree
Showing 13 changed files with 406 additions and 35 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ Or if you come across some piece of text and you don't know what it is, `What` w

**File Opening** You can pass in a file path by `what "this/is/a/file/path"`. What is smart enough to figure out it's a file!

**Filtration** You can filter output by using `what --rarity 0.2:0.8 --include_tags tag1,tag2 TEXT`. Use `what --help` to get more information.

# 🍕 API

PyWhat has an API! Click here [https://github.com/bee-san/pyWhat/wiki/API](https://github.com/bee-san/pyWhat/wiki/API) to read about it.
Expand Down
2 changes: 1 addition & 1 deletion pywhat/Data/regex.json
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@
"Regex": "^[0-9]{3}-[0-9]{2}-[0-9]{4}$",
"plural_name": false,
"Description": "An [#CAE4F1][link=https://en.wikipedia.org/wiki/Social_Security_number]American Identification Number[/link][/#CAE4F1]",
"rarity": 0.2,
"Rarity": 0.2,
"Tags": [
"Credentials",
"Password",
Expand Down
8 changes: 8 additions & 0 deletions pywhat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from pywhat.distribution import Distribution
from pywhat.helper import AvailableTags
from pywhat.identifier import Identifier

pywhat_tags = AvailableTags().get_tags()


__all__ = ["Identifier", "Distribution", "pywhat_tags"]
86 changes: 86 additions & 0 deletions pywhat/distribution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from typing import Optional

from pywhat.helper import AvailableTags, CaseInsensitiveSet, InvalidTag, load_regexes


class Distribution:
    """
    A filtered view over the regexes shipped with 'what'.

    A distribution loads every regex and keeps only those that pass a filter.
    Recognised filter keys (all optional):
        * "Tags": keep regexes having at least one of these tags
          (default: every available tag)
        * "ExcludeTags": drop regexes having any of these tags (default: none)
        * "MinRarity": inclusive lower rarity bound (default: 0)
        * "MaxRarity": inclusive upper rarity bound (default: 1)
    Tags are compared case-insensitively.

    Example filters:
        * {"Tags": ["Networking"]}
        * {"Tags": ["Identifiers"], "ExcludeTags": ["Credentials"], "MinRarity": 0.6}
    """

    def __init__(self, filters_dict: Optional[dict] = None):
        """
        Build the distribution described by filters_dict.

        The passed dictionary is only read; it is never modified.

        Raises InvalidTag if "Tags" or "ExcludeTags" contains a tag that is
        not among the available tags.
        """
        tags = CaseInsensitiveSet(AvailableTags().get_tags())
        if filters_dict is None:
            filters_dict = {}

        # .get() rather than .setdefault(): filling in defaults must not
        # write them back into the caller's dictionary.
        self._dict = {
            "Tags": CaseInsensitiveSet(filters_dict.get("Tags", tags)),
            "ExcludeTags": CaseInsensitiveSet(
                filters_dict.get("ExcludeTags", set())
            ),
            "MinRarity": filters_dict.get("MinRarity", 0),
            "MaxRarity": filters_dict.get("MaxRarity", 1),
        }
        if not self._dict["Tags"].issubset(tags) or not self._dict[
            "ExcludeTags"
        ].issubset(tags):
            raise InvalidTag("Passed filter contains tags that are not used by 'what'")

        self._regexes = load_regexes()
        self._filter()

    def _filter(self):
        """Keep only the loaded regexes that satisfy the stored filter."""
        min_rarity = self._dict["MinRarity"]
        max_rarity = self._dict["MaxRarity"]
        include = self._dict["Tags"]
        exclude = self._dict["ExcludeTags"]

        kept = []
        for regex in self._regexes:
            # Rarity is checked first so tags are only inspected for
            # regexes that are inside the requested rarity range.
            if not min_rarity <= regex["Rarity"] <= max_rarity:
                continue
            regex_tags = set(regex["Tags"])
            if regex_tags & include and not regex_tags & exclude:
                kept.append(regex)

        self._regexes = kept

    def get_regexes(self):
        """Return a new list holding the regexes that passed the filter."""
        return list(self._regexes)

    def get_filter(self):
        """Return a shallow copy of the effective filter (defaults filled in)."""
        return dict(self._dict)

    def __repr__(self):
        return f"Distribution({self._dict})"

    def _combine(self, other, set_op, pick_min, pick_max):
        """
        Build a new Distribution from two existing ones.

        set_op merges both "Tags" and "ExcludeTags"; pick_min and pick_max
        choose the resulting "MinRarity" and "MaxRarity" respectively.
        """
        mine, theirs = self._dict, other._dict
        return Distribution(
            {
                "Tags": set_op(mine["Tags"], theirs["Tags"]),
                "ExcludeTags": set_op(mine["ExcludeTags"], theirs["ExcludeTags"]),
                "MinRarity": pick_min(mine["MinRarity"], theirs["MinRarity"]),
                "MaxRarity": pick_max(mine["MaxRarity"], theirs["MaxRarity"]),
            }
        )

    def __and__(self, other):
        """Intersect tag sets and narrow the rarity range to the overlap."""
        if not isinstance(other, Distribution):
            return NotImplemented
        return self._combine(other, lambda a, b: a & b, max, min)

    def __or__(self, other):
        """Unite tag sets and widen the rarity range to cover both."""
        if not isinstance(other, Distribution):
            return NotImplemented
        return self._combine(other, lambda a, b: a | b, min, max)

    def __iand__(self, other):
        """Support `&=`; yields a new Distribution rather than mutating self."""
        if not isinstance(other, Distribution):
            return NotImplemented
        return self & other

    def __ior__(self, other):
        """Support `|=`; yields a new Distribution rather than mutating self."""
        if not isinstance(other, Distribution):
            return NotImplemented
        return self | other
59 changes: 59 additions & 0 deletions pywhat/helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Helper utilities"""
import collections.abc
import json
import os.path


class AvailableTags:
    """Collects every tag that appears in the loaded regex definitions."""

    def __init__(self):
        # Flatten the "Tags" entry of each regex into one set of tags.
        self.tags = {tag for entry in load_regexes() for tag in entry["Tags"]}

    def get_tags(self):
        """Return the set of collected tags."""
        return self.tags


class InvalidTag(Exception):
    """Raised when Distribution() is given a filter naming non-existent tags."""


def load_regexes() -> list:
    """Parse Data/regex.json, resolved relative to this module's directory."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    regex_file = os.path.join(module_dir, "Data/regex.json")
    with open(regex_file, "r", encoding="utf-8") as handle:
        return json.load(handle)


class CaseInsensitiveSet(collections.abc.Set):
    """
    Read-only set whose string members are stored and looked up lower-cased.

    Non-string members are stored and compared unchanged.
    """

    def __init__(self, iterable=None):
        if iterable is None:
            self._elements = set()
        else:
            self._elements = {self._lower(item) for item in iterable}

    def _lower(self, value):
        """Return value lower-cased when it is a str, otherwise unchanged."""
        if isinstance(value, str):
            return value.lower()
        return value

    def __contains__(self, value):
        return self._lower(value) in self._elements

    def __iter__(self):
        return iter(self._elements)

    def __len__(self):
        return len(self._elements)

    def __repr__(self):
        return repr(self._elements)

    def issubset(self, other):
        """Return True when every member of this set is contained in other."""
        return all(member in other for member in self)
37 changes: 24 additions & 13 deletions pywhat/identifier.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,47 @@
import os.path
from typing import List, Optional

from pywhat.distribution import Distribution
from pywhat.magic_numbers import FileSignatures
from pywhat.nameThatHash import Nth
from pywhat.regex_identifier import RegexIdentifier


class Identifier:
def __init__(self):
self.regex_id = RegexIdentifier()
self.file_sig = FileSignatures()
self.name_that_hash = Nth()

def identify(self, text: str, api=False) -> dict:
def __init__(self, distribution: Optional[Distribution] = None):
if distribution is None:
self.distribution = Distribution()
else:
self.distribution = distribution
self._regex_id = RegexIdentifier()
self._file_sig = FileSignatures()
self._name_that_hash = Nth()

def identify(self, text: str, dist: Distribution = None,
api=False) -> dict:
if dist is None:
dist = self.distribution
identify_obj = {}

magic_numbers = None
if not api and self.file_exists(text):
magic_numbers = self.file_sig.open_binary_scan_magic_nums(text)
text = self.file_sig.open_file_loc(text)
if not api and self._file_exists(text):
magic_numbers = self._file_sig.open_binary_scan_magic_nums(text)
text = self._file_sig.open_file_loc(text)
identify_obj["File Signatures"] = magic_numbers
else:
text = [text]

if not magic_numbers:
# If file doesn't exist, check to see if the inputted text is
# a file in hex format
identify_obj["File Signatures"] = self.file_sig.check_magic_nums(text)
identify_obj["Regexes"] = self.regex_id.check(text)
identify_obj["File Signatures"] = self._file_sig.check_magic_nums(text)

identify_obj["Regexes"] = self._regex_id.check(text, dist)

# get_hashes takes a list of hashes, we split to give it a list
# identify_obj["Hashes"] = self.name_that_hash.get_hashes(text.split())
# identify_obj["Hashes"] = self._name_that_hash.get_hashes(text.split())

return identify_obj

def file_exists(self, text):
def _file_exists(self, text):
return os.path.isfile(text)
4 changes: 1 addition & 3 deletions pywhat/printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@ def pretty_print(self, text: dict):

to_out = ""

if text["File Signatures"]:
if text["File Signatures"] and text["Regexes"]:
to_out += "\n"
to_out += f"[bold #D7Afff]File Identified[/bold #D7Afff] with Magic Numbers {text['File Signatures']['ISO 8859-1']}."
to_out += f"\n[bold #D7Afff]File Description:[/bold #D7Afff] {text['File Signatures']['Description']}."
to_out += "\n"
if to_out:
console.print(to_out)

if text["Regexes"]:
to_out += "\n[bold #D7Afff]Possible Identification[/bold #D7Afff]"
Expand Down
18 changes: 10 additions & 8 deletions pywhat/regex_identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,21 @@
import json
import os
import re
from typing import Optional

from pywhat.distribution import Distribution


class RegexIdentifier:
def __init__(self):
path = "Data/regex.json"
fullpath = os.path.join(os.path.dirname(os.path.abspath(__file__)), path)
with open(fullpath, "r", encoding="utf8") as myfile:
self.regexes = json.load(myfile)
self.distribution = Distribution()

def check(self, text):
def check(self, text, distribution: Optional[Distribution] = None):
if distribution is None:
distribution = self.distribution
matches = []
for txt in text:
for reg in self.regexes:
for reg in distribution.get_regexes():
matched_regex = re.search(reg["Regex"], txt, re.UNICODE)

if matched_regex:
Expand All @@ -28,8 +30,8 @@ def check(self, text):
codes_path = "Data/phone_codes.json"
codes_fullpath = os.path.join(
os.path.dirname(os.path.abspath(__file__)), codes_path)
with open(codes_fullpath) as f:
codes = json.load(f)
with open(codes_fullpath, "r", encoding="utf-8") as myfile:
codes = json.load(myfile)

locations = []
for code in codes:
Expand Down
71 changes: 66 additions & 5 deletions pywhat/what.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,49 @@
import sys

import click
from rich.console import Console

from pywhat import identifier, printer
from pywhat.distribution import Distribution
from pywhat.helper import AvailableTags, InvalidTag


def print_tags(ctx, opts, value):
    """
    Option callback: when value is truthy, print every available tag in
    sorted order, one per line, then exit. Does nothing otherwise.
    """
    if not value:
        return
    listing = "\n".join(sorted(AvailableTags().get_tags()))
    Console().print("[bold #D7Afff]" + listing + "[/bold #D7Afff]")
    sys.exit()


def parse_options(rarity, include_tags, exclude_tags):
    """
    Turn the command-line filter options into a Distribution.

    rarity is a 'min:max' string in which either side may be left blank;
    include_tags and exclude_tags are comma separated tag lists. Any option
    may be None, in which case it contributes nothing to the filter.

    Prints a message and exits with status 1 when the rarity range is
    malformed or when Distribution rejects the tags with InvalidTag.
    """
    filters = {}

    if rarity is not None:
        bounds = rarity.split(":")
        if len(bounds) != 2:
            print("Invalid rarity range format ('min:max' expected)")
            sys.exit(1)
        try:
            for key, raw in zip(("MinRarity", "MaxRarity"), bounds):
                # A blank or whitespace-only side leaves that bound unset.
                if raw and not raw.isspace():
                    filters[key] = float(raw)
        except ValueError:
            print("Invalid rarity argument (float expected)")
            sys.exit(1)

    if include_tags is not None:
        filters["Tags"] = [tag.strip() for tag in include_tags.split(",")]
    if exclude_tags is not None:
        filters["ExcludeTags"] = [tag.strip() for tag in exclude_tags.split(",")]

    try:
        return Distribution(filters)
    except InvalidTag:
        print(
            "Passed tags are not valid.\n"
            "You can check available tags by using: 'pywhat --tags'"
        )
        sys.exit(1)


@click.command(
Expand All @@ -8,14 +52,26 @@
)
)
@click.argument("text_input", required=True)
def main(text_input):
@click.option("-t", "--tags", is_flag=True, expose_value=False, callback=print_tags, help="Show available tags and exit.")
@click.option("-r", "--rarity", help="Filter by rarity. This is in the range of 0:1. To filter only items past 0.5, use 0.5: with the colon on the end.")
@click.option("-i", "--include_tags", help="Only print entries with included tags.")
@click.option("-e", "--exclude_tags", help="Exclude tags.")
def main(text_input, rarity, include_tags, exclude_tags):
"""
What - Identify what something is.\n
Made by Bee https://twitter.com/bee_sec_san\n
https://github.com/bee-san\n
Filtration:\n
--rarity min:max\n
Only print entries with rarity in range [min,max]. min and max can be omitted.\n
--include_tags list\n
Only include entries containing at least one tag in a list. List is a comma separated list.\n
--exclude_tags list\n
Exclude specified tags. List is a comma separated list.\n
Examples:
* what "HTB{this is a flag}"
Expand All @@ -24,22 +80,27 @@ def main(text_input):
* what -- 52.6169586, -1.9779857
* what --rarity 0.6: [email protected]
Your text must either be in quotation marks, or use the POSIX standard of "--" to mean "anything after -- is textual input".
"""

what_obj = What_Object()
what_obj = What_Object(
parse_options(rarity, include_tags, exclude_tags)
)
identified_output = what_obj.what_is_this(text_input)

p = printer.Printing()
p.pretty_print(identified_output)


class What_Object:
def __init__(self):
self.id = identifier.Identifier()
def __init__(self, distribution):
self.id = identifier.Identifier(distribution)

def what_is_this(self, text: str) -> dict:
def what_is_this(
self, text: str) -> dict:
"""
Returns a Python dictionary of everything that has been identified
"""
Expand Down
Loading

0 comments on commit 93b3bfa

Please sign in to comment.