#! /usr/bin/env python3
"""
Processes JSON formatted input(s) containing lists of filter strings for use with Windows FSRM.
Run with the -h parameter for usage details.
Notes:
    This script assumes that the filters will be applied to a case INSENSITIVE file system using Microsoft Windows' FSRM.
    The regex optimization section of this script is case INSENSITIVE (in two places). Everything else in this script is case sensitive.
    Filter processing is lossless. Once a filespec is added to "filters" it will never be removed. You have these options for recourse:
        1. EITHER add the exact matching string to extended-data.excludefromfilters OR match with regex and replace with an empty string (best, easiest)
        2. start over and use the --reloadfromsecondaryfilters option; the secondary may still contain the filter you want to remove, so you may need option 1 anyway
        3. build an empty skeleton JSON file and load it with a fresh set of filters AND manage your secondary input filters manually (hardest)
    Regex whole string matches may use empty strings "" as their replacement.
    Pretty print formatted JSON will sometimes fail on PowerShell v4 and below when Get-Content is pipelined to ConvertFrom-Json.
        In PowerShell do this: Get-Content -Raw | ConvertFrom-Json
        The -Raw flag fixes the problem by forcing all JSON to be read into a single string; otherwise each line goes into an array of strings.
        Alternative: use the -c/--compactjson option in this program.
    All input JSON data is assumed to be encoded in UTF-8 Unicode format because many ransomware file names have Unicode characters.
        all string operations are Unicode conforming
        all internal string handling is 100% Unicode, all wide all the time
        the input files MAY use ascii escaping of Unicode characters
        the JSON output file WILL use ascii escaping of Unicode characters
        text output files are non-escaped Unicode
    All output files (JSON and text files) are encoded in UTF-8 Unicode but with no BOM marker
        (a BOM should never be necessary for UTF-8).
"""
# version 3.1.1 - minor bugfix runtimecontrol to runcontrol in defs, primary JSON attribute name changes, clarify description + notes at top
import json, re, urllib.request, datetime, pathlib, copy, argparse, stat, logging
# initialize pseudo constants
INTERNAL_NAME = 'AntiransomwareFiltersMerge.py'
VERSION = '3.1.1'
DEBUG_SIMULATE_COMMAND_LINE = False
# for testing only - a way to fake command line args
if DEBUG_SIMULATE_COMMAND_LINE:
    print('\nXXXXX: Warning - debugging command line override is enabled in source code, replaces command line arguments pre-parsing :XXXXX')
    import sys
    # sys.argv = [sys.argv[0], '-h']
    sys.argv = [sys.argv[0], '-a', '-n', 'pinkard', '-g']
    # sys.argv = [sys.argv[0], '-v', '-f', '-a', '-t', '1', '-o', '-s', 'combined-20190925.json']  # tested local secondary
    # sys.argv = [sys.argv[0], '-k', '-n', 'blizard']  # generated a skeleton, skipped blizard, correct behavior
    # sys.argv = [sys.argv[0], '-d', '-o', '-w', 'C:\\Temp\\scratch']
def InitializeApp()->dict:
    '''Performs all program initialization EXCEPT the output JSON dictionary.
    Notes:
        !! hard coding note !! - too many hard coded items to list individually, most hard coding is here (where it belongs)
        evaluates command line options with argparse, generates time/date strings, verifies working directory and optional input JSON
        override file and secondary input JSON file paths, sets verbosity for logging, loads the runtime control baton
    Inputs:
        none passed in
        argv
    Outputs:
        returns - a runtime control baton (dictionary) loaded with all runtime control items including operational flags, file name wildcards,
        working directory, URL, output file name, etc.
    '''
    # init our runtime control baton dict
    runcontrol = {}
    # just a simple line return visual cue that the app has started running, only cosmetic, a little separation from the prompt never hurts
    print()
    # setup logging
    consoleformatter = logging.Formatter(fmt='%(message)s - %(levelname)s')
    consolehandler = logging.StreamHandler()
    consolehandler.setFormatter(consoleformatter)
    runcontrol['log'] = logging.getLogger(INTERNAL_NAME)
    runcontrol['log'].addHandler(consolehandler)
    # command line parsing
    argsparser = argparse.ArgumentParser(allow_abbrev=False, description='Fighting ransomware every day: This program is an accessory to the FSRM-Anti-ransomware.ps1 PowerShell script, also available from us, that helps you fight ransomware on your Windows file servers using File Server Resource Manager (FSRM) file screening functionality. This program allows you to manage and extend a basic combined.json file (updated frequently and available from fsrm.experiant.ca) so that you can easily manage your allowed file names and special exceptions while keeping your file groups updated with the latest ransomware file specifications. We are not affiliated in any way with Experiant but we very deeply appreciate all their efforts to keep us safe from ransomware by keeping track of the latest ransomware threats in the wild. Getting Started: Use this program to generate a skeleton, rename the skeleton file by changing the substring in the middle, load that skeleton with additional information such as allowed file specs and exception file names, then use this program to merge your customized information with updated filters from third parties like Experiant. Use the fnamesubstring option to match your file name or just use the default of "extended" as your substring. This program will never delete any filters you\'ve added unless you refresh all the filters from the secondary source.', epilog='This is part of an important security implementation that leverages Microsoft\'s FSRM. Make sure you understand the options in both FSRM and this program. Take your time. Be deliberate.')
    argsparser.add_argument('-n', '--fnamesubstring', type=str, default='extended', help='substring used both for wildcard file name matching of the input primary JSON file and for naming the output JSON file, default substring is "extended", format example: "combined-extended-20191031_123456.json"')
    argsparser.add_argument('-p', '--primaryjsonoverride', type=str, help='primary JSON input file, overrides the default primary file found through wildcard matching with fnamesubstring, use a full file path name, does not affect the output JSON file name')
    filtersourcegroup = argsparser.add_mutually_exclusive_group()
    filtersourcegroup.add_argument('-g', '--groomingonly', default=False, action='store_true', help='only performs grooming on the primary JSON, useful to enable or disable optimization without adding new filters, secondary JSON will be ignored, may not be used with --reloadfromsecondaryfilters')
    filtersourcegroup.add_argument('-r', '--reloadfromsecondaryfilters', default=False, action='store_true', help='refresh "filters" from secondary, only secondary JSON filters will be processed, primary JSON filters will be lost, all other data from primary JSON will be carried forward, be careful with this option because you will lose any of your manually added filters, may not be used with --groomingonly')
    secondaryjsongroup = argsparser.add_mutually_exclusive_group()
    secondaryjsongroup.add_argument('-u', '--url', type=str, default='https://fsrm.experiant.ca/api/v1/combined', help='download URL for secondary JSON filters data, program default is Experiant\'s up to date combined.json from: https://fsrm.experiant.ca/api/v1/combined, may not be used with --localsecondaryjson or --groomingonly')
    secondaryjsongroup.add_argument('-s', '--localsecondaryjson', type=str, default=None, help='local file for secondary JSON input, alternative to downloading, use an explicit path and name, may not be used with --url or --groomingonly')
    argsparser.add_argument('-o', '--optimizefilters', default=False, action='store_true', help='aggressive optimization of filters using regex that you\'ve added to the primary input JSON, use with caution, test results before moving into production')
    argsparser.add_argument('-c', '--compactjson', default=False, action='store_true', help='disables pretty print JSON, only necessary if your PowerShell JSON parsing is failing')
    argsparser.add_argument('-a', '--ancillarytextfiles', default=False, action='store_true', help='write ancillary text files')
    argsparser.add_argument('-k', '--skeletononly', default=False, action='store_true', help='creates a minimally initialized JSON file, FNAMESUBSTRING will always be set to "skeleton", any FNAMESUBSTRING specified on the command line will be ignored')
    argsparser.add_argument('-w', '--workingdirectory', type=str, default='', help='working directory for source and destination of files, if not specified then uses the OS current working directory')
    outputwritecontrol = argsparser.add_mutually_exclusive_group()
    outputwritecontrol.add_argument('-d', '--dryrun', default=False, action='store_true', help='dry run, output files are not written, may not be used with --force')
    outputwritecontrol.add_argument('-f', '--force', default=False, action='store_true', help='force output files to be written even if no changes were detected, may not be used with --dryrun')
    # note: we're going to rotate files by default, this program could be called for years without being checked, we don't want to fill their hard drive with files
    # I see it as the user implicitly approving this behavior
    argsparser.add_argument('-t', '--rotatefilescount', default=5, type=int, help='keep ROTATEFILESCOUNT most recent files then rotate, program default: 5, 0 to disable, (integer expected)')
    argsparser.add_argument('-V', '--version', default=False, action='store_true', help='display version and exit')
    verbositycontrol = argsparser.add_mutually_exclusive_group()
    verbositycontrol.add_argument('-v', '--verbose', default=False, action='store_true', help='verbose output - all debug messages displayed, may not be used with --quiet')
    verbositycontrol.add_argument('-q', '--quiet', default=False, action='store_true', help='quiet output - only warnings and errors displayed, may not be used with --verbose')
    cmdlineargs = argsparser.parse_args()
    ## special case -V/--version, mimics -h raising of SystemExit ##
    if cmdlineargs.version:
        runcontrol['log'].setLevel(logging.DEBUG)
        runcontrol['log'].info(INTERNAL_NAME+'\nVersion:'+VERSION+'\nRun with -h / --help parameter for usage details.')
        raise SystemExit
    # set logging level, be explicit, default is to show info and above
    if cmdlineargs.verbose:
        runcontrol['log'].setLevel(logging.DEBUG)
    elif cmdlineargs.quiet:
        runcontrol['log'].setLevel(logging.WARNING)
    else:
        runcontrol['log'].setLevel(logging.INFO)
    runcontrol['log'].debug('initializing application runtime control')
    # tell the user about any significant behavior they should expect, especially dry run since they may miss an updated filter list
    # these should be warnings
    if cmdlineargs.dryrun:
        runcontrol['log'].warning('--dryrun specified on command line, no files will be written')
    if cmdlineargs.groomingonly:
        runcontrol['log'].warning('--groomingonly specified on command line, reprocessing primary filters only')
    if cmdlineargs.reloadfromsecondaryfilters:
        runcontrol['log'].warning('--reloadfromsecondaryfilters specified on command line, reloading from secondary filter source only, primary filters will be discarded')
    if cmdlineargs.skeletononly:
        runcontrol['log'].warning('--skeletononly specified on command line, creating an empty primary JSON file only, example name format: "combined-skeleton-20100101_123456.json"')
    if cmdlineargs.optimizefilters:
        runcontrol['log'].warning('--optimizefilters specified on command line, applying regex optimizations to merged filters, test your results thoroughly')
    # just info
    if cmdlineargs.primaryjsonoverride:
        runcontrol['log'].info('--primaryjsonoverride specified on command line, primary input JSON file is: ' + cmdlineargs.primaryjsonoverride)
    if cmdlineargs.localsecondaryjson:
        runcontrol['log'].info('--localsecondaryjson specified on command line, secondary input JSON file is: ' + cmdlineargs.localsecondaryjson)
    if cmdlineargs.force:
        runcontrol['log'].info('--force specified on command line, files will be written even if no changes are detected')
    # a little verbose but we'll put all our fname parts here, easy to customize, easy to convert to command line parms
    fnamesep = '-'
    fnameprefix = 'combined'  # this matches the prefix for Experiant supplied JSON files
    fnamesubstring = cmdlineargs.fnamesubstring
    if cmdlineargs.skeletononly:
        fnamesubstring = 'skeleton'
    fnametimestampwildcard = '????????_??????'  # this matches our file name datestamp format
    fnamejsonext = '.json'
    fnametextprefix = 'filters'
    fnameonepersubstring = 'one_per_line_utf8'
    fnamescrptfrmtsubstring = 'script_formatted_utf8'
    fnametextext = '.txt'
    # now see if our directories and input files exist, no sense doing anything else until we know
    # note: Path.exists() only returns a bool, so we must raise explicitly when a path is missing
    try:
        runcontrol['workdirpath'] = pathlib.Path(cmdlineargs.workingdirectory).resolve()
        if not runcontrol['workdirpath'].exists():
            raise FileNotFoundError(str(runcontrol['workdirpath']))
    except Exception:
        runcontrol['log'].error('cannot find working directory: ' + cmdlineargs.workingdirectory)
        raise
    else:
        # ok, our starting point path exists, now we look for special cases too
        # primary json file
        if cmdlineargs.primaryjsonoverride:
            try:
                fnamepathtest = pathlib.Path(cmdlineargs.primaryjsonoverride).resolve()
                if not fnamepathtest.exists():
                    raise FileNotFoundError(str(fnamepathtest))
            except Exception:
                runcontrol['log'].error('primary input JSON file specified on command line not found')
                raise
            else:
                runcontrol['primaryjsonoverridepath'] = fnamepathtest
        # secondary json file, don't look for it if the skeletononly option is enabled
        if cmdlineargs.localsecondaryjson and not cmdlineargs.skeletononly:
            try:
                fnamepathtest = pathlib.Path(cmdlineargs.localsecondaryjson).resolve()
                if not fnamepathtest.exists():
                    raise FileNotFoundError(str(fnamepathtest))
            except Exception:
                runcontrol['log'].error('secondary JSON file specified on command line not found')
                raise
            else:
                runcontrol['secondaryjsonpath'] = fnamepathtest
# time and date strings
nowtimezulu = datetime.datetime.utcnow()
# time date string for json embedded data
runcontrol['nowstringzulu'] = nowtimezulu.strftime('%Y-%m-%dT%H:%M:%S.%fZ') # note: actually relying on the microseconds would be silly, it's just for completeness
# local time zone time date string for adding to file names
runcontrol['nowsubstrlocltz'] = nowtimezulu.replace(tzinfo=datetime.timezone.utc).astimezone(tz=None).strftime('%Y%m%d_%H%M%S')
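    # Worked example of the two stamps (values illustrative): for a run at
    # 2019-10-31 12:34:56.789012 UTC in a UTC-7 local zone,
    #   nowstringzulu   -> '2019-10-31T12:34:56.789012Z'  (embedded in the JSON)
    #   nowsubstrlocltz -> '20191031_053456'              (embedded in file names)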
    # wildcard for finding all matching input candidate JSON files, e.g. 'combined-extended-????????_??????.json'
    runcontrol['jsonfnamewildcard'] = (fnameprefix+fnamesep+fnamesubstring+fnamesep+fnametimestampwildcard+fnamejsonext)
    # full path for new output JSON file
    jsonopfname = (fnameprefix+fnamesep+fnamesubstring+fnamesep+runcontrol['nowsubstrlocltz']+fnamejsonext)
    runcontrol['opjsonpath'] = pathlib.Path(runcontrol['workdirpath'], jsonopfname).resolve()
    # full paths and matching wildcards for filters-only ancillary text files
    pathtemp = (fnametextprefix+fnamesep+fnamesubstring+fnamesep+fnameonepersubstring+fnamesep+runcontrol['nowsubstrlocltz']+fnametextext)
    runcontrol['opfiltersoneperlnpath'] = pathlib.Path(runcontrol['workdirpath'], pathtemp).resolve()
    runcontrol['opfiltersoneperlnwildcard'] = (fnametextprefix+fnamesep+fnamesubstring+fnamesep+fnameonepersubstring+fnamesep+fnametimestampwildcard+fnametextext)
    pathtemp = (fnametextprefix+fnamesep+fnamesubstring+fnamesep+fnamescrptfrmtsubstring+fnamesep+runcontrol['nowsubstrlocltz']+fnametextext)
    runcontrol['opfiltersPSlistpath'] = pathlib.Path(runcontrol['workdirpath'], pathtemp).resolve()
    runcontrol['opfiltersPSlistwildcard'] = (fnametextprefix+fnamesep+fnamesubstring+fnamesep+fnamescrptfrmtsubstring+fnamesep+fnametimestampwildcard+fnametextext)
    # keys for looping for file rotation, if you add an additional text file type then add the wildcard key to this list
    runcontrol['opfilterstextrotatekeys'] = ['opfiltersoneperlnwildcard', 'opfiltersPSlistwildcard']
    # full copy of the command line args object
    runcontrol['cmdlineargs'] = copy.deepcopy(cmdlineargs)
    runcontrol['log'].debug('runtime control baton initialized')
    return runcontrol
# initializes the output JSON data structure, there are quite a few hard coded items in here
def InitializeData(runcontrol:dict)->dict:
    '''This initializes the output JSON dictionary only.
    Notes:
        !! hard coding note !! - hard coded items in here are limited to formatting and pre-loading of the output JSON dictionary
    Inputs:
        runtime control baton
    Outputs:
        returns - initialized but otherwise empty output JSON dictionary
    '''
    runcontrol['log'].debug('initializing application data')
    EXTENDED_JSON_VERSION = 3.1  # format version of the JSON output file, change this any time the skeleton format is changed
    skeleton = dict.fromkeys(['api','lastUpdated','exceptions','filters','extended-data'])
    skeleton['api'] = dict.fromkeys(['format','file_group_count','extended-info'])
    skeleton['api']['extended-info'] = dict.fromkeys(['extended-version','SecondaryLastUpdated','thisfilename','optimizationwarning','thisfileoptimized'])
    skeleton['extended-data'] = dict.fromkeys(['excludefromfilters','deltasinceprev','regexsubstringsubs','regexsummarizations','losslesstracking'])
    skeleton['extended-data']['losslesstracking'] = dict.fromkeys(['addedfilters','removedfilters'])
    skeleton['api']['format'] = 'json'
    skeleton['api']['file_group_count'] = 0
    skeleton['api']['extended-info']['extended-version'] = EXTENDED_JSON_VERSION
    skeleton['api']['extended-info']['SecondaryLastUpdated'] = ''
    skeleton['api']['extended-info']['thisfilename'] = runcontrol['opjsonpath'].name
    skeleton['api']['extended-info']['optimizationwarning'] = 'Never manually edit the filters list in this file if the data is optimized! Use --groomingonly to un-optimize first. Never manually edit losslesstracking.addedfilters or losslesstracking.removedfilters; data loss is guaranteed if you do.'
    skeleton['api']['extended-info']['thisfileoptimized'] = False
    skeleton['lastUpdated'] = runcontrol['nowstringzulu']
    skeleton['exceptions'] = []
    skeleton['filters'] = []
    skeleton['extended-data']['excludefromfilters'] = []
    skeleton['extended-data']['deltasinceprev'] = []
    # list of dictionaries
    skeleton['extended-data']['regexsubstringsubs'] = []
    skeleton['extended-data']['regexsubstringsubs'].append(dict())
    # list of dictionaries
    skeleton['extended-data']['regexsummarizations'] = []
    skeleton['extended-data']['regexsummarizations'].append(dict())
    skeleton['extended-data']['losslesstracking']['addedfilters'] = []
    skeleton['extended-data']['losslesstracking']['removedfilters'] = []
    runcontrol['log'].debug('output JSON data structure initialized')
    return skeleton
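# For reference, a freshly initialized skeleton serializes to roughly this (abridged):
#   { "api": { "format": "json", "file_group_count": 0, "extended-info": {...} },
#     "lastUpdated": "<zulu timestamp>", "exceptions": [], "filters": [],
#     "extended-data": { "excludefromfilters": [], "deltasinceprev": [],
#                        "regexsubstringsubs": [{}], "regexsummarizations": [{}],
#                        "losslesstracking": { "addedfilters": [], "removedfilters": [] } } }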
def FindLoadPrimaryJSON(runcontrol:dict)->dict:
    '''This globs for, and then loads, the primary input JSON data.
    Notes:
        uses wildcard glob info from the runtime control baton
        assumes a sorted list of matched files, -1 index is newest
    Inputs:
        runtime control baton
    Outputs:
        returns - dictionary loaded with primary JSON data
    '''
    runcontrol['log'].debug('primary input JSON loading')
    # honor --primaryjsonoverride if it was given, otherwise glob and take the newest candidate
    if 'primaryjsonoverridepath' in runcontrol:
        infilelist = [runcontrol['primaryjsonoverridepath']]
    else:
        infilelist = list(runcontrol['workdirpath'].glob(runcontrol['jsonfnamewildcard']))  # list of pathlib.Path objects
        infilelist.sort()
    try:
        with open(infilelist[-1], 'r', encoding='utf-8') as infilejson:
            pjsondata = json.load(infilejson)
    except json.decoder.JSONDecodeError:
        runcontrol['log'].error('unable to parse JSON data from ' + infilejson.name)
        raise
    except (LookupError, OSError):
        # LookupError is when glob doesn't find anything and the -1 index is invalid, OSError will catch any related open or close exceptions
        runcontrol['log'].error('no primary input JSON candidate files found - working directory: '+runcontrol['workdirpath'].name+' wildcard mask: '+runcontrol['jsonfnamewildcard'])
        raise
    except Exception:
        runcontrol['log'].error('unhandled exception reading primary input JSON file - working directory: '+runcontrol['workdirpath'].name+' wildcard mask: '+runcontrol['jsonfnamewildcard'])
        raise
    else:
        runcontrol['log'].info('primary input JSON read from file: ' + infilelist[-1].name)
        return pjsondata
def FindLoadSecondaryJSON(runcontrol:dict)->dict:
    '''This downloads, or reads from file, updated filters.
    Notes:
        !! hard coding note !! - http header info is hard coded and formatted specifically for https://fsrm.experiant.ca needs
        either downloads via http or reads from file
        file load uses the specific file name passed in, no globbing
    Inputs:
        runtime control baton
    Outputs:
        returns - dictionary loaded with secondary JSON data
    '''
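    # The secondary JSON is assumed to look roughly like this (values hypothetical):
    #   { "api": {...}, "lastUpdated": "2019-09-25T...Z", "filters": ["*.locky", ...] }
    # only 'filters' and 'lastUpdated' are consumed downstream by this program.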
    runcontrol['log'].debug('secondary input JSON loading')
    if not runcontrol['cmdlineargs'].localsecondaryjson:
        runcontrol['log'].debug('secondary JSON data downloading from ' + runcontrol['cmdlineargs'].url)
        urlopener = urllib.request.build_opener()
        # hard coded header info, may need to be tweaked someday, Experiant definitely won't work without this
        urlopener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(urlopener)
        try:
            with urllib.request.urlopen(runcontrol['cmdlineargs'].url) as openedURL:
                rawdatadownload = openedURL.read()
                encoding = openedURL.info().get_content_charset('utf-8')
                sjsondata = json.loads(rawdatadownload.decode(encoding))
        except json.decoder.JSONDecodeError:
            runcontrol['log'].error('unable to parse secondary JSON data downloaded from ' + runcontrol['cmdlineargs'].url)
            raise
        except Exception:
            runcontrol['log'].error('secondary input JSON download failed, consider using a local secondary JSON data file instead')
            raise
        else:
            runcontrol['log'].info('secondary input JSON downloaded from: ' + runcontrol['cmdlineargs'].url)
            return sjsondata
    else:  # else use a local JSON file
        runcontrol['log'].debug('loading secondary JSON from file ' + runcontrol['secondaryjsonpath'].name)
        try:
            with open(runcontrol['secondaryjsonpath'], 'r', encoding='utf-8') as infilejson:
                sjsondata = json.load(infilejson)
        except json.decoder.JSONDecodeError:
            runcontrol['log'].error('unable to parse secondary JSON data from ' + runcontrol['secondaryjsonpath'].name)
            raise
        except OSError:
            runcontrol['log'].error('unable to read secondary input JSON file - specified file: '+runcontrol['secondaryjsonpath'].name)
            raise
        except Exception:
            runcontrol['log'].error('unhandled exception reading secondary input JSON file: '+runcontrol['secondaryjsonpath'].name)
            raise
        else:
            runcontrol['log'].info('secondary input JSON read from file: ' + runcontrol['secondaryjsonpath'].name)
            return sjsondata
def CombinedDataProcessing(opjson:dict, pconstjsondata:dict, sconstjsondata:dict, runcontrol:dict)->None:
    '''This handles all data processing work EXCEPT advanced optimizations.
    Notes:
        this processing will move filters between various keys in the data dictionary, but the filters are never lost, the entire process is lossless
    Inputs:
        initialized output JSON dictionary
        primary input JSON dictionary
        secondary input JSON dictionary
        runtime control baton
    Outputs:
        output JSON dictionary
    '''
    runcontrol['log'].debug('merging and processing all data')
    if pconstjsondata['api']['extended-info']['extended-version'] != opjson['api']['extended-info']['extended-version']:
        if pconstjsondata['api']['extended-info']['extended-version'] == 3:
            # update version 3 JSON data: copy 'allowed' to its new attribute name (only a name update, no transform) then drop the old key
            pconstjsondata['extended-data']['excludefromfilters'] = pconstjsondata['extended-data']['allowed']
            del(pconstjsondata['extended-data']['allowed'])
            pconstjsondata['extended-data']['losslesstracking'] = pconstjsondata['extended-data']['opttracking']  # an alias is fine here, the old key is dropped next
            del(pconstjsondata['extended-data']['opttracking'])
        else:
            runcontrol['log'].error('primary input JSON format is an unsupported version: ' + str(pconstjsondata['api']['extended-info']['extended-version']))
            raise ValueError
    # from this point forward we assume that the data is in the latest JSON format
    # copy the data that is never changed by this program, it is user edited in the JSON and static here
    # we need allowed, exceptions, and both regex lists
    opjson['extended-data']['excludefromfilters'].extend(pconstjsondata['extended-data']['excludefromfilters'])
    opjson['extended-data']['excludefromfilters'].sort()
    opjson['exceptions'].extend(pconstjsondata['exceptions'])
    opjson['exceptions'].sort()
    opjson['extended-data']['regexsubstringsubs'] = copy.deepcopy(pconstjsondata['extended-data']['regexsubstringsubs'])  # nested data, use deepcopy
    opjson['extended-data']['regexsummarizations'] = copy.deepcopy(pconstjsondata['extended-data']['regexsummarizations'])  # nested data, use deepcopy
    # added and removed filters will be pulled from pconstjsondata, we don't need copies because it is dynamically created with each pass if optimized
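    # Worked example of the lossless round trip (hypothetical filter names):
    #   previous run: 'a1.crypt' and 'a2.crypt' were optimized into 'a*.crypt', so the file holds
    #     filters: ['a*.crypt'], addedfilters: ['a*.crypt'], removedfilters: ['a1.crypt', 'a2.crypt']
    #   this run unwinds that: remove 'a*.crypt', restore 'a1.crypt' and 'a2.crypt', then merging
    #   and (optional) re-optimization start again from the original values.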
    if not runcontrol['cmdlineargs'].reloadfromsecondaryfilters:
        try:
            # dedupe the filters, someone may have added something to the filters manually and dupes will wreck the unwind
            opjson['filters'].extend(list(dict.fromkeys(pconstjsondata['filters'])))
            runcontrol['log'].info(str(len(pconstjsondata['filters'])) + ' filters read from primary input JSON')
        except KeyError:
            runcontrol['log'].error('no filters attribute in input JSON data - mandatory attribute, may be empty but must exist')
            raise
        except Exception:
            runcontrol['log'].error('unexpected error in primary JSON filters data')
            raise
        else:
            # unwind previous optimizations, if any
            if len(pconstjsondata['extended-data']['losslesstracking']['removedfilters']) > 0:
                # defensively dedupe the added filters, shouldn't need to do this UNLESS someone edits the file, dupes will blow up
                localaddedlist = list(dict.fromkeys(pconstjsondata['extended-data']['losslesstracking']['addedfilters']))
                # remove previously added optimized filters first, then put original filters back, the filters list will be lossless of original values
                # un-add the added filters first in case there was a collision with the removed filters, assumes the filters list was deduped
                for filterstring in localaddedlist:
                    opjson['filters'].remove(filterstring)
                # un-remove - now restore all previously removed filters, includes allowed filters that were removed
                opjson['filters'].extend(pconstjsondata['extended-data']['losslesstracking']['removedfilters'])
                opjson['filters'] = list(dict.fromkeys(opjson['filters']))
    else:
        runcontrol['log'].warning('--reloadfromsecondaryfilters option specified, primary JSON filters will not be used, all other primary JSON data is carried over')
    # merge with secondary if necessary
    if not runcontrol['cmdlineargs'].groomingonly:
        # we can't ask forgiveness for this one, an empty filters list would fail silently
        try:
            # two tests in one: the -1 index will blow up if the list is missing or empty, and we sanity check that the input is a list of strings too
            if not isinstance(sconstjsondata['filters'][-1], str):
                raise TypeError('secondary filters must be strings')
        except Exception:  # expect KeyError/IndexError for non-existent and empty respectively, possibly TypeError too, but we'll intentionally catch everything and let it traceback
            runcontrol['log'].error('no secondary filters found or invalid secondary filters found, check secondary input JSON data')
            raise
        opjson['filters'].extend(sconstjsondata['filters'])
        opjson['api']['extended-info']['SecondaryLastUpdated'] = sconstjsondata['lastUpdated']
        runcontrol['log'].info(str(len(sconstjsondata['filters'])) + ' secondary JSON filters merged')
    else:
        runcontrol['log'].debug('secondary JSON filters not used due to the --groomingonly option')
    # finally dedupe our newly built filters list
    opjson['filters'] = list(dict.fromkeys(opjson['filters']))
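    # note: list(dict.fromkeys(...)) is the order-preserving dedupe idiom used throughout, e.g.
    #   list(dict.fromkeys(['a.txt', 'b.txt', 'a.txt'])) -> ['a.txt', 'b.txt']
    # (dict keys keep insertion order in Python 3.7+)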
    runcontrol['log'].info(str(len(opjson['filters'])) + ' merged and deduped filters pre-optimization')
    # process allowed fspecs, remove from filters, add to removed
    for filterstring in opjson['extended-data']['excludefromfilters']:
        if filterstring in opjson['filters']:
            opjson['extended-data']['losslesstracking']['removedfilters'].append(filterstring)
            opjson['filters'].remove(filterstring)
    runcontrol['log'].info(str(len(opjson['filters'])) + ' filters after processing allowed fspecs')
    # OPP TEE M'EYES - optimize if requested
    if runcontrol['cmdlineargs'].optimizefilters:
        FiltersOptimization(opjson, runcontrol)
        opjson['api']['extended-info']['thisfileoptimized'] = True
    # wrap it up
    opjson['filters'].sort()
    opjson['api']['file_group_count'] = len(opjson['filters'])
    # in case allowed fspecs were used we should sort the removed filters
    opjson['extended-data']['losslesstracking']['removedfilters'].sort()
    runcontrol['log'].info(str(len(opjson['filters'])) + ' filters for final output (post-optimization if any)')
    # generate a list of filters that are new since the last run
    pconstjsondata = copy.deepcopy(pconstjsondata)  # scratch local copy; the earlier version migration may have mutated the caller's dict (probably vestigial, kept for safety)
    if runcontrol['cmdlineargs'].reloadfromsecondaryfilters:
        pconstjsondata['filters'] = []  # special case, --reloadfromsecondaryfilters blindly reloads a new list of filters
    for filterstring in opjson['filters']:
        if filterstring not in pconstjsondata['filters']:
            opjson['extended-data']['deltasinceprev'].append(filterstring)
    runcontrol['log'].info(str(len(opjson['extended-data']['deltasinceprev'])) + ' new and/or modified filters since last run')
    # check for changes to the filters, exceptions, and allowed lists; if no changes then we don't need to write new output files, set the flag to dry run
    # if grooming only (which ignores secondary inputs) then we must write
    # if we're forcing output of new JSON then we must write, the user may just want a new datestamp or whatever
    # we could simplify this boolean, but let's shoot for clarity instead (non-inverted output NAND)
    if (not runcontrol['cmdlineargs'].groomingonly) and (not runcontrol['cmdlineargs'].force):
        if (opjson['filters'] == pconstjsondata['filters']) and (opjson['exceptions'] == pconstjsondata['exceptions']) and (opjson['extended-data']['excludefromfilters'] == pconstjsondata['extended-data']['excludefromfilters']):
            runcontrol['log'].warning('Data files will not be written - new filters, allowed, and exceptions all match original input')
            runcontrol['cmdlineargs'].dryrun = True
    runcontrol['log'].debug('data processing completed')
def FiltersOptimization(jsondat:dict, runcontrol:dict)->None:
    '''This handles advanced optimizations only.
    Notes:
        this optimization will move filters between various keys in the data dictionary, but the filters are never lost, the entire process is lossless
        optimizations are only as good as the regex strings found in the JSON data, we do our best to tell the user about the bad ones
        during optimization there may be a lot of messages generated, they're OK, optimizations can produce a lot of duplicates and rejections
        the substring optimization only matches once on the first regex it hits on, the rest will be ignored, to do in the future perhaps
        same story for the whole string summarizations but they should only match once anyway if the regex was done right
        !! hard coding note !! - the regex compilation is hard coded to ignore case
    Inputs:
        complete JSON dictionary that only needs to be optimized
        runtime control baton
    Outputs:
        optimized JSON dictionary
    '''
    runcontrol['log'].info('applying optimizations')
    runcontrol['log'].debug('optimizations note: rejections are normal, reevaluate regex as needed, especially "stardotstar-prevent"; "already-in-filters" can usually be ignored')
    regexsumsmatchinglist = []  # a list of tuples (pre-compiled regex, regex string, summarization string)
    regexsubstringsubsmatchinglist = []  # a list of tuples (pre-compiled regex, regex string, substitution substring)
    # compile regex summarizations and build tuples
    # !! note: skips invalid regex strings, it is safe to fail any string since this is an optimization feature
    for reggydict in jsondat['extended-data']['regexsummarizations']:
        for key in reggydict:
            try:
                regexsumsmatchinglist.append((re.compile(key, re.IGNORECASE), key, reggydict[key]))
            except Exception:
                runcontrol['log'].warning('invalid regex string - skipping summarization for: ' + key)
    # compile regex substring substitutions and build tuples
    # !! note: skips invalid regex strings, it is safe to fail any string since this is an optimization feature
    for reggydict in jsondat['extended-data']['regexsubstringsubs']:
        for key in reggydict:
            try:
                regexsubstringsubsmatchinglist.append((re.compile(key, re.IGNORECASE), key, reggydict[key]))
            except Exception:
                runcontrol['log'].warning('invalid regex string - skipping substring substitution for: ' + key)
    # this for loop doesn't modify jsondat['filters'], the actual string swaps happen later
    for astring in jsondat['filters']:
        ## init this pass ##
        # make a working copy of the string that may be modified at any step
        fspecstring = astring
        # control and message lists
        optflag = False  # an optimization occurred
        denyopt = False  # indicates a warning was generated and that the optimization was blocked/denied
        reggymatches = []  # list of optimizations performed, for verbose output
        optwarnings = []  # list of warnings generated for denial messages
        ## optimizations ##
        # make substring substitutions, !!! substring matches only first occurrence !!!
        for reggy in regexsubstringsubsmatchinglist:
            if reggy[0].findall(fspecstring):
                optflag = True
                fspecstring = reggy[0].sub(reggy[2], fspecstring, count=1)
                reggymatches.append(reggy[1])
        # make summarization replacements
        # NOTE: assumes wildcards and other substring replacements have already been applied, important
        for reggy in regexsumsmatchinglist:  # iterates through the list of tuples with regex and summarization info
            if reggy[0].findall(fspecstring):
                optflag = True
                fspecstring = reggy[2]
                reggymatches.append(reggy[1])
        ## safeties ##
        # once denyopt has been set we can stop checking for additional reasons, very unlikely it will be more than one reason anyway
        if optflag:
            # stardotstar prevention
            if fspecstring == "*.*":
                denyopt = True
                optwarnings.append('stardotstar-prevent')
            # make sure we didn't regenerate an allowed fspec
            elif fspecstring in jsondat['extended-data']['excludefromfilters']:
                denyopt = True
                optwarnings.append('in-excludefromfilters')
            # test for optimization to same, dupe
            elif astring == fspecstring:
                denyopt = True
                optwarnings.append('repeat-of-self')
            elif fspecstring in jsondat['filters']:
                denyopt = True
                optwarnings.append('already-in-filters')
        reggymatchesmsg = ', '.join(reggymatches)
        # if denied then tell them why
        if denyopt:
            optwarningsmsg = ', '.join(optwarnings)
            runcontrol['log'].debug(astring+' -> '+fspecstring+' optimization rejected, matching regex: '+reggymatchesmsg+' reject reasons: '+optwarningsmsg)
        # if the fspec has been optimized and it didn't trigger a deny
        if optflag and not denyopt:
            runcontrol['log'].debug(astring + ' >-optimized to-> ' + fspecstring + ' (matched regex: ' + reggymatchesmsg + ')')
            jsondat['extended-data']['losslesstracking']['removedfilters'].append(astring)
            if len(fspecstring) > 0:  # don't add if the optimization is an empty string
                jsondat['extended-data']['losslesstracking']['addedfilters'].append(fspecstring)
    # end of the optimization loop
    # only optimization uses jsondat['extended-data']['losslesstracking']['addedfilters'], if there are any items in the list then we found something to optimize
    if len(jsondat['extended-data']['losslesstracking']['addedfilters']) > 0:
        # we expect lots of dupes in the added list because one of the goals is to summarize a bunch of those filters, dedupe before we do anything
        jsondat['extended-data']['losslesstracking']['addedfilters'] = list(dict.fromkeys(jsondat['extended-data']['losslesstracking']['addedfilters']))
        jsondat['extended-data']['losslesstracking']['addedfilters'].sort()
        # sort the removed so they're pretty
        jsondat['extended-data']['losslesstracking']['removedfilters'].sort()
        # there should be no dupes in the removed list, processing 'excludefromfilters' is not an optimization (1 to 1 relationship) and should have already been done, no need to dedupe here
        for astring in jsondat['extended-data']['losslesstracking']['removedfilters']:
            if astring in jsondat['filters']:
                jsondat['filters'].remove(astring)
        # now we just add the optimizations, dedupe (optimization leaves lots of dupes in the filters), and sort
        jsondat['filters'].extend(jsondat['extended-data']['losslesstracking']['addedfilters'])
        jsondat['filters'] = list(dict.fromkeys(jsondat['filters']))
        jsondat['filters'].sort()
def WriteOpJSON(opjson:dict, runcontrol:dict)->None:
    '''This writes the updated JSON data to file.
    Notes:
        !! hard coding note !! - the output encoding is hard coded to utf-8 text, JSON dumping escapes Unicode characters
        don't let the ensure_ascii in the JSON dump fool you, it ensures proper escaping but we're Unicode conforming from end to end
        do not sort the JSON, we're trying to keep all the attributes in the same order that we set them up in the init
    Inputs:
        JSON dictionary
        runtime control baton
    Outputs:
        none
        returns - none
    '''
    runcontrol['log'].debug('writing output JSON file')
    try:
        if runcontrol['cmdlineargs'].compactjson:
            jsondatstring = json.dumps(opjson, ensure_ascii=True)
        else:
            jsondatstring = json.dumps(opjson, ensure_ascii=True, sort_keys=False, indent=2)
    except Exception:  # from json.dumps, likely ValueError, could be TypeError, catch 'em all, any exception that's not OSError will be from JSON serializing
        runcontrol['log'].error('error serializing output JSON data or other unhandled exception, output file will not be written')
        raise
    try:
        with open(runcontrol['opjsonpath'], 'w', encoding='utf-8') as outfilejson:
            outfilejson.write(jsondatstring)
    except OSError:
        runcontrol['log'].error('failed to write output JSON file ' + str(runcontrol['opjsonpath']))
        raise
    except Exception:
        runcontrol['log'].error('unhandled exception writing output JSON file ' + str(runcontrol['opjsonpath']))
        raise
    else:
        runcontrol['log'].info('JSON output data written to file: ' + runcontrol['opjsonpath'].name)
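# Example of the ascii escaping noted above (illustrative):
#   json.dumps('λ.txt', ensure_ascii=True)  ->  '"\\u03bb.txt"'
# while the ancillary text files below write the raw Unicode characters instead.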
def WriteAncillaryTextFiles(opjson:dict, runcontrol:dict)->None:
    '''This writes the ancillary text files. These files only contain the 'filters'.
    Notes:
        writes only the filters from the output JSON data
        files are encoded with non-escaped utf-8 Unicode, no BOM
        one text file has one filter per line
        the other text file has filters in a format specifically for PowerShell scripts, wrapped in double quotes and comma separated
        to do - could a join of some sort work for the script format? (see the sketch after this function)
    Inputs:
        JSON dictionary
        runtime control baton
    Outputs:
        none
        returns - none
    '''
    try:
        with open(runcontrol['opfiltersoneperlnpath'], 'w', encoding='utf-8') as outfiletext:
            for listitem in opjson['filters']:
                outfiletext.write(listitem+'\n')
    except OSError:
        runcontrol['log'].error('error writing ancillary text file ' + runcontrol['opfiltersoneperlnpath'].name)
    except Exception:
        runcontrol['log'].error('unhandled exception writing ancillary text file ' + runcontrol['opfiltersoneperlnpath'].name)
    else:
        runcontrol['log'].info('ancillary text file written to file ' + runcontrol['opfiltersoneperlnpath'].name)
    try:
        with open(runcontrol['opfiltersPSlistpath'], 'w', encoding='utf-8') as outfiletext:
            # we're going to kludge this just so I don't have to deal with getting the actual index
            # this list has only unique entries so we'll string match to find the end
            for listitem in opjson['filters']:
                if listitem != opjson['filters'][-1]:
                    outfiletext.write('"'+listitem+'"'+',')
                else:  # no trailing comma
                    outfiletext.write('"'+listitem+'"')
    except OSError:
        runcontrol['log'].error('error writing ancillary text file ' + runcontrol['opfiltersPSlistpath'].name)
    except Exception:
        runcontrol['log'].error('unhandled exception writing ancillary text file ' + runcontrol['opfiltersPSlistpath'].name)
    else:
        runcontrol['log'].info('ancillary text written to file: ' + runcontrol['opfiltersPSlistpath'].name)
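# Answering the to-do above: a join does work for the script format. A minimal sketch that
# produces the same output as the loop, assuming 'filters' remains a list of strings:
#
#   outfiletext.write(','.join('"' + item + '"' for item in opjson['filters']))
#
# str.join only places separators between items, so no trailing comma handling is needed.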
def RotateFiles(runcontrol:dict)->None:
    '''This rotates and deletes all types of files that we've written in this pass.
    Notes:
        !! hard coding note !! - we are blindly changing the mode of any file to be deleted to RW
        deletes the oldest files based on their file name time/date stamp and not the OS file date
        if ancillary files weren't written this pass then they won't be rotated
    Inputs:
        runtime control baton
    Outputs:
        none
        returns - none
    '''
    # find all the matching primary JSON files, the sort puts the oldest (to be deleted) first
    infilelist = list(runcontrol['workdirpath'].glob(runcontrol['jsonfnamewildcard']))
    infilelist.sort()
    for n in range(0, len(infilelist)-runcontrol['cmdlineargs'].rotatefilescount):
        try:
            # we're hard coding rw stat on the file, the user has implicitly authorized deletion by not setting --rotatefilescount to zero
            infilelist[n].chmod(stat.S_IWRITE)
            pathlib.Path.unlink(infilelist[n])
        except Exception:
            runcontrol['log'].warning('unable to rotate/delete ' + infilelist[n].name)
        else:
            runcontrol['log'].info('rotated/deleted ' + infilelist[n].name)
    # text files are only rotated if text files were created in this run
    if runcontrol['cmdlineargs'].ancillarytextfiles:
        for akey in runcontrol['opfilterstextrotatekeys']:
            # search and delete
            infilelist = list(runcontrol['workdirpath'].glob(runcontrol[akey]))
            infilelist.sort()
            for n in range(0, len(infilelist)-runcontrol['cmdlineargs'].rotatefilescount):
                try:
                    # we're hard coding rw stat on the file, the user has implicitly authorized deletion by not setting --rotatefilescount to zero and specifying --ancillarytextfiles
                    infilelist[n].chmod(stat.S_IWRITE)
                    pathlib.Path.unlink(infilelist[n])
                except Exception:
                    runcontrol['log'].warning('unable to rotate/delete ' + infilelist[n].name)
                else:
                    runcontrol['log'].info('rotated/deleted ' + infilelist[n].name)
# MAIN #
# this try block catches the normal argparse SystemExit raised automatically by -h/--help and parsing errors, and raised manually by -V/--version
# SystemExit is a child of BaseException, not Exception; we catch and re-raise it explicitly so the bare except below can never swallow it
try:
    runtimecontrol = InitializeApp()
    opjsondata = InitializeData(runtimecontrol)
    # skeleton gets written to file, no other processing is required
    if not runtimecontrol['cmdlineargs'].skeletononly:
        if runtimecontrol['cmdlineargs'].groomingonly:
            CombinedDataProcessing(opjsondata, FindLoadPrimaryJSON(runtimecontrol), None, runtimecontrol)
        else:
            CombinedDataProcessing(opjsondata, FindLoadPrimaryJSON(runtimecontrol), FindLoadSecondaryJSON(runtimecontrol), runtimecontrol)
    # write all output files
    if not runtimecontrol['cmdlineargs'].dryrun:
        WriteOpJSON(opjsondata, runtimecontrol)
        if runtimecontrol['cmdlineargs'].ancillarytextfiles:
            WriteAncillaryTextFiles(opjsondata, runtimecontrol)
        if runtimecontrol['cmdlineargs'].rotatefilescount > 0:
            RotateFiles(runtimecontrol)
except SystemExit:
    # only explicitly raised in InitializeApp(), re-raised here so the catch-all below never swallows it; we could just let it fall through the bottom of the program but I think we should be specific
    raise
except:
    # this is just to create a little visual separation at the command line and then re-raise any exceptions
    # future to do: maybe put additional exception handling here
    print()
    raise