-
Notifications
You must be signed in to change notification settings - Fork 3
/
report_data_files.py
182 lines (141 loc) · 8.7 KB
/
report_data_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
'''
Copyright 2023 Flexera Software LLC
See LICENSE.TXT for full license text
SPDX-License-Identifier: MIT
Author : sgeary
Created On : Tue Aug 29 2023
File : report_data_files.py
'''
import logging, unicodedata, re
import common.api.project.get_scanned_files
import common.api.project.get_project_evidence
import SPDX_license_mappings
logger = logging.getLogger(__name__)
#-------------------------------------------------
def manage_file_details(baseURL, authToken, projectID, hasExtractedLicensingInfos, includeUnassociatedFiles):
    """Gather scanned-file details and then enrich them with file-level evidence.

    Runs the two collection steps in order: first the scanned-file listing
    (via ``get_scanned_file_details``), then the copyright/license evidence
    (via ``get_file_evidence``), which is given the details produced by the
    first step.

    Returns:
        A 3-tuple ``(filePathToID, fileDetails, hasExtractedLicensingInfos)``
        built from the values returned by the two steps.
    """
    # Step 1: per-file details plus the path -> id lookup tables
    pathLookup, detailsByFileID = get_scanned_file_details(
        baseURL, authToken, projectID, includeUnassociatedFiles
    )

    # Step 2: attach evidence to those details and collect any LicenseRef entries
    evidenceResult = get_file_evidence(
        baseURL, authToken, projectID, detailsByFileID, hasExtractedLicensingInfos
    )
    detailsByFileID, licensingInfos = evidenceResult

    return pathLookup, detailsByFileID, licensingInfos
#-----------------------------
def get_scanned_file_details(baseURL, authToken, projectID, includeUnassociatedFiles):
    """Collect SPDX file entries for every scanned file of a project.

    Args:
        baseURL: Passed through to the scanned-files API helper.
        authToken: Passed through to the scanned-files API helper.
        projectID: Passed through to the scanned-files API helper.
        includeUnassociatedFiles: Currently unused. The filter that would
            skip files outside of inventory is disabled, so every scanned
            file is collected regardless of this flag.

    Returns:
        A tuple ``(filePathToID, fileDetails)`` where

        * ``filePathToID`` has the two keys ``"inInventory"`` and
          ``"notInInventory"``; each maps a file path to a dict holding
          ``"uniqueFileID"`` and ``"fileSHA1"``.
        * ``fileDetails`` maps each unique file id (``"<fileId>-s"`` or
          ``"<fileId>-r"``) to a dict with ``"SPDXID"``, ``"fileName"``,
          ``"checksums"`` and ``"licenseConcluded"``.

    Raises:
        KeyError: If a scanned-file record lacks one of the expected keys
            (``fileId``, ``filePath``, ``inInventory``, ``remote``,
            ``fileMD5``, ``fileSHA1``).
    """
    # Allow for mapping from inventory file path to details about the file itself
    filePathToID = {"inInventory": {}, "notInInventory": {}}
    fileDetails = {}

    # Collect a list of the scanned files
    print(" + Collect data for all scanned files.")
    logger.info(" Collect data for all scanned.")
    scannedFiles = common.api.project.get_scanned_files.get_scanned_files_details_with_MD5_and_SHA1(baseURL, projectID, authToken)
    print(" - Collected data for %s scanned file(s)." % len(scannedFiles))
    logger.info(" Collected data for %s files.", len(scannedFiles))

    for scannedFile in scannedFiles:
        fileName = scannedFile["filePath"]

        # NOTE(review): the flags are compared as text so that both the
        # strings "true"/"false" and real booleans are understood; the exact
        # type returned by the API is not visible here - confirm.
        isLocalScan = str(scannedFile["remote"]).strip().lower() == "false"
        isInInventory = str(scannedFile["inInventory"]).strip().lower() == "true"

        # NOTE: files not in inventory are intentionally still collected;
        # includeUnassociatedFiles is not consulted here.

        # Create a unique identifier based on fileID and scan location
        uniqueFileID = str(scannedFile["fileId"]) + ("-s" if isLocalScan else "-r")

        # Only report the checksums that were actually provided
        checksums = []
        for algorithm, checksumKey in (("MD5", "fileMD5"), ("SHA1", "fileSHA1")):
            checksumValue = scannedFile[checksumKey]
            if checksumValue is not None:
                checksums.append({"algorithm": algorithm, "checksumValue": checksumValue})

        fileDetails[uniqueFileID] = {
            "SPDXID": "SPDXRef-File-" + uniqueFileID,
            "fileName": fileName,
            "checksums": checksums,
            "licenseConcluded": "NOASSERTION",  # TODO - Requires custom fields at file level
        }

        # Create a mapping between the file path and its ID and SHA1
        filePathDetails = {
            "uniqueFileID": uniqueFileID,
            "fileSHA1": scannedFile["fileSHA1"],
        }
        inventoryBucket = "inInventory" if isInInventory else "notInInventory"
        filePathToID[inventoryBucket][fileName] = filePathDetails

    return filePathToID, fileDetails
#-----------------------------
def get_file_evidence(baseURL, authToken, projectID, fileDetails, hasExtractedLicensingInfos):
    """Attach file-level copyright and license evidence to ``fileDetails``.

    For every evidence record whose unique file id is already present in
    ``fileDetails``, sets ``copyrightText``, ``licenseConcluded`` and
    ``licenseInfoInFiles`` on that file's entry. License names found in
    ``SPDX_license_mappings.LICENSEMAPPINGS`` are replaced by the mapped
    value; any other name becomes a ``LicenseRef-...`` id that is also
    registered in ``hasExtractedLicensingInfos``.

    Args:
        baseURL: Passed through to the project-evidence API helper.
        authToken: Passed through to the project-evidence API helper.
        projectID: Passed through to the project-evidence API helper.
        fileDetails: Dict keyed by unique file id (``"<id>-s"``/``"<id>-r"``);
            updated in place.
        hasExtractedLicensingInfos: Dict keyed by LicenseRef id; updated in
            place when a non-mapped license name is encountered.

    Returns:
        The tuple ``(fileDetails, hasExtractedLicensingInfos)`` (the same
        objects that were passed in).
    """
    # Anything outside this set is replaced by a dash when building a LicenseRef id.
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    invalidLicenseRefChars = re.compile(r'[^a-zA-Z0-9 \n\.]')
    licenseMappings = SPDX_license_mappings.LICENSEMAPPINGS
    licenseReferenceComment = "SCA Revenera - Observed license details within file"

    # Collect the copyright/license data per file
    print(" + Collect file level evidence.")
    logger.info(" + Collect file level evidence")
    projectEvidenceDetails = common.api.project.get_project_evidence.get_project_evidence(baseURL, projectID, authToken)
    print(" - File level evidence has been collected.")
    logger.info(" - File level evidence has been collected.")

    for fileEvidenceDetails in projectEvidenceDetails["data"]:
        # NOTE(review): bool("false") is True, so a textual "false" must be
        # checked explicitly; the exact type returned by the API is not
        # visible here - confirm.
        remoteFlag = fileEvidenceDetails["remote"]
        remoteFile = bool(remoteFlag) and str(remoteFlag).strip().lower() != "false"
        uniqueFileID = str(fileEvidenceDetails["scannedFileId"]) + ("-r" if remoteFile else "-s")

        # If the fileID is not in the dict move on (includeUnassociatedFiles?)
        if uniqueFileID not in fileDetails:
            continue

        ##########################################
        # Manage File Level Copyrights
        # Normalize the copyrights in case there are any encoding issues;
        # a missing (None) list is treated the same as an empty one.
        copyrights = [
            unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
            for text in (fileEvidenceDetails["copyRightMatches"] or [])
        ]
        if copyrights:
            logger.info(" Copyright evidence discovered")
            # The response has the copyright details as a list so convert to a string
            copyrightText = " | ".join(copyrights)
        else:
            logger.info(" No copyright evidence discovered")
            copyrightText = "NONE"

        ##########################################
        # Manage File Level Licenses
        # Remove duplicates and sort so the output order is deterministic
        observedLicenses = sorted(set(fileEvidenceDetails["licenseMatches"] or []))

        licenseInfoInFiles = []
        for licenseEvidence in observedLicenses:
            # Public Domain is dropped from the reported licenses
            if licenseEvidence == "Public Domain":
                continue

            if licenseEvidence in licenseMappings:
                # The license evidence is not in SPDX form so map it
                licenseID = licenseMappings[licenseEvidence]
                logger.info(" \"%s\" maps to SPDX ID: \"%s\"", licenseEvidence, licenseID)
            else:
                # There was not a valid SPDX license name returned
                logger.warning(" \"%s\" is not a valid SPDX identifier for file level license. - Using LicenseRef.", licenseEvidence)
                licenseName = licenseEvidence.split("(", 1)[0].rstrip()  # If there is a ( in string remove everything after and space
                licenseName = invalidLicenseRefChars.sub('-', licenseName)  # Replace special chars with dash
                licenseName = licenseName.replace(" ", "-")  # Replace space with dash
                licenseID = "LicenseRef-%s" % licenseName

                # Since this is a non SPDX ID it must be listed in hasExtractedLicensingInfos
                if licenseID not in hasExtractedLicensingInfos:
                    # It's not there so create a new entry
                    hasExtractedLicensingInfos[licenseID] = {
                        "licenseId": licenseID,
                        "name": licenseName,
                        "extractedText": licenseName,
                        "comment": [licenseReferenceComment],
                    }
                else:
                    # It's already there; only add the comment if it is new
                    existingComments = hasExtractedLicensingInfos[licenseID]["comment"]
                    if licenseReferenceComment not in existingComments:
                        existingComments.append(licenseReferenceComment)

            # Different evidence names can resolve to the same id; report each id once
            if licenseID not in licenseInfoInFiles:
                licenseInfoInFiles.append(licenseID)

        # Covers both "no evidence" and "only Public Domain was found"
        if not licenseInfoInFiles:
            licenseInfoInFiles = ["NONE"]

        # Add the evidence details to the appropriate area for this file
        fileEntry = fileDetails[uniqueFileID]
        fileEntry["copyrightText"] = copyrightText
        fileEntry["licenseConcluded"] = "NOASSERTION"
        fileEntry["licenseInfoInFiles"] = licenseInfoInFiles

    return fileDetails, hasExtractedLicensingInfos