Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Public version with TF2 support #7

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ Framework composed of a collection of python script to run, profile and collect
The framework can be used with different machines and different applications.
The target architectures already supported by this version are:
- inhouse server
- Microsoft Azure VMs
- local machine

The applications already supported by this version are:
- CNN training with pytorch
- CNN training with tensorflow
- CNN and RNN training with tensorflow

The framework can be configured via .ini configuration file.
An example of configuration file is available in configurations/default.ini.
Expand Down
Binary file added __pycache__/utility.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/utility.cpython-37.pyc
Binary file not shown.
Binary file added __pycache__/utility.cpython-39.pyc
Binary file not shown.
Binary file added apps/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file added apps/__pycache__/tf.cpython-36.pyc
Binary file not shown.
Binary file added apps/__pycache__/tf.cpython-37.pyc
Binary file not shown.
Binary file added apps/__pycache__/tf.cpython-39.pyc
Binary file not shown.
75 changes: 75 additions & 0 deletions apps/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""
Copyright 2019 Marco Lattuada

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
import os
import sys

import xmltodict

def load_xml_configuration(parameters, application, root_tag):
    """
    Load the XML configuration file selected by a comma-separated list of parameters

    The list is scanned for an entry of the form configuration=<name>; when no such entry is found, "default" is used.
    The file apps/<application>/confs/<name>.xml, located under the directory returned by utility.get_project_root(), is parsed and the content of its root tag is returned.
    The other parameters are only checked for being well formed: they are not merged into the returned dictionary.

    Parameters
    ----------
    parameters: str
        A comma-separated list of parameters in the form parameter=value

    application: str
        The name of the application; it is used as the name of the subdirectory of apps containing the confs directory

    root_tag: str
        The name of the root tag of the xml file.

    Return
    ------
    dict
        The content of the root tag of the XML file as produced by xmltodict (input_class elements are always lists)

    Exit
    ----
    The process is terminated with exit code 1 if a scanned parameter is not in the form parameter=value, if the confs directory or the XML file does not exist, or if the XML file does not contain root_tag.
    """
    configuration_base = "default"
    #First look for configuration
    for parameter in parameters.split(","):
        #Split only once and reuse the tokens
        tokens = parameter.split("=")
        if len(tokens) != 2:
            logging.error("parameters must be a , seperated list of <parameter>=<value>: %s", parameter)
            sys.exit(1)
        if tokens[0] == "configuration":
            configuration_base = tokens[1]
            #The scan stops here: parameters following configuration are not checked by this function
            break

    #utility is imported at call time; it has to be reachable through sys.path
    utility = __import__("utility")
    root_project = utility.get_project_root()

    #The absolute path of the configuration directory
    confs_dir = os.path.join(root_project, "apps", application, "confs")
    logging.info("conf directory is %s", confs_dir)

    #Check the confs_dir exists
    if not os.path.exists(confs_dir):
        logging.error("Conf directory %s does not exist", confs_dir)
        sys.exit(1)

    #Check if xml file of the conf exist
    xml_file_name = os.path.join(confs_dir, configuration_base + ".xml")
    if not os.path.exists(xml_file_name):
        logging.error("XML file %s not found", xml_file_name)
        sys.exit(1)

    #Load XML file
    with open(xml_file_name) as xml_file:
        doc = xmltodict.parse(xml_file.read(), force_list={'input_class'})

    #Report a wrong root tag in the same way of the other configuration errors instead of raising KeyError
    if root_tag not in doc:
        logging.error("Root tag %s not found in XML file %s", root_tag, xml_file_name)
        sys.exit(1)
    return doc[root_tag]
30 changes: 30 additions & 0 deletions apps/hostname.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,39 @@
import socket

def compute_configuration_name(cl_parameters):
    """
    Return the name of the configuration of a hostname experiment

    hostname does not take any parameter, so all the experiments share the single configuration "no_parameters".

    Parameters
    ----------
    cl_parameters: str
        Not used; accepted only for generality with respect to the other applications.

    Return
    ------
    str
        Always "no_parameters"
    """
    configuration_name = "no_parameters"
    return configuration_name

def collect_data(repetition_path, gpu_type, gpu_number, debug):
"""
Add to csv (and creates it if it does not exist) data about the experiment whose output was saved in repetition_path

Parameters
----------
repetition_path: str
The path containing the output of the currently analyzed experiment

gpu_type: str
The type of the GPU

gpu_number: str
The number of the GPUs of the VM

debug: boolean
True if debug messages have to be printed
"""
csv_file_name = "hostname.csv"
if os.path.exists(csv_file_name):
csv_file = open(csv_file_name, "a")
Expand Down
131 changes: 72 additions & 59 deletions apps/pytorch.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/python3
#!/usr/bin/env python3
"""
Copyright 2018 Marco Lattuada

Expand Down Expand Up @@ -29,33 +29,49 @@

import dicttoxml

def compute_parameters(cl_parameters):
configuration_base = "default"
#First look for configuration
for cl_parameter in cl_parameters.split(","):
if len(cl_parameter.split("=")) != 2:
logging.error("parameters must be a , seperated list of <parameter>=<value>: %s", cl_parameter)
sys.exit(1)
if cl_parameter.split("=")[0] == "configuration":
configuration_base = cl_parameter.split("=")[1]
break

#Load configuration
parameters = load_xml_configuration(configuration_base + ".xml")["pytorch_configuration"]
import app

def compute_parameters(cl_parameters):
"""
Combine the parameters of the single experiment with default values

Parameters
----------
cl_parameter: str
A comma-separated list of parameter=value characterizing the experiment

Return
------
dict of str: str
The dictionary containing the values of all the parameters
"""
parameters = app.load_xml_configuration(cl_parameters, "pytorch", "pytorch_configuration")
#Overwrite parameters
for cl_parameter in cl_parameters.split(","):
tokens = cl_parameter.split("=")
if len(tokens) != 2:
logging.error("parameters must be a , seperated list of <parameter>=<value>")
sys.exit(1)
if not tokens[0] in parameters and tokens[0] != "configuration" and tokens[0] != "gpus_number" and tokens[0] != "n" and tokens[0] != "gpus_instances":
logging.error("parameter %s is not present in the source configuration", tokens[0])
sys.exit(1)
#if not tokens[0] in parameters and tokens[0] != "configuration" and tokens[0] != "gpus_number" and tokens[0] != "n" and tokens[0] != "gpus_instances":
# logging.error("parameter %s is not present in the source configuration", tokens[0])
# sys.exit(1)
parameters[tokens[0]] = tokens[1]
return parameters

def compute_configuration_name(cl_parameters):
"""
Compute the configuration name on the basis of the values of the experiment parameters

Parameters
---------
cl_parameters: str
A comma separated list of parameter=value

Return
------
str
The configuration name
"""
parameters = compute_parameters(cl_parameters)
if "gpus_number" in parameters:
gpus_number = "_gpus_number_" + parameters["gpus_number"]
Expand All @@ -75,36 +91,24 @@ def compute_configuration_name(cl_parameters):
configuration_name = network_type + "_cl_" + parameters["num_classes"] + "_im_" + parameters["images_per_class"] + "_ep_" + parameters["epochs_number"] + "_bs_" + parameters["batch_size"] + "_mo_" + parameters["momentum"] + "_j_" + parameters["j"] + gpus_number + only_load
return configuration_name

def load_xml_configuration(xml_configuration_file):
#The absolute path of the current file
abs_script = os.path.realpath(__file__)

#The root directory of the script
abs_root = os.path.dirname(abs_script)

#The absolute path of the configuration directory
confs_dir = os.path.join(abs_root, "pytorch", "confs")
logging.info("conf directory is %s", confs_dir)

#Check the confs_dir exists
if not os.path.exists(confs_dir):
logging.error("Conf directory %s does not exist", confs_dir)
sys.exit(1)

#Check if xml file of the conf exist
xml_file_name = os.path.join(confs_dir, xml_configuration_file)
if not os.path.exists(xml_file_name):
logging.error("XML file %s not found", xml_file_name)
sys.exit(1)
def collect_data(repetition_path, gpu_type, gpu_number, debug):
"""
Add to csv (and creates it if it does not exist) data about the experiment whose output was saved in repetition_path

Parameters
----------
repetition_path: str
The path containing the output of the currently analyzed experiment

#Load XML file
with open(xml_file_name) as xml_file:
doc = xmltodict.parse(xml_file.read(), force_list={'input_class'})
return doc
gpu_type: str
The type of the GPU

gpu_number: str
The number of the GPUs of the VM

def collect_data(repetition_path, gpu_type, gpu_number, debug):
debug: boolean
True if debug messages have to be printed
"""
try:
#The iterations fractions
iteration_fractions = [0.25, 0.50, 0.75]
Expand Down Expand Up @@ -319,10 +323,10 @@ def collect_data(repetition_path, gpu_type, gpu_number, debug):
sys.exit(1)

#Preparing csv file with cpu and gpu utilization
profile_cpu_output_filename = os.path.join(repetition_path, "profile_cpu_output")
profile_gpu_output_filename = os.path.join(repetition_path, "profile_gpu_output")
profile_file_name_cpu = os.path.join("pytorch_csvs", "profile_cpu_" + gpu_type.replace(" ", "-") + "_" + str(gpu_number) + "_" + configuration_path + "_" + experiment_path + "_" + str(starting_timestamp) + ".csv")
profile_file_name_sum_cpu = os.path.join("pytorch_csvs", "profile_sum_cpu_" + gpu_type.replace(" ", "-") + "_" + str(gpu_number) + "_" + configuration_path + "_" + experiment_path + "_" + str(starting_timestamp) + ".csv")
profile_cpu_output_filename = os.path.join(repetition_path, "profile_CPU_output")
profile_gpu_output_filename = os.path.join(repetition_path, "profile_GPU_output")
profile_file_name_cpu = os.path.join("pytorch_csvs", "profile_CPU_" + gpu_type.replace(" ", "-") + "_" + str(gpu_number) + "_" + configuration_path + "_" + experiment_path + "_" + str(starting_timestamp) + ".csv")
profile_file_name_sum_cpu = os.path.join("pytorch_csvs", "profile_sum_CPU_" + gpu_type.replace(" ", "-") + "_" + str(gpu_number) + "_" + configuration_path + "_" + experiment_path + "_" + str(starting_timestamp) + ".csv")
if os.path.exists(profile_cpu_output_filename) and (not os.path.exists(profile_file_name_cpu) or not os.path.exists(profile_file_name_sum_cpu)):
#The collected data
cpu_data = {}
Expand All @@ -333,10 +337,10 @@ def collect_data(repetition_path, gpu_type, gpu_number, debug):
#Analyzing profile_cpu_output
for line in open(profile_cpu_output_filename, "r"):
#New entry
if line.find("%cpu %MEM ARGS") != -1:
if line.find("%CPU %MEM ARGS") != -1:
previous_timestamp = current_timestamp
#Old pattern
if line.startswith("%cpu %MEM ARGS"):
if line.startswith("%CPU %MEM ARGS"):
split = line.split()
if len(split) == 5:
read_timestamp = split[3] + " " + split[4][0:7]
Expand All @@ -350,7 +354,7 @@ def collect_data(repetition_path, gpu_type, gpu_number, debug):
current_timestamp = str(int(current_timestamp_datetime.timestamp()))
#New pattern
else:
split = line.replace("\\n%cpu %MEM ARGS", "").split()
split = line.replace("\\n%CPU %MEM ARGS", "").split()
current_timestamp_readable = split[4] + " " + split[5]
current_timestamp = split[1]
logging.debug("Found timestamp %s (%s(", current_timestamp, current_timestamp_readable)
Expand Down Expand Up @@ -401,8 +405,8 @@ def collect_data(repetition_path, gpu_type, gpu_number, debug):
profile_sum_file.write(str(cpu_sum_data["cpu" + str(cpu_number)]))
profile_sum_file.write("\n")
profile_sum_file.close()
profile_file_name_gpu = os.path.join("pytorch_csvs", "profile_gpu_" + gpu_type.replace(" ", "-") + "_" + str(gpu_number) + "_" + configuration_path + "_" + experiment_path + "_" + str(starting_timestamp) + ".csv")
profile_file_name_sum_gpu = os.path.join("pytorch_csvs", "profile_sum_gpu_" + gpu_type.replace(" ", "-") + "_" + str(gpu_number) + "_" + configuration_path + "_" + experiment_path + "_" + str(starting_timestamp) + ".csv")
profile_file_name_gpu = os.path.join("pytorch_csvs", "profile_GPU_" + gpu_type.replace(" ", "-") + "_" + str(gpu_number) + "_" + configuration_path + "_" + experiment_path + "_" + str(starting_timestamp) + ".csv")
profile_file_name_sum_gpu = os.path.join("pytorch_csvs", "profile_sum_GPU_" + gpu_type.replace(" ", "-") + "_" + str(gpu_number) + "_" + configuration_path + "_" + experiment_path + "_" + str(starting_timestamp) + ".csv")

if os.path.exists(profile_gpu_output_filename) and (not os.path.exists(profile_file_name_gpu) or not os.path.exists(profile_file_name_sum_gpu)):
#The collected data
Expand Down Expand Up @@ -544,10 +548,10 @@ def collect_data(repetition_path, gpu_type, gpu_number, debug):
only_load = "0"

#Retrieving machine information
#Add host_scripts to the directories for python packages search
#Add host_scripts to the directories for python modules search
host_scripts_path = os.path.join(abs_root, "..", "host_scripts")
sys.path.append(host_scripts_path)
collect_data_package = __import__("collect_data")
collect_data_module = __import__("collect_data")

mac_address = ""
system_uuid = ""
Expand All @@ -562,7 +566,7 @@ def collect_data(repetition_path, gpu_type, gpu_number, debug):
if xml_configuration.get("system_UUID"):
system_uuid = xml_configuration.get("system_UUID")

machine_information = collect_data_package.get_machine_information(mac_address, machine_name, system_uuid)
machine_information = collect_data_module.get_machine_information(mac_address, machine_name, system_uuid)
mac_address = machine_information["mac_address"]
system_uuid = machine_information["system_uuid"]
machine_name = machine_information["machine_name"]
Expand Down Expand Up @@ -643,12 +647,24 @@ def collect_data(repetition_path, gpu_type, gpu_number, debug):
raise

def main():
"""
The wrapper script for training a CNN on ImageNet dataset with PyTorch

The parameters are:
-d, --debug: enables the printing of the debug messages
-p, --parameters: a comma-separated list of parameters to be passed to the wrapped application
--no-clean: if True, removal of generated files (e.g., dumping of weights) is disabled
"""
#The absolute path of the current file
abs_script = os.path.realpath(__file__)

#The root directory of the script
abs_root = os.path.dirname(abs_script)

sys.path.append(os.path.join(abs_root, ".."))
utility = __import__("utility")


#The return value of the command
return_value = 0

Expand Down Expand Up @@ -841,10 +857,7 @@ def main():
logging.warning("/etc/machine-id does not exists")
else:
uuid_line = open("/etc/machine-id", "r").readline()
if len(uuid_line.split()) != 2:
logging.error("Error in loading uuid: %s", str(uuid_line.split()))
sys.exit(1)
uuid = uuid_line.split()[1]
uuid = uuid_line

root["system_UUID"] = uuid

Expand Down
1 change: 1 addition & 0 deletions apps/pytorch/confs/remote_ex
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
configuration=test_small
Loading