Skip to content

Commit

Permalink
Add profiling docs with perf, gperftools, VTUne
Browse files Browse the repository at this point in the history
  • Loading branch information
robertodr committed Mar 27, 2018
1 parent 7c7bfbb commit ccbb7f7
Show file tree
Hide file tree
Showing 12 changed files with 265 additions and 76 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ include(autocmake_ccache)
include(windows)
include(autocmake_definitions)
include(code_coverage)
include(gperftools)
include(autocmake_int64)
include(autocmake_omp)
include(autocmake_safeguards)
Expand Down
1 change: 1 addition & 0 deletions cmake/autocmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ modules:
- 'custom/windows.cmake'
- '%(url_root)modules/definitions.cmake'
- 'custom/code_coverage.cmake'
- 'custom/gperftools.cmake'
- '%(url_root)modules/int64.cmake'
- '%(url_root)modules/omp.cmake'
- '%(url_root)modules/safeguards.cmake'
Expand Down
58 changes: 58 additions & 0 deletions cmake/custom/FindLibunwind.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# This file is part of MADNESS
# https://github.com/m-a-d-n-e-s-s/madness/blob/master/cmake/modules/FindLibunwind.cmake
#
# - Try to find Libunwind
# Input variables:
# LIBUNWIND_ROOT_DIR - The libunwind install directory
# LIBUNWIND_INCLUDE_DIR - The libunwind include directory
# LIBUNWIND_LIBRARY - The libunwind library directory
# Output variables:
# LIBUNWIND_FOUND - System has libunwind
# LIBUNWIND_INCLUDE_DIRS - The libunwind include directories
# LIBUNWIND_LIBRARIES - The libraries needed to use libunwind
# LIBUNWIND_VERSION - The version string for libunwind

include(FindPackageHandleStandardArgs)

if(NOT DEFINED LIBUNWIND_FOUND)

  # Set default search paths for libunwind.
  # When LIBUNWIND_ROOT_DIR is given, the include and library hint directories
  # are derived from it and stored as cache entries.
  if(LIBUNWIND_ROOT_DIR)
    set(LIBUNWIND_INCLUDE_DIR ${LIBUNWIND_ROOT_DIR}/include CACHE PATH "The include directory for libunwind")
    if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
      # 64-bit Linux: hint lib64 first, then lib.
      set(LIBUNWIND_LIBRARY ${LIBUNWIND_ROOT_DIR}/lib64;${LIBUNWIND_ROOT_DIR}/lib CACHE PATH "The library directory for libunwind")
    else()
      set(LIBUNWIND_LIBRARY ${LIBUNWIND_ROOT_DIR}/lib CACHE PATH "The library directory for libunwind")
    endif()
  endif()

  # Locate the header directory and the library, using the hints above.
  find_path(LIBUNWIND_INCLUDE_DIRS NAMES libunwind.h
      HINTS ${LIBUNWIND_INCLUDE_DIR})

  find_library(LIBUNWIND_LIBRARIES unwind
      HINTS ${LIBUNWIND_LIBRARY})

  # Get libunwind version.
  # string(REGEX MATCH) is used instead of string(REGEX REPLACE): REPLACE
  # returns its whole input when the pattern does not match, which would put
  # the full header text into the version variables (for instance when
  # UNW_VERSION_EXTRA is defined without a numeric value).
  if(EXISTS "${LIBUNWIND_INCLUDE_DIRS}/libunwind-common.h")
    file(READ "${LIBUNWIND_INCLUDE_DIRS}/libunwind-common.h" _libunwind_version_header)

    foreach(_libunwind_part MAJOR MINOR EXTRA)
      string(REGEX MATCH "define[ \t]+UNW_VERSION_${_libunwind_part}[ \t]+([0-9]+)"
          _libunwind_match "${_libunwind_version_header}")
      if(NOT "${_libunwind_match}" STREQUAL "")
        set(_libunwind_version_${_libunwind_part} "${CMAKE_MATCH_1}")
      else()
        set(_libunwind_version_${_libunwind_part} "")
      endif()
    endforeach()

    set(LIBUNWIND_MAJOR_VERSION "${_libunwind_version_MAJOR}")
    set(LIBUNWIND_MINOR_VERSION "${_libunwind_version_MINOR}")
    set(LIBUNWIND_MICRO_VERSION "${_libunwind_version_EXTRA}")

    # Only report a version when at least major and minor were found; the
    # extra/micro component is appended only if it is numeric.
    if(NOT "${LIBUNWIND_MAJOR_VERSION}" STREQUAL "" AND NOT "${LIBUNWIND_MINOR_VERSION}" STREQUAL "")
      set(LIBUNWIND_VERSION "${LIBUNWIND_MAJOR_VERSION}.${LIBUNWIND_MINOR_VERSION}")
      if(NOT "${LIBUNWIND_MICRO_VERSION}" STREQUAL "")
        set(LIBUNWIND_VERSION "${LIBUNWIND_VERSION}.${LIBUNWIND_MICRO_VERSION}")
      endif()
    endif()

    unset(_libunwind_match)
    unset(_libunwind_part)
    unset(_libunwind_version_MAJOR)
    unset(_libunwind_version_MINOR)
    unset(_libunwind_version_EXTRA)
    unset(_libunwind_version_header)
  endif()

  # handle the QUIETLY and REQUIRED arguments and set LIBUNWIND_FOUND to TRUE
  # if all listed variables are TRUE
  find_package_handle_standard_args(Libunwind
      FOUND_VAR LIBUNWIND_FOUND
      VERSION_VAR LIBUNWIND_VERSION
      REQUIRED_VARS LIBUNWIND_LIBRARIES LIBUNWIND_INCLUDE_DIRS)

  mark_as_advanced(LIBUNWIND_INCLUDE_DIR LIBUNWIND_LIBRARY
      LIBUNWIND_INCLUDE_DIRS LIBUNWIND_LIBRARIES)

endif()
30 changes: 16 additions & 14 deletions cmake/custom/gperftools.cmake
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
if(ENABLE_GPERFTOOLS OR ENABLE_TCMALLOC_MINIMAL)
#.rst:
#
# Enable profiling with gperftools.
#
# Variables used::
#
# ENABLE_GPERFTOOLS
#
# autocmake.yml configuration::
#
# docopt: "--gperf Enable profiling with gperftools [default: False]."
# define: "'-DENABLE_GPERFTOOLS={0}'.format(arguments['--gperf'])"

if(ENABLE_GPERFTOOLS)
find_package(Gperftools COMPONENTS tcmalloc OPTIONAL_COMPONENTS profiler)
else()
find_package(Gperftools REQUIRED COMPONENTS tcmalloc_minimal)
endif()

# Set the config.h variables
if(GPERFTOOLS_FOUND AND ENABLE_TCMALLOC_MINIMAL)
set(MADNESS_HAS_GOOGLE_PERF_MINIMAL 1)
endif()
if(LIBUNWIND_FOUND)
set(MADNESS_HAS_LIBUNWIND 1)
endif()
option_with_print(ENABLE_GPERFTOOLS "Enable profiling with gperftools" OFF)

# Locate the gperftools libraries only when profiling was requested.
# REQUIRED stops the configuration with an error if they cannot be found,
# rather than continuing with a build that has no profiling support.
if(ENABLE_GPERFTOOLS)
  message(STATUS "Linking against gperftools libraries for profiling")
  find_package(Gperftools REQUIRED COMPONENTS tcmalloc profiler)
endif()
2 changes: 1 addition & 1 deletion cmake/downloaded/autocmake_omp.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ if(ENABLE_OPENMP)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -mp")
endif()
if(CMAKE_Fortran_COMPILER_ID MATCHES XL)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qsmp")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qsmp=omp")
endif()
if(CMAKE_Fortran_COMPILER_ID MATCHES Cray)
# do nothing in this case
Expand Down
5 changes: 2 additions & 3 deletions default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ let
owner = "NixOS";
repo = "nixpkgs-channels";
rev = "nixos-unstable";
sha256 = "1p0xxyz30bd2bg0wrfviqgsskz00w647h0l2vi33w90i42k8r3li";
sha256 = "0im8l87nghsp4z7swidgg2qpx9mxidnc0zs7a86qvw8jh7b6sbv2";
});
in
with import nixpkgs {
Expand Down Expand Up @@ -34,9 +34,7 @@ in
doxygen
exa
ffmpeg
flameGraph
gfortran
gperftools
graphviz
lcov
pipenv
Expand All @@ -54,5 +52,6 @@ in
shellHook = ''
export NINJA_STATUS="[Built edge %f of %t in %e sec]"
SOURCE_DATE_EPOCH=$(date +%s)
source $(pipenv --venv)/bin/activate
'';
}
173 changes: 173 additions & 0 deletions doc/programmers/profiling.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
Profiling
---------

You should obtain profiling information before attempting any optimization of
the code. There are many ways of obtaining this information, but we have only
experimented with the following:

#. Using Linux ``perf`` and related `tools <http://www.brendangregg.com/perf.html>`_.
#. Using ``gperftools``.
#. Using Intel VTune.

Profiling should be done using the standalone executable ``run_pcm`` and any of
the input files gathered under the ``tests/benchmark`` directory. These files
are copied to the build directory. If you are lazy, you can run the profiling
from the build directory:

.. code-block:: bash

   >>> cd tests/benchmark
   >>> env PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH \
           python <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin

Using ``perf``
==============

``perf`` is a tool available on Linux. Though part of the kernel tools, it is
not usually preinstalled on most Linux distributions. For visualization
purposes we also need `additional tools <https://github.com/brendangregg/perf-tools>`_,
in particular the `flame graph generation scripts <https://github.com/brendangregg/FlameGraph>`_.
Your distribution probably has them prepackaged already.
``perf`` will trace all CPU events on your system, hence you might need to
fiddle with some kernel set up files to get permissions to trace events.

.. note::
   ``perf`` **is NOT** available on ``stallo``. Even if it were, you would
   probably not have permissions to record kernel traces.

These are the instructions I used:

1. Trace execution. This will save CPU stack traces to a ``perf.data`` file.
   Successive runs do not overwrite this file.

   .. code-block:: bash

      >>> cd tests/benchmark
      >>> perf record -F 99 -g -- env PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH python \
              <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin

2. Get reports. There are different ways of getting a report from the
   ``perf.data`` file. The following will generate a call tree.

   .. code-block:: bash

      >>> perf report --stdio

3. Generate an interactive flame graph.

   .. code-block:: bash

      >>> perf script | stackcollapse-perf.pl > out.perf-folded
      >>> cat out.perf-folded | flamegraph.pl > perf-run_pcm.svg

Using ``gperftools``
====================

This set of tools was previously known as Google Performance Tools. The
executable needs to be linked against the ``profiler``, ``tcmalloc``
and ``unwind`` libraries.
CMake will attempt to find them. If this fails, you will have to install them:
either check whether they are available for your distribution or compile them
from source.
In principle, one could use the ``LD_PRELOAD`` mechanism to skip the *ad hoc*
compilation of the executable.

.. note::
   ``gperftools`` **is** available on ``stallo``, but it's an ancient version.

1. Configure the code with the ``--gperf`` option enabled. CPU and heap
   profiling, together with heap checking, will be available.

2. CPU profiling can be done with the following command:

   .. code-block:: bash

      >>> env CPUPROFILE=run_pcm.cpu.prof PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH \
              python <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin

   This will save the data to the ``run_pcm.cpu.prof`` file. To analyze the gathered
   data we can use the ``pprof`` script:

   .. code-block:: bash

      >>> pprof --text <build_dir>/bin/run_pcm run_pcm.cpu.prof

   This will print a table. Each row will look like the following:

   .. code-block:: bash

      2228 7.2% 24.8% 28872 93.4% pcm::utils::splineInterpolation

   where the columns respectively report:

   #. Number of profiling samples in this function.
   #. Percentage of profiling samples in this function.
   #. Percentage of profiling samples in the functions printed so far.
   #. Number of profiling samples in this function and its callees.
   #. Percentage of profiling samples in this function and its callees.
   #. Function name.

   For more details look `here <https://gperftools.github.io/gperftools/cpuprofile.html>`_.

3. Heap profiling can be done with the following command:

   .. code-block:: bash

      >>> env HEAPPROFILE=run_pcm.hprof PYTHONPATH=<build_dir>/lib64/python:$PYTHONPATH \
              python <build_dir>/bin/go_pcm.py --inp=standalone.pcm --exe=<build_dir>/bin

   This will output a series of data files ``run_pcm.hprof.0000.heap``,
   ``run_pcm.hprof.0001.heap`` and so forth. You will have to kill execution
   when enough samples have been collected.
   Analysis of the heap profiling data can be done using ``pprof``. `Read more
   here <https://gperftools.github.io/gperftools/heapprofile.html>`_.


Using Intel VTune
=================

This is probably the easiest way to profile the code.
`VTune <https://software.intel.com/en-us/intel-vtune-amplifier-xe>`_ is Intel software; it might be possible to get a personal, free license.
The instructions will hold on any machine where VTune is installed, and you can
find more details in the `online documentation <https://software.intel.com/en-us/vtune-amplifier-help>`_.
You can, in principle, use the GUI. I haven't managed to do that though.

On ``stallo``, start an interactive job and load the following modules:

.. code-block:: bash

   >>> module load intel/2018a
   >>> module load CMake
   >>> module load VTune
   >>> export BOOST_INCLUDEDIR=/home/roberto/Software/boost/include
   >>> export BOOST_LIBRARYDIR=/home/roberto/Software/boost/lib

You will need to compile with optimizations activated, *i.e.* release mode.
It is better to first parse the input file and then call ``run_pcm``:

.. code-block:: bash

   >>> cd <build_dir>/tests/benchmark
   >>> env PYTHONPATH=../../lib64/python:$PYTHONPATH \
           python ../../bin/go_pcm.py --inp=standalone_bubble.pcm

To start collecting hotspots:

.. code-block:: bash

   >>> amplxe-cl -collect hotspots ../../bin/run_pcm @standalone_bubble.pcm

VTune will generate a folder ``r000hs`` with the collected results. A report
for the hotspots can be generated with:

.. code-block:: bash

   >>> amplxe-cl -report hotspots -r r000hs > report

1 change: 1 addition & 0 deletions doc/programmers/programmers-manual.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ PCMSolver Programmers' Manual
documentation
cmake-usage
maintenance
profiling
testing
timer-class
17 changes: 8 additions & 9 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from autocmake import configure
from autocmake.external import docopt


options = """
Usage:
./setup.py [options] [<builddir>]
Expand All @@ -27,6 +26,7 @@
--ccache=<USE_CCACHE> Toggle use of ccache <ON/OFF> [default: ON].
--add-definitions=<STRING> Add preprocesor definitions [default: ''].
--coverage Enable code coverage [default: OFF].
--gperf Enable profiling with gperftools [default: False].
--int64 Enable 64bit integers [default: False].
--omp Enable OpenMP parallelization [default: False].
--python=<PYTHON_INTERPRETER> The Python interpreter (development version) to use. [default: ''].
Expand All @@ -53,12 +53,16 @@ def gen_cmake_command(options, arguments):
"""
command = []
command.append(arguments['--cmake-executable'])
command.append('-DCMAKE_Fortran_COMPILER={0} -DEXTRA_FCFLAGS="{1}"'.format(arguments['--fc'], arguments['--extra-fc-flags']))
command.append('-DCMAKE_C_COMPILER={0} -DEXTRA_CFLAGS="{1}"'.format(arguments['--cc'], arguments['--extra-cc-flags']))
command.append('-DCMAKE_CXX_COMPILER={0} -DEXTRA_CXXFLAGS="{1}"'.format(arguments['--cxx'], arguments['--extra-cxx-flags']))
command.append('-DCMAKE_Fortran_COMPILER={0} -DEXTRA_FCFLAGS="{1}"'.format(arguments['--fc'],
arguments['--extra-fc-flags']))
command.append('-DCMAKE_C_COMPILER={0} -DEXTRA_CFLAGS="{1}"'.format(arguments['--cc'],
arguments['--extra-cc-flags']))
command.append('-DCMAKE_CXX_COMPILER={0} -DEXTRA_CXXFLAGS="{1}"'.format(arguments['--cxx'],
arguments['--extra-cxx-flags']))
command.append('-DUSE_CCACHE={0}'.format(arguments['--ccache']))
command.append('-DPREPROCESSOR_DEFINITIONS="{0}"'.format(arguments['--add-definitions']))
command.append('-DENABLE_CODE_COVERAGE={0}'.format(arguments['--coverage']))
command.append('-DENABLE_GPERFTOOLS={0}'.format(arguments['--gperf']))
command.append('-DENABLE_64BIT_INTEGERS={0}'.format(arguments['--int64']))
command.append('-DENABLE_OPENMP={0}'.format(arguments['--omp']))
command.append('-DPYTHON_INTERPRETER="{0}"'.format(arguments['--python']))
Expand Down Expand Up @@ -88,22 +92,17 @@ def gen_cmake_command(options, arguments):
sys.stderr.write(options)
sys.exit(-1)


# use extensions to validate/post-process args
if configure.module_exists('extensions'):
import extensions
arguments = extensions.postprocess_args(sys.argv, arguments)


root_directory = os.path.dirname(os.path.realpath(__file__))


build_path = arguments['<builddir>']


# create cmake command
cmake_command = '{0} -H{1}'.format(gen_cmake_command(options, arguments), root_directory)


# run cmake
configure.configure(root_directory, build_path, cmake_command, arguments['--show'])
3 changes: 3 additions & 0 deletions src/bin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ if(STATIC_LIBRARY_ONLY)
else()
target_link_libraries(run_pcm pcm-shared)
endif()
# Link run_pcm against the gperftools libraries when profiling was requested
# through ENABLE_GPERFTOOLS. GPERFTOOLS_LIBRARIES is presumably populated by
# the find_package(Gperftools ...) call in cmake/custom/gperftools.cmake --
# TODO confirm. The keyword-less target_link_libraries signature matches the
# other target_link_libraries call for run_pcm just above.
if(ENABLE_GPERFTOOLS)
target_link_libraries(run_pcm ${GPERFTOOLS_LIBRARIES})
endif()
target_compile_options(run_pcm
PRIVATE
"$<$<CONFIG:DEBUG>:${EXDIAG_CXX_FLAGS}>"
Expand Down
Loading

0 comments on commit ccbb7f7

Please sign in to comment.