Reports: Including CPU instr and cycle counters in timing on native Linux

* With appropriate configuration on Linux this allows to get at very
  precise timing configuration so we can judge even small compile time
  improvements correctly. We then don't need many runs to average out
  noise from other effects.

* Don't use wall clock but process time for steps that are not doing
  IO like module optimization for more accurate values otherwise, it
  is however not very accurate still.
This commit is contained in:
Kay Hayen
2025-11-17 20:54:39 +01:00
parent e0c60a3fd0
commit c2482c7a6d
9 changed files with 323 additions and 21 deletions

View File

@@ -137,6 +137,7 @@
"Anext",
"appdirs",
"argmap",
"armv7l",
"Asend",
"asyncgen",
"asyncgens",
@@ -418,6 +419,7 @@
"strrchr",
"stubgen",
"subclasscheck",
"syscall",
"sysexit",
"sysflag",
"terminfo",

View File

@@ -51,7 +51,7 @@ def hasCachedImportedModuleUsageAttempts(module_name, source_code, source_ref):
# Bump this is format is changed or enhanced implementation might different ones.
_cache_format_version = 7
_cache_format_version = 8
def getCachedImportedModuleUsageAttempts(module_name, source_code, source_ref):
@@ -121,8 +121,8 @@ def getCachedImportedModuleUsageAttempts(module_name, source_code, source_ref):
# The Json doesn't store integer keys.
for pass_timing_info in data["timing_infos"]:
pass_timing_info[3] = dict(
(int(key), value) for (key, value) in pass_timing_info[3].items()
pass_timing_info[5] = dict(
(int(key), value) for (key, value) in pass_timing_info[5].items()
)
return result, data["timing_infos"]

View File

@@ -297,12 +297,19 @@ module_timing_infos = {}
ModuleOptimizationTimingInfo = makeNamedtupleClass(
"ModuleOptimizationTimingInfo",
("pass_number", "time_used", "micro_passes", "merge_counts"),
(
"pass_number",
"time_used",
"cpu_instr_count",
"cpu_cycles_count",
"micro_passes",
"merge_counts",
),
)
def addModuleOptimizationTimeInformation(
module_name, pass_number, time_used, micro_passes, merge_counts
module_name, pass_number, time_used, perf_counters, micro_passes, merge_counts
):
module_timing_info = list(module_timing_infos.get(module_name, []))
@@ -316,6 +323,8 @@ def addModuleOptimizationTimeInformation(
ModuleOptimizationTimingInfo(
pass_number=pass_number,
time_used=time_used,
cpu_instr_count=perf_counters[0],
cpu_cycles_count=perf_counters[1],
micro_passes=micro_passes,
merge_counts=merge_counts,
)

View File

@@ -13,6 +13,7 @@ be a "in (str, unicode)" rather than making useless version checks.
import pkgutil
import sys
import time
from hashlib import md5 as _md5
# pylint: disable=invalid-name,self-assigning-variable
@@ -178,6 +179,20 @@ except ValueError:
else:
md5 = _md5
try:
perf_counter = time.perf_counter
except AttributeError:
# Python2 didn't have it yet.
perf_counter = time.time
try:
process_time = time.process_time
except AttributeError:
# Python2 didn't have it yet.
process_time = time.time
# For PyLint to be happy.
assert long
assert unicode

View File

@@ -283,7 +283,11 @@ def makeOptimizationPass():
module_name = current_module.getFullName()
with TimerReport(
message="Optimizing %s" % module_name, decider=False
message="Optimizing %s" % module_name,
logger=optimization_logger,
decider=False,
include_sleep_time=False,
use_perf_counters=current_module.isCompiledPythonModule(),
) as module_timer:
changed, micro_passes = optimizeModule(current_module)
@@ -291,6 +295,7 @@ def makeOptimizationPass():
module_name=module_name,
pass_number=pass_count,
time_used=module_timer.getDelta(),
perf_counters=module_timer.getPerfCounters(),
micro_passes=micro_passes,
merge_counts=fetchMergeCounts(),
)

View File

@@ -429,6 +429,11 @@ def _addModulesToReport(root, report_input_data, diffable):
timing_xml_node.attrib["max_branch_merge"] = str(max_merge_size)
timing_xml_node.attrib["merged_total"] = str(merged_total)
if timing_info.cpu_instr_count is not None:
timing_xml_node.attrib["cpu_instr"] = str(timing_info.cpu_instr_count)
if timing_info.cpu_cycles_count is not None:
timing_xml_node.attrib["cpu_cycles"] = str(timing_info.cpu_cycles_count)
module_xml_node.append(timing_xml_node)
distributions = report_input_data["module_distribution_usages"][module_name]

View File

@@ -37,7 +37,7 @@ from nuitka.utils.Execution import (
)
from nuitka.utils.FileOperations import deleteFile
from nuitka.utils.Importing import getExtensionModuleSuffix
from nuitka.utils.Timing import StopWatch
from nuitka.utils.Timing import StopWatchWallClock
from nuitka.utils.Utils import isMacOS
@@ -77,7 +77,7 @@ def checkNoPermissionError(output):
def _getCPythonResults(cpython_cmd, send_kill):
stop_watch = StopWatch()
stop_watch = StopWatchWallClock()
# Try a compile of times for permission denied, on Windows it can
# be transient.
@@ -640,7 +640,7 @@ Taking coverage of '{filename}' using '{python}' with flags {args} ...""".format
if trace_command:
my_print("Going to output directory", os.getcwd())
stop_watch = StopWatch()
stop_watch = StopWatchWallClock()
stop_watch.start()
if not two_step_execution:

212
nuitka/utils/Profiling.py Normal file
View File

@@ -0,0 +1,212 @@
# Copyright 2025, Kay Hayen, mailto:kay.hayen@gmail.com find license text at end of file
""" Tools for profiling.
Currently using perf kernel interface for native Linux are the only thing exposed,
they don't work inside of containers though, or with default security settings.
Execute that setting permanent weaker:
echo 'kernel.perf_event_paranoid = 1' | sudo tee /etc/sysctl.d/99-nuitka-perf.conf
sudo sysctl --system
Temporary:
sudo sh -c 'echo 1 > /proc/sys/kernel/perf_event_paranoid'
"""
from .Utils import isLinux
# isort: start
if not isLinux():
# TODO: Windows and MacOS support need other solutions.
def hasPerfProfilingSupport():
return False
PerfCounters = None
else:
import ctypes
import ctypes.util
import fcntl # Linux only, pylint: disable=I0021,import-error
import os
import platform
import struct
def hasPerfProfilingSupport():
return isLinux() and getPerfFileHandle(PERF_COUNT_HW_INSTRUCTIONS) is not None
# The syscall number for perf_event_open varies by architecture.
ARCH_MAP = {
"x86_64": 298,
"i386": 336,
"aarch64": 241,
"armv7l": 364,
}
def getPerfFileHandle(config, group_fd=-1, read_format=0):
SYS_perf_event_open = ARCH_MAP.get(platform.machine())
if SYS_perf_event_open is None:
# This is not a hard error, we just cannot do perf.
return None
# --- Load libc and define syscall function ---
libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True)
syscall = libc.syscall
syscall.restype = ctypes.c_long
attr = PerfEventAttr(
type=PERF_TYPE_HARDWARE,
size=ctypes.sizeof(PerfEventAttr),
config=config,
read_format=read_format,
# flags for perf_event_open syscall:
# Bit 0: disabled -> if set start in disabled state
# Bit 17: exclude_kernel -> if set only count user space
# Bit 18: exclude_hv -> if set exclude hypervisor
flags=(1 << 0) | (1 << 17) | (1 << 18),
)
# We must explicitly cast every argument to its correct 64-bit C type to
# satisfy the variadic nature of the syscall ABI.
fd = syscall(
ctypes.c_long(SYS_perf_event_open),
ctypes.byref(attr),
ctypes.c_long(0), # pid
ctypes.c_long(-1), # cpu
ctypes.c_long(group_fd),
ctypes.c_long(0), # flags
)
if fd < 0:
return None
return fd
# See 'man perf_event_open' for details
PERF_TYPE_HARDWARE = 0
PERF_COUNT_HW_REF_CPU_CYCLES = 9
PERF_COUNT_HW_INSTRUCTIONS = 1
class _PerfEventAttrUnion(ctypes.Union):
_fields_ = [
("sample_period", ctypes.c_uint64),
("sample_freq", ctypes.c_uint64),
]
class PerfEventAttr(ctypes.Structure):
# This structure must match the C structure, otherwise the kernel will
# reject it with E2BIG, because the size field will not be what it
# expects. spell-checker: ignore clockid,regs
_fields_ = [
("type", ctypes.c_uint32),
("size", ctypes.c_uint32),
("config", ctypes.c_uint64),
("read_format", ctypes.c_uint64),
("_union1", _PerfEventAttrUnion),
("sample_type", ctypes.c_uint64),
("read_format", ctypes.c_uint64),
("flags", ctypes.c_uint64),
# More fields for the union
("wakeup_events", ctypes.c_uint32),
("wakeup_watermark", ctypes.c_uint32),
("bp_type", ctypes.c_uint32),
("bp_addr", ctypes.c_uint64),
("bp_len", ctypes.c_uint64),
("branch_sample_type", ctypes.c_uint64),
("sample_regs_user", ctypes.c_uint64),
("sample_stack_user", ctypes.c_uint32),
("clockid", ctypes.c_int32),
("sample_regs_intr", ctypes.c_uint64),
("aux_watermark", ctypes.c_uint32),
("sample_max_stack", ctypes.c_uint16),
("__reserved_2", ctypes.c_uint16),
("aux_sample_size", ctypes.c_uint32),
("__reserved_3", ctypes.c_uint32),
("sig_data", ctypes.c_uint64),
]
# --- Define ioctl magic numbers ---
# These are from <linux/perf_event.h>
PERF_EVENT_IOC_ENABLE = 0x2400
PERF_EVENT_IOC_DISABLE = 0x2401
PERF_EVENT_IOC_RESET = 0x2403
class PerfCounter(object):
"""
A simple in-process wrapper for a single perf counter.
"""
def __init__(self, config, group_fd=-1, read_format=0):
self.fd = getPerfFileHandle(
config, group_fd=group_fd, read_format=read_format
)
assert self.fd is not None
def reset(self):
fcntl.ioctl(self.fd, PERF_EVENT_IOC_RESET, 0)
def start(self):
self.reset()
fcntl.ioctl(self.fd, PERF_EVENT_IOC_ENABLE, 0)
def stop(self):
try:
fcntl.ioctl(self.fd, PERF_EVENT_IOC_DISABLE, 0)
except OSError:
pass
def read(self):
# Reads the 8-byte (64-bit) counter value
count_bytes = os.read(self.fd, 8)
return struct.unpack("Q", count_bytes)[0]
def close(self):
os.close(self.fd)
class PerfCounters(object):
def __init__(self):
# Create the instruction counter as the group leader.
self.instr_counter = PerfCounter(config=PERF_COUNT_HW_INSTRUCTIONS)
# Create the cycle counter as a member of the same group.
self.cycle_counter = PerfCounter(
config=PERF_COUNT_HW_REF_CPU_CYCLES, group_fd=self.instr_counter.fd
)
def start(self):
# Enabling the group leader enables all counters in the group.
self.instr_counter.start()
def stop(self):
# Disabling the group leader disables all counters in the group.
self.instr_counter.stop()
def getValues(self):
instr_count = self.instr_counter.read()
cycle_count = self.cycle_counter.read()
self.instr_counter.close()
self.cycle_counter.close()
return instr_count, cycle_count
# Part of "Nuitka", an optimizing Python compiler that is compatible and
# integrates with CPython, but also works on its own.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@@ -8,27 +8,40 @@ call an external tool.
"""
from contextlib import contextmanager
from timeit import default_timer as timer
from nuitka.__past__ import StringIO
from nuitka.__past__ import StringIO, perf_counter, process_time
from nuitka.Tracing import general
from .Profiling import PerfCounters, hasPerfProfilingSupport
class StopWatch(object):
__slots__ = ("start_time", "end_time")
has_perf_counters = hasPerfProfilingSupport()
def __init__(self):
class StopWatchWallClockBase(object):
__slots__ = ("start_time", "end_time", "perf_counters")
# For overload, pylint: disable=not-callable
timer = None
def __init__(self, use_perf_counters=False):
self.start_time = None
self.end_time = None
self.perf_counters = PerfCounters() if use_perf_counters else None
def start(self):
self.start_time = timer()
if self.perf_counters is not None:
self.perf_counters.start()
self.start_time = self.timer()
def restart(self):
self.start()
def end(self):
self.end_time = timer()
self.end_time = self.timer()
if self.perf_counters is not None:
self.perf_counters.stop()
stop = end
@@ -36,7 +49,21 @@ class StopWatch(object):
if self.end_time is not None:
return self.end_time - self.start_time
else:
return timer() - self.start_time
return self.timer() - self.start_time
def getPerfCounters(self):
if self.perf_counters is not None:
return self.perf_counters.getValues()
else:
return None, None
class StopWatchWallClock(StopWatchWallClockBase):
timer = perf_counter
class StopWatchProcessClock(StopWatchWallClockBase):
timer = process_time
class TimerReport(object):
@@ -45,9 +72,25 @@ class TimerReport(object):
Mostly intended as a wrapper for external process calls.
"""
__slots__ = ("message", "decider", "logger", "timer", "min_report_time")
__slots__ = (
"message",
"decider",
"logger",
"timer",
"min_report_time",
"include_sleep_time",
"use_perf_counters",
)
def __init__(self, message, logger=None, decider=True, min_report_time=None):
def __init__(
self,
message,
logger=None,
decider=True,
min_report_time=None,
include_sleep_time=True,
use_perf_counters=None,
):
self.message = message
# Shortcuts.
@@ -64,12 +107,23 @@ class TimerReport(object):
self.min_report_time = min_report_time
self.timer = None
self.include_sleep_time = include_sleep_time
if use_perf_counters is None:
use_perf_counters = not self.include_sleep_time
# They might not be allowed.
self.use_perf_counters = use_perf_counters and has_perf_counters
def getTimer(self):
return self.timer
def __enter__(self):
self.timer = StopWatch()
stop_stop_class = (
StopWatchWallClock if self.include_sleep_time else StopWatchProcessClock
)
self.timer = stop_stop_class(self.use_perf_counters)
self.timer.start()
return self.timer
@@ -96,7 +150,7 @@ def withProfiling(name, logger, enabled):
from nuitka.Options import getOutputPath
pr = cProfile.Profile()
pr = cProfile.Profile(timer=process_time)
pr.enable()
yield