Reports: Including CPU instr and cycle counters in timing on native Linux

* With appropriate configuration on Linux this allows to get at very precise timing configuration so we can judge even small compile time improvements correctly. We then don't need many runs to average out noise from other effects. * Don't use wall clock but process time for steps that are not doing IO like module optimization for more accurate values otherwise, it is however not very accurate still.
2025-12-14 20:35:49 +01:00 · 2025-11-17 20:54:39 +01:00
parent e0c60a3fd0
commit c2482c7a6d
9 changed files with 323 additions and 21 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -137,6 +137,7 @@
    "Anext",
    "appdirs",
    "argmap",
+    "armv7l",
    "Asend",
    "asyncgen",
    "asyncgens",
@@ -418,6 +419,7 @@
    "strrchr",
    "stubgen",
    "subclasscheck",
+    "syscall",
    "sysexit",
    "sysflag",
    "terminfo",
--- a/nuitka/BytecodeCaching.py
+++ b/nuitka/BytecodeCaching.py
@@ -51,7 +51,7 @@ def hasCachedImportedModuleUsageAttempts(module_name, source_code, source_ref):


 # Bump this is format is changed or enhanced implementation might different ones.
-_cache_format_version = 7
+_cache_format_version = 8


 def getCachedImportedModuleUsageAttempts(module_name, source_code, source_ref):
@@ -121,8 +121,8 @@ def getCachedImportedModuleUsageAttempts(module_name, source_code, source_ref):

    # The Json doesn't store integer keys.
    for pass_timing_info in data["timing_infos"]:
-        pass_timing_info[3] = dict(
-            (int(key), value) for (key, value) in pass_timing_info[3].items()
+        pass_timing_info[5] = dict(
+            (int(key), value) for (key, value) in pass_timing_info[5].items()
        )

    return result, data["timing_infos"]
--- a/nuitka/ModuleRegistry.py
+++ b/nuitka/ModuleRegistry.py
@@ -297,12 +297,19 @@ module_timing_infos = {}

 ModuleOptimizationTimingInfo = makeNamedtupleClass(
    "ModuleOptimizationTimingInfo",
-    ("pass_number", "time_used", "micro_passes", "merge_counts"),
+    (
+        "pass_number",
+        "time_used",
+        "cpu_instr_count",
+        "cpu_cycles_count",
+        "micro_passes",
+        "merge_counts",
+    ),
 )


 def addModuleOptimizationTimeInformation(
-    module_name, pass_number, time_used, micro_passes, merge_counts
+    module_name, pass_number, time_used, perf_counters, micro_passes, merge_counts
 ):
    module_timing_info = list(module_timing_infos.get(module_name, []))

@@ -316,6 +323,8 @@ def addModuleOptimizationTimeInformation(
        ModuleOptimizationTimingInfo(
            pass_number=pass_number,
            time_used=time_used,
+            cpu_instr_count=perf_counters[0],
+            cpu_cycles_count=perf_counters[1],
            micro_passes=micro_passes,
            merge_counts=merge_counts,
        )
--- a/nuitka/past.py
+++ b/nuitka/past.py
@@ -13,6 +13,7 @@ be a "in (str, unicode)" rather than making useless version checks.

 import pkgutil
 import sys
+import time
 from hashlib import md5 as _md5

 # pylint: disable=invalid-name,self-assigning-variable
@@ -178,6 +179,20 @@ except ValueError:
 else:
    md5 = _md5

+
+try:
+    perf_counter = time.perf_counter
+except AttributeError:
+    # Python2 didn't have it yet.
+    perf_counter = time.time
+
+try:
+    process_time = time.process_time
+except AttributeError:
+    # Python2 didn't have it yet.
+    process_time = time.time
+
+
 # For PyLint to be happy.
 assert long
 assert unicode
--- a/nuitka/optimizations/Optimization.py
+++ b/nuitka/optimizations/Optimization.py
@@ -283,7 +283,11 @@ def makeOptimizationPass():
        module_name = current_module.getFullName()

        with TimerReport(
-            message="Optimizing %s" % module_name, decider=False
+            message="Optimizing %s" % module_name,
+            logger=optimization_logger,
+            decider=False,
+            include_sleep_time=False,
+            use_perf_counters=current_module.isCompiledPythonModule(),
        ) as module_timer:
            changed, micro_passes = optimizeModule(current_module)

@@ -291,6 +295,7 @@ def makeOptimizationPass():
            module_name=module_name,
            pass_number=pass_count,
            time_used=module_timer.getDelta(),
+            perf_counters=module_timer.getPerfCounters(),
            micro_passes=micro_passes,
            merge_counts=fetchMergeCounts(),
        )
--- a/nuitka/reports/Reports.py
+++ b/nuitka/reports/Reports.py
@@ -429,6 +429,11 @@ def _addModulesToReport(root, report_input_data, diffable):
                timing_xml_node.attrib["max_branch_merge"] = str(max_merge_size)
                timing_xml_node.attrib["merged_total"] = str(merged_total)

+            if timing_info.cpu_instr_count is not None:
+                timing_xml_node.attrib["cpu_instr"] = str(timing_info.cpu_instr_count)
+            if timing_info.cpu_cycles_count is not None:
+                timing_xml_node.attrib["cpu_cycles"] = str(timing_info.cpu_cycles_count)
+
            module_xml_node.append(timing_xml_node)

        distributions = report_input_data["module_distribution_usages"][module_name]
--- a/nuitka/tools/testing/compare_with_cpython/main.py
+++ b/nuitka/tools/testing/compare_with_cpython/main.py
@@ -37,7 +37,7 @@ from nuitka.utils.Execution import (
 )
 from nuitka.utils.FileOperations import deleteFile
 from nuitka.utils.Importing import getExtensionModuleSuffix
-from nuitka.utils.Timing import StopWatch
+from nuitka.utils.Timing import StopWatchWallClock
 from nuitka.utils.Utils import isMacOS


@@ -77,7 +77,7 @@ def checkNoPermissionError(output):


 def _getCPythonResults(cpython_cmd, send_kill):
-    stop_watch = StopWatch()
+    stop_watch = StopWatchWallClock()

    # Try a compile of times for permission denied, on Windows it can
    # be transient.
@@ -640,7 +640,7 @@ Taking coverage of '{filename}' using '{python}' with flags {args} ...""".format
        if trace_command:
            my_print("Going to output directory", os.getcwd())

-    stop_watch = StopWatch()
+    stop_watch = StopWatchWallClock()
    stop_watch.start()

    if not two_step_execution:
--- a/nuitka/utils/Profiling.py
+++ b/nuitka/utils/Profiling.py
@@ -0,0 +1,212 @@
+#     Copyright 2025, Kay Hayen, mailto:kay.hayen@gmail.com find license text at end of file
+
+
+""" Tools for profiling.
+
+Currently using perf kernel interface for native Linux are the only thing exposed,
+they don't work inside of containers though, or with default security settings.
+
+Execute that setting permanent weaker:
+echo 'kernel.perf_event_paranoid = 1' | sudo tee /etc/sysctl.d/99-nuitka-perf.conf
+sudo sysctl --system
+
+Temporary:
+sudo sh -c 'echo 1 > /proc/sys/kernel/perf_event_paranoid'
+
+"""
+
+from .Utils import isLinux
+
+# isort: start
+
+if not isLinux():
+    # TODO: Windows and MacOS support need other solutions.
+
+    def hasPerfProfilingSupport():
+        return False
+
+    PerfCounters = None
+
+else:
+    import ctypes
+    import ctypes.util
+    import fcntl  # Linux only, pylint: disable=I0021,import-error
+    import os
+    import platform
+    import struct
+
+    def hasPerfProfilingSupport():
+        return isLinux() and getPerfFileHandle(PERF_COUNT_HW_INSTRUCTIONS) is not None
+
+    # The syscall number for perf_event_open varies by architecture.
+    ARCH_MAP = {
+        "x86_64": 298,
+        "i386": 336,
+        "aarch64": 241,
+        "armv7l": 364,
+    }
+
+    def getPerfFileHandle(config, group_fd=-1, read_format=0):
+        SYS_perf_event_open = ARCH_MAP.get(platform.machine())
+
+        if SYS_perf_event_open is None:
+            # This is not a hard error, we just cannot do perf.
+            return None
+
+        # --- Load libc and define syscall function ---
+        libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True)
+
+        syscall = libc.syscall
+        syscall.restype = ctypes.c_long
+
+        attr = PerfEventAttr(
+            type=PERF_TYPE_HARDWARE,
+            size=ctypes.sizeof(PerfEventAttr),
+            config=config,
+            read_format=read_format,
+            # flags for perf_event_open syscall:
+            # Bit 0: disabled -> if set start in disabled state
+            # Bit 17: exclude_kernel -> if set only count user space
+            # Bit 18: exclude_hv -> if set exclude hypervisor
+            flags=(1 << 0) | (1 << 17) | (1 << 18),
+        )
+
+        # We must explicitly cast every argument to its correct 64-bit C type to
+        # satisfy the variadic nature of the syscall ABI.
+        fd = syscall(
+            ctypes.c_long(SYS_perf_event_open),
+            ctypes.byref(attr),
+            ctypes.c_long(0),  # pid
+            ctypes.c_long(-1),  # cpu
+            ctypes.c_long(group_fd),
+            ctypes.c_long(0),  # flags
+        )
+
+        if fd < 0:
+            return None
+
+        return fd
+
+    # See 'man perf_event_open' for details
+    PERF_TYPE_HARDWARE = 0
+    PERF_COUNT_HW_REF_CPU_CYCLES = 9
+    PERF_COUNT_HW_INSTRUCTIONS = 1
+
+    class _PerfEventAttrUnion(ctypes.Union):
+        _fields_ = [
+            ("sample_period", ctypes.c_uint64),
+            ("sample_freq", ctypes.c_uint64),
+        ]
+
+    class PerfEventAttr(ctypes.Structure):
+        # This structure must match the C structure, otherwise the kernel will
+        # reject it with E2BIG, because the size field will not be what it
+        # expects. spell-checker: ignore clockid,regs
+        _fields_ = [
+            ("type", ctypes.c_uint32),
+            ("size", ctypes.c_uint32),
+            ("config", ctypes.c_uint64),
+            ("read_format", ctypes.c_uint64),
+            ("_union1", _PerfEventAttrUnion),
+            ("sample_type", ctypes.c_uint64),
+            ("read_format", ctypes.c_uint64),
+            ("flags", ctypes.c_uint64),
+            # More fields for the union
+            ("wakeup_events", ctypes.c_uint32),
+            ("wakeup_watermark", ctypes.c_uint32),
+            ("bp_type", ctypes.c_uint32),
+            ("bp_addr", ctypes.c_uint64),
+            ("bp_len", ctypes.c_uint64),
+            ("branch_sample_type", ctypes.c_uint64),
+            ("sample_regs_user", ctypes.c_uint64),
+            ("sample_stack_user", ctypes.c_uint32),
+            ("clockid", ctypes.c_int32),
+            ("sample_regs_intr", ctypes.c_uint64),
+            ("aux_watermark", ctypes.c_uint32),
+            ("sample_max_stack", ctypes.c_uint16),
+            ("__reserved_2", ctypes.c_uint16),
+            ("aux_sample_size", ctypes.c_uint32),
+            ("__reserved_3", ctypes.c_uint32),
+            ("sig_data", ctypes.c_uint64),
+        ]
+
+    # --- Define ioctl magic numbers ---
+    # These are from <linux/perf_event.h>
+    PERF_EVENT_IOC_ENABLE = 0x2400
+    PERF_EVENT_IOC_DISABLE = 0x2401
+    PERF_EVENT_IOC_RESET = 0x2403
+
+    class PerfCounter(object):
+        """
+        A simple in-process wrapper for a single perf counter.
+        """
+
+        def __init__(self, config, group_fd=-1, read_format=0):
+            self.fd = getPerfFileHandle(
+                config, group_fd=group_fd, read_format=read_format
+            )
+
+            assert self.fd is not None
+
+        def reset(self):
+            fcntl.ioctl(self.fd, PERF_EVENT_IOC_RESET, 0)
+
+        def start(self):
+            self.reset()
+            fcntl.ioctl(self.fd, PERF_EVENT_IOC_ENABLE, 0)
+
+        def stop(self):
+            try:
+                fcntl.ioctl(self.fd, PERF_EVENT_IOC_DISABLE, 0)
+            except OSError:
+                pass
+
+        def read(self):
+            # Reads the 8-byte (64-bit) counter value
+            count_bytes = os.read(self.fd, 8)
+            return struct.unpack("Q", count_bytes)[0]
+
+        def close(self):
+            os.close(self.fd)
+
+    class PerfCounters(object):
+        def __init__(self):
+            # Create the instruction counter as the group leader.
+            self.instr_counter = PerfCounter(config=PERF_COUNT_HW_INSTRUCTIONS)
+            # Create the cycle counter as a member of the same group.
+            self.cycle_counter = PerfCounter(
+                config=PERF_COUNT_HW_REF_CPU_CYCLES, group_fd=self.instr_counter.fd
+            )
+
+        def start(self):
+            # Enabling the group leader enables all counters in the group.
+            self.instr_counter.start()
+
+        def stop(self):
+            # Disabling the group leader disables all counters in the group.
+            self.instr_counter.stop()
+
+        def getValues(self):
+            instr_count = self.instr_counter.read()
+            cycle_count = self.cycle_counter.read()
+
+            self.instr_counter.close()
+            self.cycle_counter.close()
+
+            return instr_count, cycle_count
+
+
+#     Part of "Nuitka", an optimizing Python compiler that is compatible and
+#     integrates with CPython, but also works on its own.
+#
+#     Licensed under the Apache License, Version 2.0 (the "License");
+#     you may not use this file except in compliance with the License.
+#     You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#     Unless required by applicable law or agreed to in writing, software
+#     distributed under the License is distributed on an "AS IS" BASIS,
+#     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#     See the License for the specific language governing permissions and
+#     limitations under the License.
--- a/nuitka/utils/Timing.py
+++ b/nuitka/utils/Timing.py
@@ -8,27 +8,40 @@ call an external tool.
 """

 from contextlib import contextmanager
-from timeit import default_timer as timer

-from nuitka.__past__ import StringIO
+from nuitka.__past__ import StringIO, perf_counter, process_time
 from nuitka.Tracing import general

+from .Profiling import PerfCounters, hasPerfProfilingSupport

-class StopWatch(object):
-    __slots__ = ("start_time", "end_time")
+has_perf_counters = hasPerfProfilingSupport()

-    def __init__(self):
+
+class StopWatchWallClockBase(object):
+    __slots__ = ("start_time", "end_time", "perf_counters")
+
+    # For overload, pylint: disable=not-callable
+    timer = None
+
+    def __init__(self, use_perf_counters=False):
        self.start_time = None
        self.end_time = None

+        self.perf_counters = PerfCounters() if use_perf_counters else None
+
    def start(self):
-        self.start_time = timer()
+        if self.perf_counters is not None:
+            self.perf_counters.start()
+        self.start_time = self.timer()

    def restart(self):
        self.start()

    def end(self):
-        self.end_time = timer()
+        self.end_time = self.timer()
+
+        if self.perf_counters is not None:
+            self.perf_counters.stop()

    stop = end

@@ -36,7 +49,21 @@ class StopWatch(object):
        if self.end_time is not None:
            return self.end_time - self.start_time
        else:
-            return timer() - self.start_time
+            return self.timer() - self.start_time
+
+    def getPerfCounters(self):
+        if self.perf_counters is not None:
+            return self.perf_counters.getValues()
+        else:
+            return None, None
+
+
+class StopWatchWallClock(StopWatchWallClockBase):
+    timer = perf_counter
+
+
+class StopWatchProcessClock(StopWatchWallClockBase):
+    timer = process_time


 class TimerReport(object):
@@ -45,9 +72,25 @@ class TimerReport(object):
    Mostly intended as a wrapper for external process calls.
    """

-    __slots__ = ("message", "decider", "logger", "timer", "min_report_time")
+    __slots__ = (
+        "message",
+        "decider",
+        "logger",
+        "timer",
+        "min_report_time",
+        "include_sleep_time",
+        "use_perf_counters",
+    )

-    def __init__(self, message, logger=None, decider=True, min_report_time=None):
+    def __init__(
+        self,
+        message,
+        logger=None,
+        decider=True,
+        min_report_time=None,
+        include_sleep_time=True,
+        use_perf_counters=None,
+    ):
        self.message = message

        # Shortcuts.
@@ -64,12 +107,23 @@ class TimerReport(object):
        self.min_report_time = min_report_time

        self.timer = None
+        self.include_sleep_time = include_sleep_time
+
+        if use_perf_counters is None:
+            use_perf_counters = not self.include_sleep_time
+
+        # They might not be allowed.
+        self.use_perf_counters = use_perf_counters and has_perf_counters

    def getTimer(self):
        return self.timer

    def __enter__(self):
-        self.timer = StopWatch()
+        stop_stop_class = (
+            StopWatchWallClock if self.include_sleep_time else StopWatchProcessClock
+        )
+        self.timer = stop_stop_class(self.use_perf_counters)
+
        self.timer.start()

        return self.timer
@@ -96,7 +150,7 @@ def withProfiling(name, logger, enabled):

        from nuitka.Options import getOutputPath

-        pr = cProfile.Profile()
+        pr = cProfile.Profile(timer=process_time)
        pr.enable()

        yield