python · brandtbucher · Oct 4, 2023 · Sep 26, 2023 · Sep 26, 2023 · Sep 26, 2023
diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h
@@ -86,10 +86,6 @@ typedef struct _object_stats {
  uint64_t type_cache_dunder_hits;
  uint64_t type_cache_dunder_misses;
  uint64_t type_cache_collisions;
- uint64_t optimization_attempts;
- uint64_t optimization_traces_created;
- uint64_t optimization_traces_executed;
- uint64_t optimization_uops_executed;
  /* Temporary value used during GC */
  uint64_t object_visits;
 } ObjectStats;
@@ -100,10 +96,34 @@ typedef struct _gc_stats {
  uint64_t objects_collected;
 } GCStats;
 
+typedef struct _uop_stats {
+ uint64_t execution_count;
+} UOpStats;
+
+#define _Py_UOP_HIST_SIZE 5
+
+typedef struct _optimization_stats {
+ uint64_t attempts;
+ uint64_t traces_created;
+ uint64_t traces_executed;
+ uint64_t uops_executed;
+ uint64_t trace_stack_overflow;
+ uint64_t trace_stack_underflow;
+ uint64_t trace_too_long;
+ uint64_t inner_loop;
+ uint64_t recursive_call;
+ UOpStats opcode[512];
+ uint64_t unsupported_opcode[256];
+ uint64_t trace_length_hist[_Py_UOP_HIST_SIZE];
+ uint64_t trace_run_length_hist[_Py_UOP_HIST_SIZE];
+ uint64_t optimized_trace_length_hist[_Py_UOP_HIST_SIZE];
+} OptimizationStats;
+
 typedef struct _stats {
  OpcodeStats opcode_stats[256];
  CallStats call_stats;
  ObjectStats object_stats;
+ OptimizationStats optimization_stats;
  GCStats *gc_stats;
 } PyStats;
 

diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h
@@ -282,6 +282,10 @@ extern int _PyStaticCode_Init(PyCodeObject *co);
 #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) \
  do { if (_Py_stats && PyFunction_Check(callable)) _Py_stats->call_stats.eval_calls[name]++; } while (0)
 #define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0)
+#define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0)
+#define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0)
+#define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0)
+#define OPT_HIST(length, name) do { if (_Py_stats) _Py_stats->optimization_stats.name[_Py_bit_length((length - 1) >> 2)]++; } while (0)
 
 // Export for '_opcode' shared extension
 PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void);
@@ -296,6 +300,10 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void);
 #define EVAL_CALL_STAT_INC(name) ((void)0)
 #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0)
 #define GC_STAT_ADD(gen, name, n) ((void)0)
+#define OPT_STAT_INC(name) ((void)0)
+#define UOP_EXE_INC(opname) ((void)0)
+#define OPT_UNSUPPORTED_OPCODE(opname) ((void)0)
+#define OPT_HIST(length, name) ((void)0)
 #endif // !Py_STATS
 
 // Utility functions for reading/writing 32/64-bit values in the inline caches.

@@ -2244,7 +2244,7 @@ dummy_func(
  // Double-check that the opcode isn't instrumented or something:
  here->op.code == JUMP_BACKWARD)
  {
- OBJECT_STAT_INC(optimization_attempts);
+ OPT_STAT_INC(attempts);
  int optimized = _PyOptimizer_BackEdge(frame, here, next_instr, stack_pointer);
  ERROR_IF(optimized < 0, error);
  if (optimized) {
@@ -3913,6 +3913,7 @@ dummy_func(
  frame->prev_instr--; // Back up to just before destination
  _PyFrame_SetStackPointer(frame, stack_pointer);
  Py_DECREF(self);
+ OPT_HIST(pc, trace_run_length_hist);
  return frame;
  }
 

diff --git a/Python/executor.c b/Python/executor.c
@@ -2,6 +2,7 @@
 
 #include "opcode.h"
 
+#include "pycore_bitutils.h"
 #include "pycore_call.h"
 #include "pycore_ceval.h"
 #include "pycore_dict.h"
@@ -30,6 +31,16 @@
  goto deoptimize; \
  }
 
+#ifdef Py_STATS
+// Disable these macros that apply to Tier 1 stats when we are in Tier 2
+#undef STAT_INC
+#define STAT_INC(opname, name) ((void)0)
+#undef STAT_DEC
+#define STAT_DEC(opname, name) ((void)0)
+#undef CALL_STAT_INC
+#define CALL_STAT_INC(name) ((void)0)
+#endif
+
 #undef ENABLE_SPECIALIZATION
 #define ENABLE_SPECIALIZATION 0
 
@@ -62,7 +73,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
 
  CHECK_EVAL_BREAKER();
 
- OBJECT_STAT_INC(optimization_traces_executed);
+ OPT_STAT_INC(traces_executed);
  _Py_CODEUNIT *ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive;
  int pc = 0;
  int opcode;
@@ -81,7 +92,8 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
  operand,
  (int)(stack_pointer - _PyFrame_Stackbase(frame)));
  pc++;
- OBJECT_STAT_INC(optimization_uops_executed);
+ OPT_STAT_INC(uops_executed);
+ UOP_EXE_INC(opcode);
  switch (opcode) {
 
 #include "executor_cases.c.h"
@@ -114,6 +126,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
  // On ERROR_IF we return NULL as the frame.
  // The caller recovers the frame from tstate->current_frame.
  DPRINTF(2, "Error: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
+ OPT_HIST(pc, trace_run_length_hist);
  _PyFrame_SetStackPointer(frame, stack_pointer);
  Py_DECREF(self);
  return NULL;
@@ -122,6 +135,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
  // On DEOPT_IF we just repeat the last instruction.
  // This presumes nothing was popped from the stack (nor pushed).
  DPRINTF(2, "DEOPT: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand);
+ OPT_HIST(pc, trace_run_length_hist);
  frame->prev_instr--; // Back up to just before destination
  _PyFrame_SetStackPointer(frame, stack_pointer);
  Py_DECREF(self);

diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h
diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h
@@ -461,6 +461,7 @@ translate_bytecode_to_trace(
  if (trace_length + (n) > max_length) { \
  DPRINTF(2, "No room for %s (need %d, got %d)\n", \
  (opname), (n), max_length - trace_length); \
+ OPT_STAT_INC(trace_too_long); \
  goto done; \
  } \
  reserved = (n); // Keep ADD_TO_TRACE / ADD_TO_STUB honest
@@ -472,6 +473,7 @@ translate_bytecode_to_trace(
 #define TRACE_STACK_PUSH() \
  if (trace_stack_depth >= TRACE_STACK_SIZE) { \
  DPRINTF(2, "Trace stack overflow\n"); \
+ OPT_STAT_INC(trace_stack_overflow); \
  ADD_TO_TRACE(_SET_IP, 0, 0); \
  goto done; \
  } \
@@ -572,6 +574,7 @@ translate_bytecode_to_trace(
  ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0);
  }
  else {
+ OPT_STAT_INC(inner_loop);
  DPRINTF(2, "JUMP_BACKWARD not to top ends trace\n");
  }
  goto done;
@@ -638,7 +641,9 @@ translate_bytecode_to_trace(
  // LOAD_CONST + _POP_FRAME.
  if (trace_stack_depth == 0) {
  DPRINTF(2, "Trace stack underflow\n");
- goto done;}
+ OPT_STAT_INC(trace_stack_underflow);
+ goto done;
+ }
  }
  uint32_t orig_oparg = oparg; // For OPARG_TOP/BOTTOM
  for (int i = 0; i < nuops; i++) {
@@ -713,6 +718,7 @@ translate_bytecode_to_trace(
  PyUnicode_AsUTF8(new_code->co_qualname),
  PyUnicode_AsUTF8(new_code->co_filename),
  new_code->co_firstlineno);
+ OPT_STAT_INC(recursive_call);
  ADD_TO_TRACE(_SET_IP, 0, 0);
  goto done;
  }
@@ -744,6 +750,7 @@ translate_bytecode_to_trace(
  break;
  }
  DPRINTF(2, "Unsupported opcode %s\n", uop_name(opcode));
+ OPT_UNSUPPORTED_OPCODE(opcode);
  goto done; // Break out of loop
  } // End default
 
@@ -891,7 +898,8 @@ uop_optimize(
  // Error or nothing translated
  return trace_length;
  }
- OBJECT_STAT_INC(optimization_traces_created);
+ OPT_HIST(trace_length, trace_length_hist);
+ OPT_STAT_INC(traces_created);
  char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE");
  if (uop_optimize != NULL && *uop_optimize > '0') {
  trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries);
@@ -901,6 +909,7 @@ uop_optimize(
  if (executor == NULL) {
  return -1;
  }
+ OPT_HIST(trace_length, optimized_trace_length_hist);
  executor->base.execute = _PyUopExecute;
  memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
  *exec_ptr = (_PyExecutorObject *)executor;

diff --git a/Python/specialize.c b/Python/specialize.c
@@ -199,10 +199,6 @@ print_object_stats(FILE *out, ObjectStats *stats)
  fprintf(out, "Object method cache collisions: %" PRIu64 "\n", stats->type_cache_collisions);
  fprintf(out, "Object method cache dunder hits: %" PRIu64 "\n", stats->type_cache_dunder_hits);
  fprintf(out, "Object method cache dunder misses: %" PRIu64 "\n", stats->type_cache_dunder_misses);
- fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->optimization_attempts);
- fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->optimization_traces_created);
- fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->optimization_traces_executed);
- fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->optimization_uops_executed);
 }
 
 static void
@@ -215,13 +211,63 @@ print_gc_stats(FILE *out, GCStats *stats)
  }
 }
 
+static void
+print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE])
+{
+ for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) {
+ fprintf(out, "%s[%d]: %" PRIu64 "\n", name, (1 << (i + 2)), hist[i]);
+ }
+}
+
+static void
+print_optimization_stats(FILE *out, OptimizationStats *stats)
+{
+ fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->attempts);
+ fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created);
+ fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->traces_executed);
+ fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed);
+ fprintf(out, "Optimization trace stack overflow: %" PRIu64 "\n", stats->trace_stack_overflow);
+ fprintf(out, "Optimization trace stack underflow: %" PRIu64 "\n", stats->trace_stack_underflow);
+ fprintf(out, "Optimization trace too long: %" PRIu64 "\n", stats->trace_too_long);
+ fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop);
+ fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call);
+
+ print_histogram(out, "Trace length", stats->trace_length_hist);
+ print_histogram(out, "Trace run length", stats->trace_run_length_hist);
+ print_histogram(out, "Optimized trace length", stats->optimized_trace_length_hist);
+
+ const char* const* names;
+ for (int i = 0; i < 512; i++) {
+ if (i < 256) {
+ names = _PyOpcode_OpName;
+ } else {
+ names = _PyOpcode_uop_name;
+ }
+ if (stats->opcode[i].execution_count) {
+ fprintf(out, "uops[%s].execution_count : %" PRIu64 "\n", names[i], stats->opcode[i].execution_count);
+ }
+ }
+
+ for (int i = 0; i < 256; i++) {
+ if (stats->unsupported_opcode[i]) {
+ fprintf(
+ out, 
+ "unsupported_opcode[%s].count : %" PRIu64 "\n", 
+ _PyOpcode_OpName[i], 
+ stats->unsupported_opcode[i]
+ );
+ }
+ }
+}
+
 static void
 print_stats(FILE *out, PyStats *stats)
 {
  print_spec_stats(out, stats->opcode_stats);
  print_call_stats(out, &stats->call_stats);
  print_object_stats(out, &stats->object_stats);
  print_gc_stats(out, stats->gc_stats);
+ print_optimization_stats(out, &stats->optimization_stats);
 }
 
 void