Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 160 additions & 15 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@

#include <atomic>
#include <ctime>
#include <iostream>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we do not need these additional headers now?

Copy link
Author

@OliverRietmann OliverRietmann Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed <iostream> and <format>. I left <string> because it's used in std::string nEventReport;.

#include <format>
#include <string>

#ifndef _WIN32
#include <unistd.h>
Expand Down Expand Up @@ -212,6 +215,89 @@ int32_t GPUReconstructionCPU::ExitDevice()
return 0;
}

namespace
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would propose we try to move additional functions to GPUReconstructionDebug.cxx, in order not to blow up GPUReconstructionGPU.cxx with timing and debug code.

Copy link
Author

@OliverRietmann OliverRietmann Mar 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved everything to GPUReconstructionDebug.cxx with the latest commit.

{
void writeHeaderMarkdown(std::ostream& stream)
{
stream << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n";
stream << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n";
}

void writeHeaderCSV(std::ostream& stream)
{
stream << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n";
}

struct Row {
char type = ' ';
std::string name;
uint32_t count = 0;
double gpu_time = -1.0;
double cpu_time = -1.0;
double total_time = -1.0;
uint32_t memSize = 0;
uint32_t statNEvents;

void writeMarkdown(std::ostream& stream)
{
double scale = 1000000.0 / statNEvents;
stream << "| " << type << " | ";
if (count != 0)
stream << std::format("{:6} |", count);
else
stream << " |";
stream << std::format(" {:42}|", name);
if (gpu_time != -1.0)
stream << std::format("{:10.0f} |", gpu_time * scale);
else
stream << " |";
if (cpu_time != -1.0)
stream << std::format("{:10.0f} |", cpu_time * scale);
else
stream << " |";
if (cpu_time != -1.0 && total_time != -1.0)
stream << std::format("{:8.2f} |", cpu_time / total_time);
else
stream << " |";
if (total_time != -1.0)
stream << std::format("{:10.0f} |", total_time * scale);
else
stream << " |";
if (memSize != 0 && count != 0)
stream << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count);
else
stream << " | | |";
stream << std::endl;
}

void writeCSV(std::ostream& stream)
{
double scale = 1000000.0 / statNEvents;
stream << type << ",";
if (count != 0)
stream << count;
stream << "," << name << ",";
if (gpu_time != -1.0)
stream << std::format("{:.0f}", gpu_time * scale);
stream << ",";
if (cpu_time != -1.0)
stream << std::format("{:.0f}", cpu_time * scale);
stream << ",";
if (cpu_time != -1.0 && total_time != -1.0)
stream << std::format("{:.2f}", cpu_time / total_time);
stream << ",";
if (total_time != -1.0)
stream << std::format("{:.0f}", total_time * scale);
stream << ",";
if (memSize != 0 && count != 0)
stream << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count);
else
stream << ",,";
stream << std::endl;
}
};
} // namespace

int32_t GPUReconstructionCPU::RunChains()
{
mMemoryScalers->temporaryFactor = 1.;
Expand Down Expand Up @@ -256,15 +342,26 @@ int32_t GPUReconstructionCPU::RunChains()
PrintMemoryOverview();
}

mStatWallTime = (mTimerTotal.GetElapsedTime() * 1000000. / mStatNEvents);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mStatWallTime has a getter GetStatWallTime(), so you should update that variable.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added it again.

mStatWallTime = mTimerTotal.GetElapsedTime();
std::string nEventReport;
if (GetProcessingSettings().debugLevel >= 0 && mStatNEvents > 1) {
nEventReport += " (avergage of " + std::to_string(mStatNEvents) + " runs)";
}
double kernelTotal = 0;
std::vector<double> kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.);

std::ofstream benchmarkCSV;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also here, let's try to encapsulate timing code to GPUReconstructionDebug.cxx

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

if (!GetProcessingSettings().timingCSV.empty()) {
benchmarkCSV.open(GetProcessingSettings().timingCSV, std::ios::out | std::ios::app);
if (!benchmarkCSV.is_open()) {
GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str());
} else if (mNEventsProcessed == 1) {
writeHeaderCSV(benchmarkCSV);
}
}

if (GetProcessingSettings().debugLevel >= 1) {
writeHeaderMarkdown(std::cout);
for (uint32_t i = 0; i < mTimers.size(); i++) {
double time = 0;
if (mTimers[i] == nullptr) {
Expand All @@ -284,11 +381,19 @@ int32_t GPUReconstructionCPU::RunChains()
int32_t stepNum = getRecoStepNum(mTimers[i]->step);
kernelStepTimes[stepNum] += time;
}
char bandwidth[256] = "";
Row task_row;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer a more compact form, couldn't you initialize all the task_row members in one line with an initializer list?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed the class Row entirely, according to your comment below.

task_row.type = 'K';
task_row.name = mTimers[i]->name.c_str();
task_row.gpu_time = time;
task_row.count = mTimers[i]->count;
task_row.statNEvents = mStatNEvents;
if (mTimers[i]->memSize && mStatNEvents && time != 0.) {
snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count);
task_row.memSize = mTimers[i]->memSize;
}
if (benchmarkCSV.is_open()) {
task_row.writeCSV(benchmarkCSV);
}
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth);
task_row.writeMarkdown(std::cout);
if (GetProcessingSettings().resetTimers) {
mTimers[i]->count = 0;
mTimers[i]->memSize = 0;
Expand All @@ -298,16 +403,42 @@ int32_t GPUReconstructionCPU::RunChains()
if (GetProcessingSettings().recoTaskTiming) {
for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) {
if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) {
printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks",
gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime());
Row reco_step_row;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And actually, this is quite redundant. Also the benchmarkCSV.is_open()
How about instead we have a class debugTimingWriter, and then you just have one-liners here
timingWriter.write(kernelStepTimes[i], mTimersRecoSteps[i].timerCPU, mTimersRecoSteps[i].timerTotal.GetElapsedTime(), ...)
And inside you could do

if (writeCSV) {
...}
if (writeMarkdown) { 
...
} else {
 ... classic style
}

Then whatever old parsing scripts still exist, are still compatible. and it will reduce the lines of code in GPUReconstructionCPU.cxx.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. The default output should now be exactly as before.

reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)";
reco_step_row.gpu_time = kernelStepTimes[i];
reco_step_row.cpu_time = mTimersRecoSteps[i].timerCPU;
reco_step_row.total_time = mTimersRecoSteps[i].timerTotal.GetElapsedTime();
reco_step_row.statNEvents = mStatNEvents;
if (benchmarkCSV.is_open()) {
reco_step_row.writeCSV(benchmarkCSV);
}
reco_step_row.writeMarkdown(std::cout);
}
if (mTimersRecoSteps[i].bytesToGPU) {
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents,
mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU);
Row reco_step_row;
reco_step_row.type = 'D';
reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)";
reco_step_row.gpu_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime();
reco_step_row.memSize = mTimersRecoSteps[i].bytesToGPU;
reco_step_row.count = mTimersRecoSteps[i].countToGPU;
reco_step_row.statNEvents = mStatNEvents;
if (benchmarkCSV.is_open()) {
reco_step_row.writeCSV(benchmarkCSV);
}
reco_step_row.writeMarkdown(std::cout);
}
if (mTimersRecoSteps[i].bytesToHost) {
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents,
mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost);
Row reco_step_row;
reco_step_row.type = 'D';
reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)";
reco_step_row.gpu_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime();
reco_step_row.memSize = mTimersRecoSteps[i].bytesToHost;
reco_step_row.count = mTimersRecoSteps[i].countToHost;
reco_step_row.statNEvents = mStatNEvents;
if (benchmarkCSV.is_open()) {
reco_step_row.writeCSV(benchmarkCSV);
}
reco_step_row.writeMarkdown(std::cout);
}
if (GetProcessingSettings().resetTimers) {
mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0;
Expand All @@ -321,16 +452,30 @@ int32_t GPUReconstructionCPU::RunChains()
}
for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) {
if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) {
printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents);
Row general_step_row;
general_step_row.name = gpudatatypes::GENERAL_STEP_NAMES[i];
general_step_row.gpu_time = mTimersGeneralSteps[i].GetElapsedTime();
general_step_row.statNEvents = mStatNEvents;
if (benchmarkCSV.is_open()) {
general_step_row.writeCSV(benchmarkCSV);
}
general_step_row.writeMarkdown(std::cout);
}
}
Row wall_row;
wall_row.name = "Wall";
if (GetProcessingSettings().debugLevel >= 1) {
mStatKernelTime = kernelTotal * 1000000 / mStatNEvents;
printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str());
wall_row.gpu_time = kernelTotal;
}
wall_row.cpu_time = mStatCPUTime;
wall_row.total_time = mStatWallTime;
wall_row.statNEvents = mStatNEvents;
if (benchmarkCSV.is_open()) {
wall_row.writeCSV(benchmarkCSV);
}
printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str());
wall_row.writeMarkdown(std::cout);
} else if (GetProcessingSettings().debugLevel >= 0) {
GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime * 1000000 / mStatNEvents, nEventReport.c_str());
}
if (GetProcessingSettings().resetTimers) {
mStatNEvents = 0;
Expand Down
3 changes: 2 additions & 1 deletion GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -329,12 +329,13 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent,
AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)")
AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file")
AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6")
AddOption(timingCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.")
AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1))
AddOption(showOutputStat, bool, false, "", 0, "Print some track output statistics")
AddOption(runCompressionStatistics, bool, false, "compressionStat", 0, "Run statistics and verification for cluster compression")
AddOption(resetTimers, int8_t, 1, "", 0, "Reset timers every event")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you change this here to 0, the timers will not be reset anymore, thus when debug mode is enabled in O2, timers will not be reset between time frames.
I think instead, I would leave the default at 1 here, and in standalone.cxx I would change the overriding to if (...runsInit > 0 && ...timingCSV == 0) rec->SetResetTimers(iRun < configStandalone.runsInit)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there was a bug here. The argument passed to PROCresetTimers was always overwritten by runsInit, in the lines 630 and 688 of standalone.cxx: SetResetTimers(iRun < configStandalone.runsInit);. Here, I assume that PROCresetTimers == 1 means we reset after every run, not just after the first runsInit runs. In fact, I changed the default exactly so that we don't change the behavior.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But standalone.cxx is only for the standalone benchmark. We want to reset the timers by default for normal operation, e.g. we want to get timing per time frame. It is only for the standalone benchmark, where I want to get average values. We can gladly improve the behavior for the standalone benchmark, but the default here should remain 1 for non-standalone. That is why I proposed to have the if (...runsInit && timingCSV == 0) in standalone.cxx, so that you can write out csv files with timers reset every TF, and so that you can use resetTimers if you don't use the initRuns setting.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, changed.

AddOption(resetTimers, int8_t, 0, "", 0, "Reset timers every event")
AddOption(deviceTimers, bool, true, "", 0, "Use device timers instead of host-based time measurement")
AddOption(keepAllMemory, bool, false, "", 0, "Allocate all memory on both device and host, and do not reuse")
AddOption(keepDisplayMemory, bool, false, "", 0, "Like keepAllMemory, but only for memory required for event display")
Expand Down
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
if (configStandalone.runs > 1) {
printf("Run %d (thread %d)\n", iteration + 1, threadId);
}
recUse->SetResetTimers(iRun < configStandalone.runsInit);
recUse->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers);
if (configStandalone.outputcontrolmem) {
recUse->SetOutputControl(threadId ? outputmemoryPipeline.get() : outputmemory.get(), configStandalone.outputcontrolmem);
}
Expand Down Expand Up @@ -685,7 +685,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU
chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0;
}
chainTrackingAsync->mIOPtrs.clustersNative = nullptr;
recAsync->SetResetTimers(iRun < configStandalone.runsInit);
recAsync->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers);
tmpRetVal = recAsync->RunChains();
if (tmpRetVal == 0 || tmpRetVal == 2) {
OutputStat(chainTrackingAsync, nullptr, nullptr);
Expand Down
Loading