Skip to content

Commit

Permalink
add uop splitting for tile loads and stores
Browse files Browse the repository at this point in the history
remove dead code from old idea for above
clean up unused code
still need to add stat events
  • Loading branch information
Michael Allen Goldstein committed Mar 6, 2024
1 parent f8e2fd2 commit d680a9e
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 166 deletions.
25 changes: 0 additions & 25 deletions src/trace_read.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,30 +69,6 @@ POSSIBILITY OF SUCH DAMAGE.
/// read/write required size only.
///////////////////////////////////////////////////////////////////////////////////////////////

/**
 * In-memory image of the 64-byte AMX tile configuration block.
 *
 * The fields mirror the byte layout of the configuration data so that the
 * block can be copied verbatim from memory into this struct:
 *   byte  0      palette
 *   byte  1      start_row
 *   bytes 2-15   reserved, must be 0
 *   bytes 16-31  tileN colsb (N = 0..7, two bytes each)
 *   bytes 32-47  reserved, must be 0
 *   bytes 48-55  tileN rows (N = 0..7, one byte each)
 *   bytes 56-63  reserved, must be 0
 *
 * All members are naturally aligned at these offsets, so no padding is
 * inserted and the struct is exactly 64 bytes.
 */
typedef struct Tile_info {
  uint8_t palette;       // selects the supported configuration of the tiles that will be used
  uint8_t start_row;     // used for storing the restart values for interrupted operations
  uint8_t buf0[14];      // reserved, must be 0 (bytes 2-15)
  uint16_t tile0_colsb;  // Tile 0 bytes per row
  uint16_t tile1_colsb;  // Tile 1 bytes per row
  uint16_t tile2_colsb;  // Tile 2 bytes per row
  uint16_t tile3_colsb;  // Tile 3 bytes per row
  uint16_t tile4_colsb;  // Tile 4 bytes per row
  uint16_t tile5_colsb;  // Tile 5 bytes per row
  uint16_t tile6_colsb;  // Tile 6 bytes per row
  uint16_t tile7_colsb;  // Tile 7 bytes per row
  // Bytes 32-47 are reserved: that is 16 bytes, not 14. With only 14 bytes the
  // row fields land at offset 46 and the struct is 62 bytes, so a 64-byte copy
  // of the configuration block would run past the end of the object.
  uint8_t buf1[16];      // reserved, must be 0 (bytes 32-47)
  uint8_t tile0_rows;    // Tile 0 rows
  uint8_t tile1_rows;    // Tile 1 rows
  uint8_t tile2_rows;    // Tile 2 rows
  uint8_t tile3_rows;    // Tile 3 rows
  uint8_t tile4_rows;    // Tile 4 rows
  uint8_t tile5_rows;    // Tile 5 rows
  uint8_t tile6_rows;    // Tile 6 rows
  uint8_t tile7_rows;    // Tile 7 rows
  uint8_t buf2[8];       // reserved, must be 0 (bytes 56-63)
} tile_info_t;

// Guard the layout: the configuration block is copied as a raw 64-byte image.
static_assert(sizeof(tile_info_t) == 64,
              "tile_info_t must match the 64-byte tile configuration layout");

typedef struct trace_info_s {
trace_info_s();
virtual ~trace_info_s();
Expand All @@ -111,7 +87,6 @@ typedef struct trace_info_cpu_s {
bool m_write_flg; /**< write flag */
uint8_t m_num_ld; /**< number of load operations */
uint8_t m_size; /**< instruction size */
tile_info_t m_tile_info; /** info regarding tile configuration */
// dynamic information
uint64_t m_ld_vaddr1; /**< load address 1 */
uint64_t m_ld_vaddr2; /**< load address 2 */
Expand Down
121 changes: 97 additions & 24 deletions src/trace_read_cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,6 @@ void cpu_decoder_c::convert_dyn_uop(inst_info_s *info, void *trace_info,
// TODO: create STAT_EVENT_N calls for this
}
}
// TODO: handle tileconfig? maybe not

// next pc
trace_uop->m_npc = trace_uop->m_addr;
Expand Down Expand Up @@ -901,37 +900,111 @@ inst_info_s *cpu_decoder_c::convert_pinuop_to_t_uop(void *trace_info,
if (pi->m_opcode == XED_CATEGORY_AMX_TILE) {
// handle AMX tile instructions
bool is_amx_mem = (pi->m_has_st) || (pi->m_num_ld > 0);
bool is_amx_config = false; // TODO: add way to read this and confirm it
// bool is_amx_config = (pi->m_num_ld == 1) && (pi->m_tile_info.palette != 0); // questionable...
dyn_uop_counter = 1;

if (is_amx_mem) {
if (is_amx_config) {
// load config data regarding tiles
int rep_counter = 1;
int rep_dir = 0;
bool tileload_type = false; // false means store, true means load

if (pi->m_has_st) {
trace_uop[0]->m_mem_type = MEM_ST;
DEBUG_CORE(
core_id,
"AMX_TILE_MEM core_id:%d thread_id:%d pc:0x%llx opcode:%d"
"mem_write_size:%d dyn_uop_counter:%d \n",
core_id, sim_thread_id, (Addr)(pi->m_instruction_addr),
static_cast<int>(pi->m_opcode), pi->m_mem_write_size,
dyn_uop_counter
);
// TODO: create stat event for stores (always 16 stores of 64 bytes)
} else {
int tileload_type = true;
trace_uop[0]->m_mem_type = MEM_LD;
trace_uop[0]->m_mem_size = pi->m_mem_read_size;
DEBUG_CORE(
core_id,
"AMX_TILE_MEM core_id:%d thread_id:%d pc:0x%llx opcode:%d"
"mem_read_size:%d dyn_uop_counter:%d \n",
core_id, sim_thread_id, (Addr)(pi->m_instruction_addr),
static_cast<int>(pi->m_opcode), pi->m_mem_read_size,
dyn_uop_counter
);
// TODO: create stat event for tile loads (always 16 stores of 64 bytes)
ASSERT(pi->m_num_ld > 0 && "invalid number of loads");
}
// generate 1 load uop for each of the 16 rows for a tile
// TODO: test this when servers are working properly
int num_tile_uops = pi->m_num_ld; // 16

key_addr = (pi->m_instruction_addr << 3);
info = htable->hash_table_access_create(key_addr, &new_entry);
ASSERT(!new_entry);
info->m_trace_info.m_bom = true;
if (tileload_type) {
info->m_table_info->m_mem_type = MEM_LD;
} else {
int rep_counter = 1;
int rep_dir = 0;
int tileload_type = -1;
info->m_table_info->m_mem_type = MEM_ST;
}

if (pi->m_has_st) {
trace_uop[0]->m_mem_type = MEM_ST;
for (jj = dyn_uop_counter; jj < num_tile_uops; jj++) {
if (tileload_type) {
trace_uop[jj]->m_mem_type = MEM_LD;
} else {
trace_uop[0]->m_mem_type = MEM_LD;
trace_uop[0]->m_mem_size = pi->m_mem_read_size;
DEBUG_CORE(
core_id,
"AMX_TILE_MEM core_id:%d thread_id:%d pc:0x%llx opcode:%d"
"mem_read_size:%d dyn_uop_counter:%d \n",
core_id, sim_thread_id, (Addr)(pi->m_instruction_addr),
static_cast<int>(pi->m_opcode), pi->m_mem_read_size,
dyn_uop_counter
);
ASSERT(pi->m_num_ld > 0 && "invalid number of loads");
trace_uop[jj]->m_mem_type = MEM_ST;
}

if (jj == 0) {
info->m_trace_info.m_bom = true;
}
int stride = pi->m_ld_vaddr2;
int rep_offset = jj * stride;

trace_uop[jj - 1]->m_npc = pi->m_instruction_addr;

key_addr = (pi->m_instruction_addr << 5) + jj;
info = htable->hash_table_access_create(key_addr, &new_entry);

if (tileload_type) {
info->m_table_info->m_mem_type = MEM_LD;
} else {
info->m_table_info->m_mem_type = MEM_ST;
}

if (!(jj == 0 && ii == 0)) {
info->m_trace_info.m_bom = false;
}
info->m_trace_info.m_eom = false;

DEBUG_CORE(
core_id,
"AMX_TILE rep_offset:%d mem_read_size: %d jj: %d\n",
rep_offset, pi->m_mem_read_size, jj
);

if (tileload_type) {
trace_uop[dyn_uop_counter]->m_mem_size = pi->m_mem_read_size;
} else {
trace_uop[dyn_uop_counter]->m_mem_size = pi->m_mem_write_size;
}

convert_dyn_uop(info, pi, trace_uop[dyn_uop_counter], rep_offset, core_id);

trace_uop[dyn_uop_counter]->m_info = info;
trace_uop[dyn_uop_counter]->m_eom = 0;
trace_uop[dyn_uop_counter]->m_addr = pi->m_instruction_addr;

DEBUG_CORE(
core_id,
"AMX_TILE_MEM core_id:%d thread_id:%d pc:0x%llx opcode:%d mem_read_size:%d dyn_uop_counter:%d ii:%d\n",
core_id, sim_thread_id, (Addr)(pi->m_instruction_addr),
static_cast<int>(pi->m_opcode),
pi->m_mem_read_size, dyn_uop_counter, jj
);
dyn_uop_counter++;
}
} // is_amx_mem

// TODO: generate multiple uops for different memory addresses
// (1 uop per row loading num_bytes in the row?)
} // XED_CATEGORY_AMX_TILE

ASSERT(dyn_uop_counter);
Expand Down Expand Up @@ -1256,7 +1329,7 @@ bool cpu_decoder_c::get_uops_from_traces(int core_id, uop_c *uop,

ASSERTM(
temp_num_req > 0,
"pc:%llx vaddr:%llx opcode:%d size:%d max:%d num:%d type:%d num:%d\n",
"pc:%llx vaddr:%llx opcode:%d mem_size:%d max:%d num:%d mem_type:%d num:%d\n",
uop->m_pc, uop->m_vaddr, uop->m_opcode, uop->m_mem_size,
(int)*KNOB(KNOB_MAX_TRANSACTION_SIZE), temp_num_req, uop->m_mem_type,
trace_uop->m_info->m_trace_info.m_num_uop);
Expand Down
116 changes: 25 additions & 91 deletions tools/x86_trace_generator/trace_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ CONTROL_MANAGER control;
// AMX Handling
////////////////////////////////////////////////////////////////////////////////////////////////////////

VOID AMXLoad(REG reg, ADDRINT *addr, UINT32 dst, THREADID tid) {
VOID AMXLoad(ADDRINT *addr, UINT32 stride, UINT32 mem_read_size, THREADID tid) {
// check thread is not a dummy and is being instrumented
tid = threadMap[tid];
THREAD_ENABLE_CHECK(tid);
Expand All @@ -212,10 +212,11 @@ VOID AMXLoad(REG reg, ADDRINT *addr, UINT32 dst, THREADID tid) {
return;
}
tr_info->vaddr1 = *addr;
tr_info->mem_read_size = 1024; // TODO: figure out how to get real size from tileconfig
tr_info->mem_read_size = mem_read_size;
tr_info->vaddr2 = static_cast<ADDRINT>(stride);
}

VOID AMXStore(REG reg, ADDRINT *addr, UINT32 src, THREADID tid) {
VOID AMXStore(ADDRINT *addr, UINT32 stride, UINT32 mem_st_size, THREADID tid) {
// check thread is not a dummy and is being instrumented
tid = threadMap[tid];
THREAD_ENABLE_CHECK(tid);
Expand All @@ -224,8 +225,9 @@ VOID AMXStore(REG reg, ADDRINT *addr, UINT32 src, THREADID tid) {
if (tr_info == nullptr || !PIN_IsAmxActive(tid)){
return;
}
tr_info->st_vaddr= *addr;
tr_info->mem_write_size = 1024;
tr_info->st_vaddr = *addr;
tr_info->vaddr2 = static_cast<ADDRINT>(stride);
tr_info->mem_write_size = mem_st_size;
}

VOID AMXZero(UINT32 dst, THREADID tid) {
Expand All @@ -243,75 +245,6 @@ VOID AMXGEMM(UINT32 dst, UINT32 a, UINT32 b, THREADID tid) {
}
}

/*
layout:
Bytes | field | description
0 palette selects the supported configuration of the tiles that will be used
1 start_row used for storing the restart values for interrupted operations
2-15 reserved, must be 0
16-17 tile0.colsb Tile 0 bytes per row
18-19 tile1.colsb Tile 1 bytes per row
20-21 tile2.colsb Tile 2 bytes per row
22-23 tile3.colsb Tile 3 bytes per row
24-25 tile4.colsb Tile 4 bytes per row
26-27 tile5.colsb Tile 5 bytes per row
28-29 tile6.colsb Tile 6 bytes per row
30-31 tile7.colsb Tile 7 bytes per row
32-47 reserved, must be 0
48 tile0.rows Tile 0 rows
49 tile1.rows Tile 1 rows
50 tile2.rows Tile 2 rows
51 tile3.rows Tile 3 rows
52 tile4.rows Tile 4 rows
53 tile5.rows Tile 5 rows
54 tile6.rows Tile 6 rows
55 tile7.rows Tile 7 rows
56-63 reserved, must be 0
*/
// Default constructor: resets the tile configuration to all zeros
// (palette, start row, the three reserved buffers, and the per-tile
// bytes-per-row and row-count fields).
tile_info_t::Tile_info(void) {
  palette = 0;
  start_row = 0;

  // reserved regions: first 14 entries of buf0 and buf1, first 8 of buf2
  for (int idx = 0; idx < 14; ++idx) {
    buf0[idx] = 0;
  }
  for (int idx = 0; idx < 14; ++idx) {
    buf1[idx] = 0;
  }
  for (int idx = 0; idx < 8; ++idx) {
    buf2[idx] = 0;
  }

  // bytes per row, tiles 0..7
  tile0_colsb = 0;
  tile1_colsb = 0;
  tile2_colsb = 0;
  tile3_colsb = 0;
  tile4_colsb = 0;
  tile5_colsb = 0;
  tile6_colsb = 0;
  tile7_colsb = 0;

  // row counts, tiles 0..7
  tile0_rows = 0;
  tile1_rows = 0;
  tile2_rows = 0;
  tile3_rows = 0;
  tile4_rows = 0;
  tile5_rows = 0;
  tile6_rows = 0;
  tile7_rows = 0;
}

tile_info_t t_info;
/**
 * Analysis routine for a tile-configuration load.
 *
 * Copies 64 bytes from `addr` into the global `t_info`, stores that snapshot
 * in the thread's instruction info, and records the access as a 64-byte load
 * (vaddr1 is set to the value read through `addr`).
 * Returns without recording anything when the thread has no trace info entry
 * or AMX is not active for it.
 *
 * @param addr pointer supplied by the instrumentation call for the memory operand
 * @param tid  thread id used to index trace_info_array
 */
VOID AMXConfig(ADDRINT *addr, THREADID tid) {
  // TODO: figure out how to get dynamic info from this (rows, row size, etc)
  Trace_info *const trace = trace_info_array[tid];
  const bool can_record = (trace != nullptr) && PIN_IsAmxActive(tid);
  if (!can_record) {
    return;
  }

  // snapshot the configuration block and attach it to the instruction info
  PIN_SafeCopy(&t_info, addr, 64);
  trace->inst_info.tile_info = t_info;

  // load info
  trace->vaddr1 = *addr;
  trace->mem_read_size = 64;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////
// control handler for pinpoint (simpoint)
////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -1024,11 +957,12 @@ void instrument(INS ins)
// ----------------------------------------
if (INS_Category(ins) == XED_CATEGORY_AMX_TILE) {
if (INS_Mnemonic(ins) == "TILELOADD") {
info->num_ld = 64; // TODO: figure out how to uncap this -- it needs to be the size of the config region (at most 1024), but it capped at 64
// current solution: break into multiple uops of size 64 (how to set smaller load sizes? tileconfig?)
info->num_ld = 16; // just assume 16 and break into 1024/16 load uops (one load per row)

REG r = INS_OperandReg(ins, 0);
UINT32 dst = r - REG_TMM0;
if (!REG_is_tmm(r)){
cout << "opd 1 is not a tile register" << endl;
}
#ifdef VERBOSE
REG base_reg = INS_OperandMemoryBaseReg(ins, 1);
REG index_reg = INS_OperandMemoryIndexReg(ins, 1);
Expand All @@ -1037,9 +971,9 @@ void instrument(INS ins)
INS_InsertCall(
ins,
IPOINT_BEFORE, AFUNPTR(AMXLoad),
IARG_UINT32, REG(INS_OperandReg(ins, 0)),
IARG_MEMORYOP_PTR, 0,
IARG_UINT32, dst,
IARG_UINT32, 64, // assume max size
IARG_UINT32, 64,
IARG_THREAD_ID,
IARG_END
);
Expand Down Expand Up @@ -1091,11 +1025,11 @@ void instrument(INS ins)
);
} else if (INS_Mnemonic(ins) == "TILESTORED") {
info->has_st = 1;
// info->num_ld = 16;
REG r = INS_OperandReg(ins, 1);
if (!REG_is_tmm(r)){
cout << "opd 1 is not a tile register" << endl;
}
UINT32 src = r - REG_TMM0;
#ifdef VERBOSE
REG base_reg = INS_OperandMemoryBaseReg(ins, 0);
REG index_reg = INS_OperandMemoryIndexReg(ins, 0);
Expand All @@ -1104,9 +1038,9 @@ void instrument(INS ins)
INS_InsertCall(
ins,
IPOINT_BEFORE, AFUNPTR(AMXStore),
IARG_UINT32, REG(INS_OperandReg(ins, 1)),
IARG_MEMORYOP_EA, 0,
IARG_UINT32, src,
IARG_UINT32, 64, // assuming max size
IARG_UINT32, 64,
IARG_THREAD_ID,
IARG_END
);
Expand All @@ -1117,19 +1051,19 @@ void instrument(INS ins)
cout << "ldtilecfg" /*[" << REG_StringShort(base_reg) << "+" << REG_StringShort(index_reg) << "]"*/ << endl;
#endif
// send memory address to copy config data from
info->num_ld = 1;
INS_InsertCall(
ins,
IPOINT_BEFORE, AFUNPTR(AMXConfig),
IARG_MEMORYOP_PTR, 0,
IARG_THREAD_ID,
IARG_END
);
// info->num_ld = 1;
// INS_InsertCall(
// ins,
// IPOINT_BEFORE, AFUNPTR(AMXConfig),
// IARG_MEMORYOP_PTR, 0,
// IARG_THREAD_ID,
// IARG_END
// );
} else if (INS_Mnemonic(ins) == "TILERELEASE") {
#ifdef VERBOSE
cout << "tilerelease" << endl;
#endif
memset((void *)&t_info, 0, sizeof(tile_info_t));
// memset((void *)&t_info, 0, sizeof(tile_info_t));
} else {
cerr << "Unsupported AMX instruction: " << INS_Mnemonic(ins) << endl;
exit(-1);
Expand Down
Loading

0 comments on commit d680a9e

Please sign in to comment.