ChatGPT Understands Assembly
In this experiment we look at the performance of ChatGPT in reading GDB disassembly listing and converting it to corresponding C code. The results are impressive.
The most fascinating thing about language transformer models like OpenAI GPT-4 is how well they can generalize across knowledge domains.
In this chapter we are going to look at the task of transforming binary machine code (disassembly) into C code and then back into machine code and running tests on it to verify that the result is correct.
We are going to be using the Swedish Embedded Control Systems Toolbox as our library of choice where we will:
- Compile the test applications
- Disassemble specific functions in the compiled binary
- Pass the disassembly to ChatGPT and ask it to convert it back into C code
- Compile the generated C code
- Run tests
Hopefully, ChatGPT can transform binary code back into C code. We will see how this works out.
Contextual Landscape
Machine learning has shown great ability in the area of extracting the underlying meaning from text. GPT based models have been able to describe, debug and create code from a text description.
What if we could take this a step further and use machine learning to accomplish the transformation of binary code back into C code? What if we could make a generic language model like ChatGPT understand the higher level structure of a program solely from its assembly code? Can it do this?
Having a neural network that can accomplish this transformation can provide a significant advantage in the context of machine language translations between any programming language.
If a GPT-based transformer can reverse engineer binaries into C code with about 95% accuracy, then what is stopping us from building GPT based code optimizers that would transform unoptimized programs into highly optimized ones?
Generating C Code
Let’s see how GPT does at understanding assembly.
Getting Control Toolbox
git clone https://github.com/swedishembedded/control
cd control
mkdir build && (cd build && cmake .. && make)
Matrix Multiplication
The first experiment I did was to try to reproduce a matrix multiplication function.
I compiled the matrix multiplication test and then disassembled the multiplication function.
gdb -batch -ex 'file build/tests/linalg/linalg' -ex 'disassemble mul'
This gives us x86-64 assembly.
We can now construct our prompt:
Rewrite this disassembly listing into C function:
Dump of assembler code for function alpha:
0x000000000002920e <+0>: endbr64
0x0000000000029212 <+4>: push %rbp
0x0000000000029213 <+5>: mov %rsp,%rbp
0x0000000000029216 <+8>: mov %rdi,-0x18(%rbp)
0x000000000002921a <+12>: mov %rsi,-0x20(%rbp)
0x000000000002921e <+16>: mov %rdx,-0x28(%rbp)
0x0000000000029222 <+20>: mov %ecx,%eax
0x0000000000029224 <+22>: mov %r8d,%esi
0x0000000000029227 <+25>: mov %r9d,%ecx
0x000000000002922a <+28>: mov 0x10(%rbp),%edx
0x000000000002922d <+31>: mov %ax,-0x2c(%rbp)
0x0000000000029231 <+35>: mov %esi,%eax
0x0000000000029233 <+37>: mov %ax,-0x30(%rbp)
0x0000000000029237 <+41>: mov %ecx,%eax
0x0000000000029239 <+43>: mov %ax,-0x34(%rbp)
0x000000000002923d <+47>: mov %edx,%eax
0x000000000002923f <+49>: mov %ax,-0x38(%rbp)
0x0000000000029243 <+53>: movzwl -0x30(%rbp),%eax
0x0000000000029247 <+57>: cmp -0x34(%rbp),%ax
0x000000000002924b <+61>: je 0x29257 <mul+73>
0x000000000002924d <+63>: mov $0xffffffea,%eax
0x0000000000029252 <+68>: jmp 0x29392 <mul+388>
0x0000000000029257 <+73>: movw $0x0,-0x6(%rbp)
0x000000000002925d <+79>: jmp 0x2937f <mul+369>
0x0000000000029262 <+84>: movw $0x0,-0x4(%rbp)
0x0000000000029268 <+90>: jmp 0x29366 <mul+344>
0x000000000002926d <+95>: movzwl -0x6(%rbp),%edx
0x0000000000029271 <+99>: movzwl -0x38(%rbp),%eax
0x0000000000029275 <+103>: imul %eax,%edx
0x0000000000029278 <+106>: movzwl -0x4(%rbp),%eax
0x000000000002927c <+110>: add %edx,%eax
0x000000000002927e <+112>: cltq
0x0000000000029280 <+114>: lea 0x0(,%rax,4),%rdx
0x0000000000029288 <+122>: mov -0x18(%rbp),%rax
0x000000000002928c <+126>: add %rdx,%rax
0x000000000002928f <+129>: pxor %xmm0,%xmm0
0x0000000000029293 <+133>: movss %xmm0,(%rax)
0x0000000000029297 <+137>: movw $0x0,-0x2(%rbp)
0x000000000002929d <+143>: jmp 0x2934d <mul+319>
0x00000000000292a2 <+148>: movzwl -0x6(%rbp),%edx
0x00000000000292a6 <+152>: movzwl -0x38(%rbp),%eax
0x00000000000292aa <+156>: imul %eax,%edx
0x00000000000292ad <+159>: movzwl -0x4(%rbp),%eax
0x00000000000292b1 <+163>: add %edx,%eax
0x00000000000292b3 <+165>: cltq
0x00000000000292b5 <+167>: lea 0x0(,%rax,4),%rdx
0x00000000000292bd <+175>: mov -0x18(%rbp),%rax
0x00000000000292c1 <+179>: add %rdx,%rax
0x00000000000292c4 <+182>: movss (%rax),%xmm1
0x00000000000292c8 <+186>: movzwl -0x6(%rbp),%edx
0x00000000000292cc <+190>: movzwl -0x30(%rbp),%eax
0x00000000000292d0 <+194>: imul %eax,%edx
0x00000000000292d3 <+197>: movzwl -0x2(%rbp),%eax
0x00000000000292d7 <+201>: add %edx,%eax
0x00000000000292d9 <+203>: cltq
0x00000000000292db <+205>: lea 0x0(,%rax,4),%rdx
0x00000000000292e3 <+213>: mov -0x20(%rbp),%rax
0x00000000000292e7 <+217>: add %rdx,%rax
0x00000000000292ea <+220>: movss (%rax),%xmm2
0x00000000000292ee <+224>: movzwl -0x2(%rbp),%edx
0x00000000000292f2 <+228>: movzwl -0x38(%rbp),%eax
0x00000000000292f6 <+232>: imul %eax,%edx
0x00000000000292f9 <+235>: movzwl -0x4(%rbp),%eax
0x00000000000292fd <+239>: add %edx,%eax
0x00000000000292ff <+241>: cltq
0x0000000000029301 <+243>: lea 0x0(,%rax,4),%rdx
0x0000000000029309 <+251>: mov -0x28(%rbp),%rax
0x000000000002930d <+255>: add %rdx,%rax
0x0000000000029310 <+258>: movss (%rax),%xmm0
0x0000000000029314 <+262>: mulss %xmm2,%xmm0
0x0000000000029318 <+266>: movzwl -0x6(%rbp),%edx
0x000000000002931c <+270>: movzwl -0x38(%rbp),%eax
0x0000000000029320 <+274>: imul %eax,%edx
0x0000000000029323 <+277>: movzwl -0x4(%rbp),%eax
0x0000000000029327 <+281>: add %edx,%eax
0x0000000000029329 <+283>: cltq
0x000000000002932b <+285>: lea 0x0(,%rax,4),%rdx
0x0000000000029333 <+293>: mov -0x18(%rbp),%rax
0x0000000000029337 <+297>: add %rdx,%rax
0x000000000002933a <+300>: addss %xmm1,%xmm0
0x000000000002933e <+304>: movss %xmm0,(%rax)
0x0000000000029342 <+308>: movzwl -0x2(%rbp),%eax
0x0000000000029346 <+312>: add $0x1,%eax
0x0000000000029349 <+315>: mov %ax,-0x2(%rbp)
0x000000000002934d <+319>: movzwl -0x2(%rbp),%eax
0x0000000000029351 <+323>: cmp -0x30(%rbp),%ax
0x0000000000029355 <+327>: jb 0x292a2 <mul+148>
0x000000000002935b <+333>: movzwl -0x4(%rbp),%eax
0x000000000002935f <+337>: add $0x1,%eax
0x0000000000029362 <+340>: mov %ax,-0x4(%rbp)
0x0000000000029366 <+344>: movzwl -0x4(%rbp),%eax
0x000000000002936a <+348>: cmp -0x38(%rbp),%ax
0x000000000002936e <+352>: jb 0x2926d <mul+95>
0x0000000000029374 <+358>: movzwl -0x6(%rbp),%eax
0x0000000000029378 <+362>: add $0x1,%eax
0x000000000002937b <+365>: mov %ax,-0x6(%rbp)
0x000000000002937f <+369>: movzwl -0x6(%rbp),%eax
0x0000000000029383 <+373>: cmp -0x2c(%rbp),%ax
0x0000000000029387 <+377>: jb 0x29262 <mul+84>
0x000000000002938d <+383>: mov $0x0,%eax
0x0000000000029392 <+388>: pop %rbp
0x0000000000029393 <+389>: ret
End of assembler dump.
Note that I’m not giving it the name "mul" and instead give it the name "alpha". I don’t want it to get any hints at all as to what this function does.
The result is, well, extremely good.
int alpha(float *result, float *matrixA, float *matrixB, unsigned short m, unsigned short n, unsigned short p) {
unsigned short i, j, k;
if (n != p) {
return -22;
}
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++) {
result[i * n + j] = 0;
for (k = 0; k < p; k++) {
result[i * n + j] += matrixA[i * p + k] * matrixB[k * n + j];
}
}
}
return 0;
}
What’s extremely amazing about this is that it has been able to identify almost everything about this code. It even named the matrices.
The explanation it gave is also quite impressive:
Here is the C function based on the provided disassembly listing:
...
This C function, called alpha, takes in three float pointers result, matrixA,
and matrixB, as well as three unsigned short integers m, n, and p. The function
multiplies the matrices matrixA and matrixB, storing the result in the result
matrix. If the inner dimensions of the matrices do not match (i.e., n is not
equal to p), the function returns -22. Otherwise, it returns 0 upon successful
completion.
Of course, none of this matters unless the code actually works. I had to make a few minor modifications. The original code takes both dimensions of each matrix; in fact, here is the original code:
int mul(float *C, const float *const A, const float *const B, uint16_t row_a, uint16_t column_a,
uint16_t row_b, uint16_t column_b)
{
// Data matrix
const float *data_a;
const float *data_b;
if (column_a != row_b) {
return -EINVAL;
}
for (uint16_t i = 0; i < row_a; i++) {
// Then we go through every column of b
for (uint16_t j = 0; j < column_b; j++) {
data_a = &A[i * column_a];
data_b = &B[j];
*C = 0; // Reset
// And we multiply rows from a with columns of b
for (uint16_t k = 0; k < column_a; k++) {
*C += *data_a * *data_b;
data_a++;
data_b += column_b;
}
C++;
}
}
return 0;
}
The only thing that ChatGPT got wrong was the error check (column_a != row_b). The check itself is
really not needed if we eliminate the row_b
parameter - which is what ChatGPT did. But then it
seems to have gotten confused about the presence of the check and instead compared (column_b !=
column_a), which was the wrong comparison.
Another minor detail was that it discarded const
from the C code. This is
understandable since disassembly has no concept of const. It is lost during
compilation.
I made the adjustments needed to make the code compile again and also changed the comparison so that it makes sense.
int mul(float *result, const float *const matrixA, const float *const matrixB,
unsigned short row_a, unsigned short column_a, unsigned short q, unsigned short column_b) {
(void)q;
unsigned short i, j, k;
unsigned short n = column_b;
unsigned short m = row_a;
unsigned short p = column_a;
if (q != p) {
return -22;
}
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++) {
result[i * n + j] = 0;
for (k = 0; k < p; k++) {
result[i * n + j] += matrixA[i * p + k] * matrixB[k * n + j];
}
}
}
return 0;
}
The resulting code passed all tests (build/tests/linalg/linalg)
[==========] 34 tests from 1 test suite ran. (66 ms total)
[ PASSED ] 34 tests.
At least 9 of these tests use the mul
function internally in one way or another. Very good.
Will it understand forward/backward substitution with pivoting?
Let’s try something more complex. There is a function called "solve" in
src/linalg/inv.c
. It is used for solving a system of linear equations using
forward/backward substitution specifically when doing a matrix inverse
operation. Here is the original code:
static int solve(const float *const LU, float *x, float *b, uint8_t *P, uint16_t row)
{
// forward substitution with pivoting
for (int i = 0; i < row; ++i) {
x[i] = b[P[i]];
for (int j = 0; j < i; ++j) {
x[i] = x[i] - LU[row * P[i] + j] * x[j];
}
}
// backward substitution with pivoting
for (int i = row - 1; i >= 0; --i) {
for (int j = i + 1; j < row; ++j) {
x[i] = x[i] - LU[row * P[i] + j] * x[j];
}
// Just in case if we divide with zero
if (fabsf(LU[row * P[i] + i]) > FLT_EPSILON) {
x[i] = x[i] / LU[row * P[i] + i];
} else {
return -ENOTSUP;
}
}
return 0;
}
I’ve called the function "unknown" and passed the disassembly to ChatGPT-4. This is what it came up with:
void unknown(int *output, int *input1, char *input2, int *input3, unsigned short r8d) {
int i, j, k, l;
for (i = 0; i < r8d; i++) {
for (j = 0; j < i; j++) {
input1[j] -= (input3[i * j] * output[j]);
}
for (k = 0; k < i; k++) {
for (l = 0; l < k; l++) {
input1[l] -= (input3[k * i] * output[l]);
}
input1[k] += input3[k * i] * input2[i];
}
output[i] = input1[i];
}
}
In this example ChatGPT completely messed up. The listing was quite long so I wonder if it went past the maximum number of tokens (around 8000?). The disassembly itself ended up being 8875 characters.
Let’s try matrix transpose
Let’s try something that falls within the token limits.
Matrix transpose function looks like this:
void tran(float *At, const float *const A, uint16_t row, uint16_t column)
{
float B[row * column];
const float *ptr_A = A;
for (uint16_t i = 0; i < row; i++) {
float *transpose = &B[i];
for (uint16_t j = 0; j < column; j++) {
*transpose = *ptr_A;
ptr_A++;
transpose += row;
}
}
// Copy!
memcpy(At, B, row * column * sizeof(float));
}
This ended up being 5636 characters.
Dump of assembler code for function tran:
0x000000000002dba8 <+0>: endbr64
0x000000000002dbac <+4>: push %rbp
0x000000000002dbad <+5>: mov %rsp,%rbp
0x000000000002dbb0 <+8>: push %rbx
0x000000000002dbb1 <+9>: sub $0x58,%rsp
0x000000000002dbb5 <+13>: mov %rdi,-0x48(%rbp)
0x000000000002dbb9 <+17>: mov %rsi,-0x50(%rbp)
0x000000000002dbbd <+21>: mov %ecx,%eax
0x000000000002dbbf <+23>: mov %dx,-0x54(%rbp)
0x000000000002dbc3 <+27>: mov %ax,-0x58(%rbp)
0x000000000002dbc7 <+31>: mov %fs:0x28,%rax
0x000000000002dbd0 <+40>: mov %rax,-0x18(%rbp)
0x000000000002dbd4 <+44>: xor %eax,%eax
0x000000000002dbd6 <+46>: mov %rsp,%rax
0x000000000002dbd9 <+49>: mov %rax,%rbx
0x000000000002dbdc <+52>: movzwl -0x54(%rbp),%edx
0x000000000002dbe0 <+56>: movzwl -0x58(%rbp),%eax
0x000000000002dbe4 <+60>: imul %edx,%eax
0x000000000002dbe7 <+63>: movslq %eax,%rdx
0x000000000002dbea <+66>: sub $0x1,%rdx
0x000000000002dbee <+70>: mov %rdx,-0x28(%rbp)
0x000000000002dbf2 <+74>: movslq %eax,%rdx
0x000000000002dbf5 <+77>: mov %rdx,%r10
0x000000000002dbf8 <+80>: mov $0x0,%r11d
0x000000000002dbfe <+86>: movslq %eax,%rdx
0x000000000002dc01 <+89>: mov %rdx,%r8
0x000000000002dc04 <+92>: mov $0x0,%r9d
0x000000000002dc0a <+98>: cltq
0x000000000002dc0c <+100>: lea 0x0(,%rax,4),%rdx
0x000000000002dc14 <+108>: mov $0x10,%eax
0x000000000002dc19 <+113>: sub $0x1,%rax
0x000000000002dc1d <+117>: add %rdx,%rax
0x000000000002dc20 <+120>: mov $0x10,%esi
0x000000000002dc25 <+125>: mov $0x0,%edx
0x000000000002dc2a <+130>: div %rsi
0x000000000002dc2d <+133>: imul $0x10,%rax,%rax
0x000000000002dc31 <+137>: mov %rax,%rcx
0x000000000002dc34 <+140>: and $0xfffffffffffff000,%rcx
0x000000000002dc3b <+147>: mov %rsp,%rdx
0x000000000002dc3e <+150>: sub %rcx,%rdx
0x000000000002dc41 <+153>: cmp %rdx,%rsp
0x000000000002dc44 <+156>: je 0x2dc58 <tran+176>
0x000000000002dc46 <+158>: sub $0x1000,%rsp
0x000000000002dc4d <+165>: orq $0x0,0xff8(%rsp)
0x000000000002dc56 <+174>: jmp 0x2dc41 <tran+153>
0x000000000002dc58 <+176>: mov %rax,%rdx
0x000000000002dc5b <+179>: and $0xfff,%edx
0x000000000002dc61 <+185>: sub %rdx,%rsp
0x000000000002dc64 <+188>: mov %rax,%rdx
0x000000000002dc67 <+191>: and $0xfff,%edx
0x000000000002dc6d <+197>: test %rdx,%rdx
0x000000000002dc70 <+200>: je 0x2dc82 <tran+218>
0x000000000002dc72 <+202>: and $0xfff,%eax
0x000000000002dc77 <+207>: sub $0x8,%rax
0x000000000002dc7b <+211>: add %rsp,%rax
0x000000000002dc7e <+214>: orq $0x0,(%rax)
0x000000000002dc82 <+218>: mov %rsp,%rax
0x000000000002dc85 <+221>: add $0x3,%rax
0x000000000002dc89 <+225>: shr $0x2,%rax
0x000000000002dc8d <+229>: shl $0x2,%rax
0x000000000002dc91 <+233>: mov %rax,-0x20(%rbp)
0x000000000002dc95 <+237>: mov -0x50(%rbp),%rax
0x000000000002dc99 <+241>: mov %rax,-0x38(%rbp)
0x000000000002dc9d <+245>: movw $0x0,-0x3c(%rbp)
0x000000000002dca3 <+251>: jmp 0x2dd07 <tran+351>
0x000000000002dca5 <+253>: movzwl -0x3c(%rbp),%eax
0x000000000002dca9 <+257>: cltq
0x000000000002dcab <+259>: lea 0x0(,%rax,4),%rdx
0x000000000002dcb3 <+267>: mov -0x20(%rbp),%rax
0x000000000002dcb7 <+271>: add %rdx,%rax
0x000000000002dcba <+274>: mov %rax,-0x30(%rbp)
0x000000000002dcbe <+278>: movw $0x0,-0x3a(%rbp)
0x000000000002dcc4 <+284>: jmp 0x2dcf2 <tran+330>
0x000000000002dcc6 <+286>: mov -0x38(%rbp),%rax
0x000000000002dcca <+290>: movss (%rax),%xmm0
0x000000000002dcce <+294>: mov -0x30(%rbp),%rax
0x000000000002dcd2 <+298>: movss %xmm0,(%rax)
0x000000000002dcd6 <+302>: addq $0x4,-0x38(%rbp)
0x000000000002dcdb <+307>: movzwl -0x54(%rbp),%eax
0x000000000002dcdf <+311>: shl $0x2,%rax
0x000000000002dce3 <+315>: add %rax,-0x30(%rbp)
0x000000000002dce7 <+319>: movzwl -0x3a(%rbp),%eax
0x000000000002dceb <+323>: add $0x1,%eax
0x000000000002dcee <+326>: mov %ax,-0x3a(%rbp)
0x000000000002dcf2 <+330>: movzwl -0x3a(%rbp),%eax
0x000000000002dcf6 <+334>: cmp -0x58(%rbp),%ax
0x000000000002dcfa <+338>: jb 0x2dcc6 <tran+286>
0x000000000002dcfc <+340>: movzwl -0x3c(%rbp),%eax
0x000000000002dd00 <+344>: add $0x1,%eax
0x000000000002dd03 <+347>: mov %ax,-0x3c(%rbp)
0x000000000002dd07 <+351>: movzwl -0x3c(%rbp),%eax
0x000000000002dd0b <+355>: cmp -0x54(%rbp),%ax
0x000000000002dd0f <+359>: jb 0x2dca5 <tran+253>
0x000000000002dd11 <+361>: movzwl -0x54(%rbp),%edx
0x000000000002dd15 <+365>: movzwl -0x58(%rbp),%eax
0x000000000002dd19 <+369>: imul %edx,%eax
0x000000000002dd1c <+372>: cltq
0x000000000002dd1e <+374>: lea 0x0(,%rax,4),%rdx
0x000000000002dd26 <+382>: mov -0x20(%rbp),%rcx
0x000000000002dd2a <+386>: mov -0x48(%rbp),%rax
0x000000000002dd2e <+390>: mov %rcx,%rsi
0x000000000002dd31 <+393>: mov %rax,%rdi
0x000000000002dd34 <+396>: call 0xd100 <memcpy@plt>
0x000000000002dd39 <+401>: mov %rbx,%rsp
0x000000000002dd3c <+404>: nop
0x000000000002dd3d <+405>: mov -0x18(%rbp),%rax
0x000000000002dd41 <+409>: sub %fs:0x28,%rax
0x000000000002dd4a <+418>: je 0x2dd51 <tran+425>
0x000000000002dd4c <+420>: call 0xd320 <__stack_chk_fail@plt>
0x000000000002dd51 <+425>: mov -0x8(%rbp),%rbx
0x000000000002dd55 <+429>: leave
0x000000000002dd56 <+430>: ret
End of assembler dump.
The result of ChatGPT transforming this into C code was this:
The function 'foo' takes three arguments: a destination pointer, a source pointer to a matrix of
floats, and the number of rows and columns of the matrix. The function transposes the matrix and
stores the result in the destination pointer.
#include <stdint.h>
#include <string.h>
void foo(void *dest, float *src, uint16_t rows, uint16_t cols) {
int32_t i, j;
int32_t product = rows * cols;
float *temp = (float *)(((uintptr_t)&product + 0x3) & ~(uintptr_t)0x3);
for (i = 0; i < rows; i++) {
for (j = 0; j < cols; j++) {
temp[i * cols + j] = src[j * rows + i];
}
}
memcpy(dest, temp, product * sizeof(float));
}
It made one small mistake: it swapped temp
and src
.
The order should have been: temp[j * rows + i] = src[i * cols + j]
Beyond this, the code works as expected. I even had to add one additional test that would verify this fact.
ChatGPT Compiler?
Let’s see if it can transform C code into asm. We are going to use inline asm and help it a little because I want something I can just compile and verify quickly.
Translate this C function into another C function that is written entirely using inline GCC
assembly:
<mul function C code>
The result was this:
#include <stdint.h>
int mul(float *result, const float *const matrixA, const float *const matrixB,
uint16_t row_a, uint16_t column_a, uint16_t q, uint16_t column_b) {
int ret_val;
asm volatile (
"cmp %5, %6;"
"jne .error;"
"mov %3, %%cx;"
"xor %%r8d, %%r8d;"
".row_loop:"
" mov %4, %%dx;"
" xor %%r9d, %%r9d;"
" .col_loop:"
" xor %%eax, %%eax;"
" mov %6, %%r10d;"
" xor %%r11d, %%r11d;"
" .mul_loop:"
" cvtss2sd (%1, %%r8, 4), %%xmm0;"
" cvtss2sd (%2, %%r11, 4), %%xmm1;"
" mulsd %%xmm1, %%xmm0;"
" addsd %%xmm0, %%xmm2;"
" inc %%r11d;"
" add %%r10d, %2;"
" dec %6;"
" jnz .mul_loop;"
" cvtsd2ss %%xmm2, (%0, %%r9, 4);"
" xorpd %%xmm2, %%xmm2;"
" inc %%r9d;"
" sub %%r10d, %2;"
" dec %%dx;"
" jnz .col_loop;"
" inc %%r8d;"
" add %6, %1;"
" dec %%cx;"
"jnz .row_loop;"
"xor %%eax, %%eax;"
"jmp .done;"
".error:"
" mov $-22, %%eax;"
".done:"
: "=r"(ret_val), "+r"(matrixA), "+r"(matrixB), "+r"(row_a), "+r"(column_b), "+r"(column_a), "+r"(q)
: "0"(result)
: "rax", "rcx", "rdx", "r8", "r9", "r10", "r11", "xmm0", "xmm1", "xmm2", "cc", "memory"
);
return ret_val;
}
This unfortunately did not compile. Besides matrixA and matrixB being declared as "output" variables, the following errors also came up when trying to compile it:
src/linalg/mul.c: Assembler messages:
src/linalg/mul.c:36: Error: unsupported instruction `mov'
src/linalg/mul.c:36: Error: operand type mismatch for `add'
src/linalg/mul.c:36: Error: `(%r15d,%r9,4)' is not a valid base/index expression
src/linalg/mul.c:36: Error: operand type mismatch for `sub'
src/linalg/mul.c:36: Error: operand type mismatch for `add'
The nasty thing about this GPT style inline assembly is that to the compiler it all appears as one line making the errors impossible to locate quickly. So I decided it was not worth the time trying to debug. Feel free to try to figure out what went wrong.
Key Takeaways
The main problem with using any kind of "transformer" model is that cleaning up the result is often required and this relies heavily on your own experience.
What makes things tricky is when the model comes up with a result that is too complex to comprehend easily. In such situations, even small errors can be very time consuming to fix because you have to debug the model’s output first. The examples in this article are simple enough, and yet the small errors made by ChatGPT almost made debugging take longer than simply looking up a solution and coding it from scratch.
It helps a lot to have integration tests. They act as the primary way of validating the results of the model. You need to have all of your expectations documented as tests in order to use fully automated generation and be able to validate it.
If you can check the results and feed back the results to GPT as part of the conversation, there is a possibility it will be able to fix some problems based on your feedback - but not all.
Questions
Does it matter if additional context is added?
I have tried providing the model with a doxygen comment and the prototype of the function to be disassembled, and the results were phenomenal.
Rewrite this disassembly listing into C function:
Dump of assembler code for function mul:
[dump]
End of assembler dump.
The function must have the following prototype:
/**
* \brief C = A * B
* \param C Output Matrix [row_a*column_b]
* \param A Input matrix [row_a*column_a]
* \param B Input matrix [column_a*column_b]
* \param row_a Number of rows in A
* \param column_a Number of columns in A
* \param row_b Number of rows in B
* \param column_b Number of columns in B (and rows in C)
**/
int mul(float *C, const float *const A, const float *const B, uint16_t row_a, uint16_t column_a,
uint16_t row_b, uint16_t column_b);
This generated flawless result from first try:
#include <stdint.h>
/**
* \brief C = A * B
* \param C Output Matrix [row_a*column_b]
* \param A Input matrix [row_a*column_a]
* \param B Input matrix [column_a*column_b]
* \param row_a Number of rows in A
* \param column_a Number of columns in A
* \param row_b Number of rows in B
* \param column_b Number of columns in B (and rows in C)
**/
int mul(float *C, const float *const A, const float *const B, uint16_t row_a, uint16_t column_a,
uint16_t row_b, uint16_t column_b) {
if (column_a != row_b) {
return -22; // EINVAL
}
for (uint16_t i = 0; i < row_a; i++) {
for (uint16_t j = 0; j < column_b; j++) {
C[i * column_b + j] = 0;
for (uint16_t k = 0; k < column_a; k++) {
C[i * column_b + j] += A[i * column_a + k] * B[k * column_b + j];
}
}
}
return 0;
}
It could be that the model is able to infer a lot of the functionality. In fact, just asking it to "Implement the following function in C" and giving it the documented prototype does result in similar code:
#include <stdint.h>
/**
* \brief C = A * B
* \param C Output Matrix [row_a*column_b]
* \param A Input matrix [row_a*column_a]
* \param B Input matrix [column_a*column_b]
* \param row_a Number of rows in A
* \param column_a Number of columns in A
* \param row_b Number of rows in B
* \param column_b Number of columns in B (and rows in C)
**/
int mul(float *C, const float *const A, const float *const B, uint16_t row_a, uint16_t column_a,
uint16_t row_b, uint16_t column_b) {
// Check if matrix dimensions are compatible for multiplication
if (column_a != row_b) {
return -1; // Return error code if dimensions are not compatible
}
// Perform matrix multiplication
for (uint16_t i = 0; i < row_a; i++) {
for (uint16_t j = 0; j < column_b; j++) {
float sum = 0;
for (uint16_t k = 0; k < column_a; k++) {
sum += A[i * column_a + k] * B[k * column_b + j];
}
C[i * column_b + j] = sum;
}
}
return 0; // Return success code
}
However it is a little bit different. For example it returns -1 instead of -22. The logic is similar though. If the model is really able to infer meaning then it did a pretty good job inferring the correct code layout and values from the assembly code.
Subscribe
If you liked this post, become a subscriber at Swedish Embedded Group: https://swedishembedded.com/
Getting a premium subscription has several benefits, including:
- Access to all course materials (written + video).
- Live QnA calls each Wednesday (as long as countdown exists on the site).
If you haven’t subscribed already, you should definitely get the paid membership.