Solving Ben_Lolo's WeeperVM Level 1

23 Apr, 2025

Link : https://crackmes.one/crackme/67f9bdc38f555589f3530a85

$ file WeeperVM--Level_1
WeeperVM--Level_1: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, stripped

ELF thats pretty easy to decompile

main starts by initializing a bunch of threads with anti-debugging measures like

Look for TracerPid in /proc/self/status
Locate .text in /proc/self/mapsand calculate a checksum from proc/self/mem for it
Some sort of checksum verification from memory and /proc/self/exe

The strings are encrypted with xor

char *__fastcall xor_dec(char *encryp, int out_len, const char *key, int key_len)
{
  char *result; // rax
  __int64 i; // rcx

  result = (char *)calloc(out_len + 1, 1u);
  for ( i = 0; out_len > (int)i; ++i )
    result[i] = encryp[i] ^ key[(int)i % key_len];
  result[out_len] = 0;
  return result;
}

Just used frida to dump threadId and strings like this

const module = "crackme";
const baseAddr = Module.findBaseAddress(module);
const funcAddr = baseAddr.add(0x0e27);
var seen = new Set();
Interceptor.attach(funcAddr, {
    onEnter: function (args) {
    },
    onLeave: function (retval) {
        var callee = this.context.rsp.readPointer().sub(baseAddr).add(0x10000).toString(16);
        if (seen.has(callee)) {
            return;
        }
        seen.add(callee);
        console.log(callee, this.threadId, retval.readCString());
    }
});

for a quick

10b1a 1545 r
10b36 1545 /proc/self/maps
10b8b 1545 r-xp
10bb4 1545 %lx-%lx
10c10 1545 rb
10c2c 1545 /proc/self/mem
11155 1554 r
11171 1554 /proc/self/status
111c5 1554 TracerPid:
11007 1545 rb
11023 1545 /proc/self/exe

At the end of main is the vm interpreter loop

  while ( 1 )
  {
    vip = vip;
    ins = bytecode[vip];
    if ( !ins )
      break;
    if ( execute_vm_weep(ins, v2) )
      return 1;
    vip = vip + 1;
  }

The function is too large to paste here. VM features include

16 general purpose registers - r14 and r15 were mostly used to jump
flags register for compare with bits for equality, less/greater than
256 bytes of scratch space - init with values
Flag is read to 0x40 idx in the scratch space
The opcodes are

OPCODES = {
    0x0: "MOV",
    0x1: "MOVB",
    0x2: "ADD",
    0x3: "SUB",
    0x4: "MUL",
    0x5: "MOD",
    0x6: "CALL",
    0x7: "READ_INPUT",
    0x9: "CMP",
    0xA: "JMP",
    0xB: "AND",
    0xC: "OR",
    0xD: "XOR",
    0xE: "SHL",
    0xF: "SHR",
}

I wrote a disassembler with reversing

def disassemble(instruction):
    def byte(n): return (instruction >> (8 * n)) & 0xFF

    # XOR mutation like in the code
    modified = instruction ^ (0x1010101 * byte(2))

    opcode = (modified >> 28) & 0xF
    mem_sel = (modified >> 27) & 0x1          # 1 = special area (offset 77600), 0 = dword_13080
    mode = (modified >> 25) & 0x3             # 0 = direct, 1 = from buffer, 2 = immediate
    index = (modified >> 8) & 0xFF            # index byte (BYTE1)
    xor_byte = byte(2)
    src_byte = instruction ^ xor_byte         # used for source/indexing buffer

    op_str = OPCODES.get(opcode, f"UNKNOWN_{opcode}")

    # Create operand descriptions
    # dst = f"{'BUF' if mem_sel else 'R'}{index}"
    dst = "BUF[{}]".format(index) if mem_sel else f"R{index}"

    if mode == 0:
        src = f"R{src_byte & 0xFF}"
    elif mode == 1:
        src = f"[BUF][{src_byte & 0xFF}]:({buf[src_byte & 0xFF]}| '{chr(buf[src_byte & 0xFF]) if buf[src_byte & 0xFF] < 127 else ''}')"
    elif mode == 2:
        src = f"i{src_byte & 0xFF}"
    else:
        src = "??"

    if opcode == 0x0 and mode == 2 and mem_sel == 0:
            const_register_bank[dst] = int(src_byte & 0xFF)
            return f"{op_str} {dst}, {src} | {const_register_bank.get(dst, 'UNKNOWN')}"
    if opcode == 0x2 and mode == 2 and mem_sel == 0:
            const_register_bank[dst] += int(src_byte & 0xFF)
            return f"{op_str} {dst}, {src} | {const_register_bank.get(dst, 'UNKNOWN')}"

    # Handle special opcodes
    if opcode == 0x6:
        return f"{op_str} putchar('{chr(buf[index]) if buf[index] != 10 else '\\n'}')"
    elif opcode == 0x7:
        return f"{op_str} READ_INPUT to BUF[{index}] len={src}"
    elif opcode == 0x9:
        return f"{op_str} {dst} vs {src}"
    elif opcode == 0xA:
        return f"{op_str} IF (flags & {src}) -> jmp {dst}|{const_register_bank.get(dst, "UNKNOWN")}"
    elif opcode == 0x1:
        if mem_sel == 0:
            return f"{op_str} {dst}, BUF[{src}]"
        else:
            return f"{op_str} BUF[{dst}], {src}"
    else:
        return f"{op_str} {dst}, {src}"

const_register_bank will see when the values are of this pattern

511: 17131DEC: MOV R14, i255
512: B89D9362: ADD R14, i255
513: 17323C0E: ADD R14, i60

And then annotate the disassembly with consts.

Also read the code and started labelling

label_db = {
    779: "SUCCESS",
    796: "FAILURE",
    363: "CONST_390",
    393: "CONST_362",
    367: "CONST_796",
    391: "CONST_398",
    399: "CONST_401",
    412: '''[f"{a[160+i]:02X}^{a[176+i]:02X}={a[160+i]^a[176+i]:02X}" for i in range(16)]''',
    435: f"CONST_{255+197}",
    437: f"CONST_{255+206}",
}

buf is the 256 byte region that was in the .lotus in the binary

A total of 821 instructions and the end had a check

682: DA46069A: CMP BUF[64] vs i220|
683: 57F2FCF0: JMP IF (flags & i2) -> jmp R14|796|
684: 17B3BDB7: JMP IF (flags & i4) -> jmp R14|796|
685: 20BCFDD3: CMP BUF[65] vs i111|
686: E5404E42: JMP IF (flags & i2) -> jmp R14|796|
687: FE5A545E: JMP IF (flags & i4) -> jmp R14|796|
688: C05D1F0B: CMP BUF[66] vs i86|
689: 06A3ADA1: JMP IF (flags & i2) -> jmp R14|796|
690: 55F1FFF5: JMP IF (flags & i4) -> jmp R14|796|

BUF[64] is where our input was written to - there were some misunderstanding and I missed the mechanics of the movb instruction. So I wrote a pintool to track read/writes to the memory in interesting regions

ADDRINT region_start = 0x12F20;
ADDRINT region_end = 0x13020;

ADDRINT register_start = 0x13080;
ADDRINT register_end = 0x130C0;

ADDRINT flags_start = 0x13060;
ADDRINT flags_end = 0x13062;
VOID RecordMemRead(ADDRINT addr, UINT32 size)
{
    UINT64 value = 0;
    if ((ADDRINT)addr >= l + region_start && (ADDRINT)addr < l + region_end)
    {
        PIN_SafeCopy(&value, reinterpret_cast<VOID *>(addr), size);
        outFile << "READMEM" << size << "@" << " -> " << std::dec << addr - l - region_start << ": " << std::hex << value << std::endl;
    }
    if ((ADDRINT)addr >= l + register_start && (ADDRINT)addr < l + register_end)
    {
        PIN_SafeCopy(&value, reinterpret_cast<VOID *>(addr), size);
        outFile << "READREG" << size << "@" << " -> "  << std::dec << (addr - l - register_start) / size << ": " << std::hex << value << std::endl;
    }
    if ((ADDRINT)addr >= l + flags_start && (ADDRINT)addr < l + flags_end)
    {
        PIN_SafeCopy(&value, reinterpret_cast<VOID *>(addr), size);
        outFile << "READFLAGS" << size << "@" << " -> "  << std::dec << (addr - l - flags_start) / size << ": " << std::hex << value << std::endl;
    }
}

VOID RecordMemWrite(ADDRINT addr, UINT32 size, ADDRINT ip)
{
    if ((ADDRINT)addr >= region_start && (ADDRINT)addr < region_end)
    {
        outFile << ip << "|WRITEMEM" << size << "@" << " -> " << std::dec << addr - l - region_start << std::endl;
    }
    if ((ADDRINT)addr >= register_start && (ADDRINT)addr < register_end)
    {
        outFile << ip << "|WRITEREG" << size << "@" << " -> " << std::dec << (addr - l - register_start) / size << std::endl;
    }
    if ((ADDRINT)addr >= flags_start && (ADDRINT)addr < flags_end)
    {
        outFile << ip << "|WRITEFLAGS" << size << "@" << " -> " << std::dec << (addr - l - flags_start) / size << std::endl;
    }
}

// usage
    if (INS_IsMemoryRead(ins))
    {
        INS_InsertPredicatedCall(ins, IPOINT_BEFORE, (AFUNPTR)RecordMemRead,
                                 IARG_MEMORYREAD_EA, IARG_MEMORYREAD_SIZE, IARG_END);
    }
    if (INS_IsMemoryWrite(ins))
    {
        INS_InsertPredicatedCall(ins, IPOINT_BEFORE, (AFUNPTR)RecordMemWrite,
                                 IARG_MEMORYWRITE_EA, IARG_MEMORYWRITE_SIZE, IARG_INST_PTR, IARG_END);
    }

This helped me a lot to validate my assumptions and fix my disassembler such that I had the correct disassembly very quickly.

The next step was to build a CFG and try to make an educated guess on the structure of the program. So I did some crude basic block analysis and built the CFG with python.

import re
import logging

def load_disasm(path):
    disasm = {}
    with open(path, 'r') as f:
        for line in f:
            match = re.match(r'^([0-9]+):\s[0-9a-fA-F]+:\s(.*)', line)
            if match:
                addr = int(match.group(1))
                mnemonic = match.group(2).strip()
                disasm[addr] = mnemonic
    return disasm

def load_trace(path):
    with open(path, 'r') as f:
        trace = []
        for line in f:
            match = re.match(r'^([0-9]+): .*', line)
            if match:
                trace.append(int(match.group(1)))
    return trace

disasm = load_disasm("diss")
trace = load_trace("pin.out")

# print(trace)
# print(disasm)

def build_simple_cfg(trace, disasm):
    blocks = set(disasm.keys())
    # blacklist direct jumps
    bl = set([389, 397, 451, 520, 527, 534, 541, 548, 555, 562, 569, 585])
    edges = set() # its a trace - don't create duplicate edges
    # no jumps trace
    for i in blocks:
        if i in bl:
            continue
        if i+1 in blocks:
            edges.add((i, i+1))

    # trace with jumps
    for i in range(len(trace) - 1):
        src = trace[i]
        dst = trace[i + 1]
        if src in blocks and dst in blocks:
            edges.add((src, dst))
        else:
            logging.warning(f"Invalid edge {src} -> {dst}")
    return blocks, edges

from graphviz import Digraph

def write_simple_dot(blocks, edges, out_path='simple.dot'):
    dot = Digraph(name="CFG", format='dot')
    dot.attr('node', shape='box')

    # Add basic blocks with proper labels
    for ip in blocks:
        dot.node(str(ip))

    # Add edges
    for src, dst in edges:
        dot.edge(str(src), str(dst))

    dot.save(filename=out_path)

basic_blocks, edges = build_simple_cfg(trace, disasm)
write_simple_dot(basic_blocks, edges)

import networkx as nx
from networkx.drawing.nx_pydot import read_dot, write_dot

def condense_linear_blocks(disasm, dot_input='simple.dot', dot_output='condensed.dot'):
    G = read_dot(dot_input)
    G = nx.DiGraph(G)  # ensure it's directed

    skip = set()

    for node in list(G.nodes):
        if node in skip or node not in G:
            continue

        path = [node]
        current = node

        while True:
            # preds = list(G.predecessors(current))
            succs = list(G.successors(current))

            # break if not exactly one successor or the successor has multiple preds
            if len(succs) != 1:
                break
            next_node = succs[0]
            if len(list(G.predecessors(next_node))) != 1:
                break
            path.append(next_node)
            current = next_node
            skip.add(current)

        if len(path) > 1:
            new_node = "\n".join(n + "|" + disasm.get(int(n), "???") for n in path).replace(':', '-')
            G.add_node(new_node)

            # edges from predecessors of first node
            for pred in G.predecessors(path[0]):
                G.add_edge(pred, new_node)

            # edges to successors of last node
            for succ in G.successors(path[-1]):
                G.add_edge(new_node, succ)

            for n in path:
                G.remove_node(n)
        else:
            new_node = path[0] + "|" + disasm.get(int(path[0]), "???")
            G.add_node(new_node)
            for pred in G.predecessors(path[0]):
                G.add_edge(pred, new_node)
            for succ in G.successors(path[0]):
                G.add_edge(new_node, succ)
            G.remove_node(path[0])

    write_dot(G, dot_output)

condense_linear_blocks(disasm)

This script loads the disassembly and the pintool trace to build linkages for jump targets and builds a dot file which I used in the browser to build an idea of what the code is doing.

All this could have been avoided if I would have written a CPU module in ghidra

I then fixed up the disassembler to generate as close to python as possible and then manually fixed the loops, if/else in the code to get this

def vm_main():
    global buf
    r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 = [BitVecVal(0, 8)] * 16
    r12 = 0
    while r12 < 16:
        r2 = 160
        r2 += r12
        r0 = buf[r2]  # MOVB R0, BUF[R2]
        r2 += 16      # r2 = 176 + r12
        r1 = buf[r2]  # MOVB R1, BUF[R2]
        r2 += 16      # r2 = 192 + r12
        buf[240] = r2  # MOV BUF[240], R2
        r0 ^= r1      # XOR R0, R1
        buf[buf[240]] = r0  # MOVB BUF[BUF[240]], R0
        buf[r2] = r0
        r12 += 1

    for r3 in range(0, 32, 16):
        r0 = 0
        for r12 in range(16):
            r1 = r0
            for r13 in range(8):
                r11 = 72
                r11 += r3
                r11 += r13
                r2 = buf[r11]  # MOVB R2, BUF[R11]
                r6 = r2
                r6 &= 240
                r6 >>= 4
                r7 = r2
                r7 &= 15
                r11 = 192
                r11 += r1
                r4 = buf[r11]  # MOVB R4, BUF[R11]
                r11 = 151
                r11 -= r13
                r5 = buf[r11]  # MOVB R5, BUF[R11]
                r4 &= r5
            
                if r4 != 0:
                    r4 = 128
                    r4 += r6
                    r2 = buf[r4]  # MOVB R2, BUF[R4]
                    r2 &= 15
                    r2 <<= 4
                    r4 = 128
                    r4 += r7
                    r5 = buf[r4]  # MOVB R5, BUF[R4]
                    r5 &= 240
                    r5 >>= 4
                    r2 |= r5
                else:
                    r4 = 128
                    r4 += r6
                    r2 = buf[r4]  # MOVB R2, BUF[R4]
                    r2 &= 240
                    r4 = 128
                    r4 += r7
                    r5 = buf[r4]  # MOVB R5, BUF[R4]
                    r5 &= 15
                    r2 |= r5
                
                r4 = 192
                r4 += r0
                r5 = buf[r4]  # MOVB R5, BUF[R4]
                r2 ^= r5
                r4 = r2
                r2 = 0
                
                r5 = 144
                r6 = buf[r5]  # MOVB R6, BUF[R5]
                r6 &= r4
                r6 >>= 3
                r2 |= r6
                
                r5 += 1
                r6 = buf[r5]  # MOVB R6, BUF[R5]
                r6 &= r4
                r6 >>= 4
                r2 |= r6
                
                r5 += 1
                r6 = buf[r5]  # MOVB R6, BUF[R5]
                r6 &= r4
                r6 <<= 2
                r2 |= r6
                
                r5 += 1
                r6 = buf[r5]  # MOVB R6, BUF[R5]
                r6 &= r4
                r6 >>= 1
                r2 |= r6
                
                r5 += 1
                r6 = buf[r5]  # MOVB R6, BUF[R5]
                r6 &= r4
                r6 <<= 2
                r2 |= r6
                
                r5 += 1
                r6 = buf[r5]  # MOVB R6, BUF[R5]
                r6 &= r4
                r6 <<= 4
                r2 |= r6
                
                r5 += 1
                r6 = buf[r5]  # MOVB R6, BUF[R5]
                r6 &= r4
                r6 >>= 1
                r2 |= r6
                
                r5 += 1
                r6 = buf[r5]  # MOVB R6, BUF[R5]
                r6 &= r4
                r6 <<= 1
                r2 |= r6
                
                r11 = 64
                r11 += r3
                r10 = 144
                r9 = 152
                
                # Based on the value of r13, jump to different parts
                r4 = r13
                r4 += 7
                r4 %= 8
                
                # Execution continues at label 570
                for kk in range(8):
                    r5 = r2
                    r6 = r11
                    r6 += r4
                    r6 = buf[r6]  # MOVB R6, BUF[R6]
                    r7 = buf[r10]  # MOVB R7, BUF[R10]
                    r8 = buf[r9]  # MOVB R8, BUF[R9]
                    r5 ^= r6
                    r5 &= r7
                    r6 &= r8
                    r5 |= r6
                    buf[240] = r11
                    buf[240] += r4
                    buf[buf[240]] = r5  # MOVB BUF[BUF[240]], R5
                    buf[r11+r4] = r5
                    r10 += 1
                    r9 += 1
                    if kk == 0:
                        r4 = r13
                        r4 += 6
                        r4 %= 8
                    elif kk == 1:
                        r4 = r13
                        r4 += 2
                        r4 %= 8
                    elif kk == 2:
                        r4 = r13
                        r4 += 1
                        r4 %= 8
                    elif kk == 3:
                        r4 = r13
                        r4 += 5
                        r4 %= 8
                    elif kk == 4:
                        r4 = r13
                        r4 += 0
                        r4 %= 8
                    elif kk == 5:
                        r4 = r13
                        r4 += 3
                        r4 %= 8
                    elif kk == 6:
                        r4 = r13
                        r4 += 4
                        r4 %= 8
                
                # Check if r13 == 7
                if r13 < 7:
                    r0 += 1
                    r0 %= 16

        
            # Swap bytes in a range of memory
            r4 = 64
            r4 += r3
            r5 = 72
            r5 += r3
            r13 = 0
            
            while r13 < 8:
                r6 = buf[r4]  # MOVB R6, BUF[R4]
                r7 = buf[r5]  # MOVB R7, BUF[R5]
                buf[240] = r4
                buf[buf[240]] = r7  # MOVB BUF[BUF[240]], R7
                buf[r4] = r7
                buf[240] = r5
                buf[buf[240]] = r6  # MOVB BUF[BUF[240]], R6
                buf[r5] = r6
                r4 += 1
                r5 += 1
                r13 += 1
        
        # r12 += 1
        # if r12 == 16:
        #     # End inner loop, move to next r3
        #     continue
        
        # Another swap operation
        r4 = 64
        r4 += r3
        r5 = 72
        r5 += r3
        r13 = 0
        
        while r13 < 8:
            r6 = buf[r4]  # MOVB R6, BUF[R4]
            r7 = buf[r5]  # MOVB R7, BUF[R5]
            buf[240] = r4
            buf[buf[240]] = r7  # MOVB BUF[BUF[240]], R7
            buf[r4] = r7
            buf[240] = r5
            buf[buf[240]] = r6  # MOVB BUF[BUF[240]], R6
            buf[r5] = r6
            r4 += 1
            r5 += 1
            r13 += 1
        
        # Move data around in memory
        r13 = 0
        while r13 < 16:
            r4 = r13
            r4 += 9
            r4 %= 16
            r5 = 192
            r5 += r13
            r6 = 208
            r6 += r4
            r4 = buf[r5]  # MOVB R4, BUF[R5]
            buf[240] = r6
            buf[buf[240]] = r4  # MOVB BUF[BUF[240]], R4
            buf[r6] = r4
            r13 += 1
        
        # More memory operations
        r13 = 0
        while r13 < 16:
            r4 = 192
            r4 += r13
            r5 = 208
            r5 += r13
            r6 = buf[r5]  # MOVB R6, BUF[R5]
            buf[240] = r4
            buf[buf[240]] = r6  # MOVB BUF[BUF[240]], R6
            buf[r4] = r6
            r13 += 1

So the overall logic is to transform the input in chunks of 16 bytes and then compare the output to a known transformed value

682: DA46069A: CMP BUF[64] vs i220|
683: 57F2FCF0: JMP IF (flags & i2) -> jmp R14|796|
684: 17B3BDB7: JMP IF (flags & i4) -> jmp R14|796|
685: 20BCFDD3: CMP BUF[65] vs i111|
686: E5404E42: JMP IF (flags & i2) -> jmp R14|796|
687: FE5A545E: JMP IF (flags & i4) -> jmp R14|796|
688: C05D1F0B: CMP BUF[66] vs i86|
689: 06A3ADA1: JMP IF (flags & i2) -> jmp R14|796|
690: 55F1FFF5: JMP IF (flags & i4) -> jmp R14|796|

Without looking at the structure of the transformations I symbolized everything and let the z3 script run overnight.

So then I manually started to analyze and rename parts of the code, with removing parts of the code and registers - the manual decompiler pipeline. I also fixed up the pintool to yield the transformed values and I built a db of known transformations - input to output from the binary I just fixed the cmp instruction in the pintool to not fail any byte check for the transformed output based on the value of the vip register.

void count_eq(CONTEXT *ctxt)
{
    ADDRINT ecx = PIN_GetContextReg(ctxt, REG_RCX);
    ADDRINT esi = PIN_GetContextReg(ctxt, REG_RSI);
    if (fix_check) {
        outFile << "CMP " << "ECX: 0x" << std::hex << ecx << " | ESI: 0x" << esi << std::endl;
        ADDRINT regval;
        PIN_GetContextRegval(ctxt, REG_EFLAGS, reinterpret_cast<UINT8 *>(&regval));
        regval = regval | (1 << 6);
        PIN_SetContextRegval(ctxt, REG_EFLAGS, reinterpret_cast<UINT8 *>(&regval));
        PIN_ExecuteAt(ctxt);
    }
}
void dump_ins(CONTEXT *ctxt)
{
    ADDRINT rax = PIN_GetContextReg(ctxt, REG_RAX);
    if (rax >= 682 && rax <= 775) {
        fix_check = TRUE;
    } else {
        fix_check = FALSE;
    }
}

    // usage
    if (INS_Address(ins) == l + 0x11798)
    {
        INS_InsertCall(ins, IPOINT_AFTER, (AFUNPTR)count_eq, IARG_CONTEXT, IARG_END);
    }
    if (INS_Address(ins) == l + 0x11B85)
    {
        INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)dump_ins, IARG_CONST_CONTEXT, IARG_END);
    }

Before I got to the point of the disassembly I thought that maybe I could side channel it - by counting the cmp instruction. But after the decompilation I could see diffusion, permutations in a way that a single bit reflected in all of the output bytes.

I also lifted the code to LLVM IR and fed it to KLEE to see if I could solve it - but that also did not pan out very well.

So the only option left was to analyze the transformation and optimize the code to get to a more readable state - this is where I spent most of my time.

Eventually coming to

def feistel_transform(input_str, second_round=False):
    s_box = [199, 242, 126, 169, 227, 219, 176, 4, 44, 109, 49, 26, 150, 79, 88, 133]
    round_keys = [70, 252, 99, 237, 166, 65, 135, 113, 220, 141, 64, 156, 218, 97, 218, 26]
    if second_round:
        round_keys = round_keys[7:] + round_keys[:7]
    assert len(input_str) == 16
    byte_data = list(bytearray(input_str, "utf-8"))
    left_half, right_half = byte_data[:8], byte_data[8:]
    round_idx = 0
    for _ in range(16):
        current_idx = round_idx

        for nibble_idx in range(8):
            high_nibble = (right_half[nibble_idx] & 0xF0) >> 4
            low_nibble = right_half[nibble_idx] & 0x0F

            swap_nibbles = round_keys[current_idx] & (1 << nibble_idx)
            transformed_high = s_box[high_nibble]
            transformed_low = s_box[low_nibble]
            if swap_nibbles != 0:
                transformed_high &= 0x0F
                transformed_high <<= 4
                transformed_low &= 0xF0
                transformed_low >>= 4
            else:
                transformed_high &= 0xF0
                transformed_low &= 0x0F
            round_output = transformed_high | transformed_low

            round_output ^= round_keys[round_idx]
            permutation_map = [7, 6, 2, 1, 5, 0, 3, 4]
            bit_swap_rules = [
                (0, 1), (1, 0), (2, 6), (3, 5), (4, 3), (5, 7), (6, 2), (7, 4), ]
            swap_dict = {dest: src for src, dest in bit_swap_rules}
            for perm_idx in range(8):
                target_idx = (nibble_idx + permutation_map[perm_idx]) % 8
                left_half[target_idx] ^= (
                    (round_output & (1 << swap_dict[7 - perm_idx]))
                    >> swap_dict[7 - perm_idx]
                ) << (7 - perm_idx)
            if nibble_idx < 7:
                round_idx = (round_idx + 1) % 16
        left_half, right_half = right_half, left_half

    left_half, right_half = right_half, left_half

    return left_half + right_half

Yes. Its a feistel like construction - I optimized the byte swaps and operations to halfs. I also looked at the bit permutation and how each byte of right half was tranformed, permuted and xored to bits of the left half. I did a lot of reading in the meanwhile trying to guess if this is a known algo.

Once I knew its feistel I could invert it - we just had to invert the order of the round_keys is what I called.

def i_feistel_transform(input, second_round=False):
    s_box = [199, 242, 126, 169, 227, 219, 176, 4, 44, 109, 49, 26, 150, 79, 88, 133]
    round_keys = [ 70, 252, 99, 237, 166, 65, 135, 113, 220, 141, 64, 156, 218, 97, 218, 26]
    if second_round:
        round_keys = round_keys[7:] + round_keys[:7]
    assert len(input) == 16
    byte_data = list(input)
    left_half, right_half = byte_data[:8], byte_data[8:]

    round_idx = 0
    for xx in range(16):
        current_idx = [9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12, 5, 14, 7, 0][xx]

        for nibble_idx in range(7, -1, -1):
            high_nibble = (right_half[nibble_idx] & 0xF0) >> 4
            low_nibble = right_half[nibble_idx] & 0x0F

            swap_nibbles = round_keys[current_idx] & (1 << nibble_idx)
            transformed_high = s_box[high_nibble]
            transformed_low = s_box[low_nibble]
            if swap_nibbles != 0:
                transformed_high &= 0x0F
                transformed_high <<= 4
                transformed_low &= 0xF0
                transformed_low >>= 4
            else:
                transformed_high &= 0xF0
                transformed_low &= 0x0F
            round_output = transformed_high | transformed_low

            round_output ^= round_keys[round_idx]
            permutation_map = [7, 6, 2, 1, 5, 0, 3, 4]
            bit_swap_rules = [
                (0, 1), (1, 0), (2, 6), (3, 5), (4, 3), (5, 7), (6, 2), (7, 4), ]
            swap_dict = {dest: src for src, dest in bit_swap_rules}
            for perm_idx in range(7, -1, -1):
                target_idx = (nibble_idx + permutation_map[perm_idx]) % 8
                left_half[target_idx] ^= (
                    (round_output & (1 << swap_dict[7 - perm_idx]))
                    >> swap_dict[7 - perm_idx]
                ) << (7 - perm_idx)
            if nibble_idx != 0:
                round_idx = (round_idx - 1) % 16
        left_half, right_half = right_half, left_half

    left_half, right_half = right_half, left_half

    return left_half + right_half

if __name__ == "__main__":
    expected = [0xDC,0x6F,0x56,0xC2,0xD3,0x4E,0x50,0x77,0x69,0xB2,0x4D,0xE9,0x8F,0xF5,0x2D,0x12,0x2A,0x95,0x81,0xE5,0x35,0xDB,0x8A,0x10,0x14,0xFF,0xA7,0x4,0xD0,0xC1,0xE5,0x74]
    print(
        "".join(
            map(
                chr, i_feistel_transform(expected[:16])
                + i_feistel_transform(expected[16:32], True), )
        )
    )

Which gave us the flag

()v3Ng34nce_I5_4n_1D10T5_g4m3_()