# Source code for jawa.util.bytecode

"""
Utilities for reading & writing JVM method bytecode.
"""
import json
import enum
import pkgutil
from struct import unpack, pack, Struct
from itertools import repeat
from collections import namedtuple

# A single decoded operand: the kind of operand and its integer value.
Operand = namedtuple('Operand', ['op_type', 'value'])
# Plain tuple layout that `Instruction` extends with behaviour.
_Instruction = namedtuple('Instruction', [
    'mnemonic',
    'opcode',
    'operands',
    'pos'
])


class Instruction(_Instruction):
    """
    Represents a single JVM instruction, consisting
    of an opcode and its potential operands.

    An instruction also compares equal to its own mnemonic string, so
    ``ins == 'nop'`` can be used instead of ``ins.mnemonic == 'nop'``.
    """
    __slots__ = ()

    def size_on_disk(self, start_pos=0):
        """
        Returns the size of this instruction and its operands when
        packed. `start_pos` is required for the `tableswitch` and
        `lookupswitch` instruction as the padding depends on alignment.

        :param start_pos: Offset of this instruction's opcode byte within
                          the method's code.
        :return: The number of bytes `write_instruction` emits for this
                 instruction.
        """
        # All instructions are at least 1 byte (the opcode itself)
        size = 1
        fmts = opcode_table[self.opcode]['operands']

        if self.wide:
            # One byte for the WIDE prefix plus a 16-bit local index.
            size += 3
            # Special case for iinc which has a 2nd extended operand.
            if self.opcode == 0x84:
                size += 2
        elif fmts:
            # A simple opcode with simple operands.
            size += sum(fmt.value.size for fmt, _ in fmts)
        elif self.opcode in (0xAB, 0xAA):
            # Both switch instructions pad with zero bytes so that the
            # data following the opcode starts on a 4-byte boundary.
            size += (4 - (start_pos + 1) % 4) % 4
            if self.opcode == 0xAB:
                # lookupswitch: default & npairs, then 8 bytes per pair.
                size += 8 + len(self.operands[0]) * 8
            else:
                # tableswitch: default, low & high, then one 4-byte
                # offset for every operand after those three.
                size += 12 + (len(self.operands) - 3) * 4

        return size

    @property
    def wide(self):
        """
        ``True`` if this instruction needs to be prefixed by the WIDE
        opcode, i.e. one of its operands does not fit the narrow form.
        """
        if not opcode_table[self.opcode].get('can_be_wide'):
            return False

        # The narrow local index is a single unsigned byte (0..255).
        if self.operands[0].value > 0xFF:
            return True

        # The narrow iinc constant is a single signed byte.
        if self.opcode == 0x84:
            if not -128 <= self.operands[1].value <= 127:
                return True

        return False

    @property
    def name(self):
        """Alias for mnemonic."""
        return self.mnemonic

    @property
    def details(self):
        """Extended opcode information."""
        return opcode_table[self.opcode]

    @classmethod
    def create(cls, mnemonic_or_op, operands=None):
        """
        Builds an instruction from a mnemonic or a numeric opcode.

        :param mnemonic_or_op: Key into the opcode table, either the
                               mnemonic string or the opcode number.
        :param operands: Optional list of operands; defaults to an empty
                         list.
        :return: A new `Instruction` positioned at offset 0.
        """
        op = opcode_table[mnemonic_or_op]

        return cls(
            op['mnemonic'],
            op['op'],
            operands or [],
            0
        )

    def __eq__(self, other):
        return other == self.mnemonic or super().__eq__(other)

    def __ne__(self, other):
        # tuple supplies its own __ne__, which knows nothing about the
        # mnemonic shortcut in __eq__; without this override both
        # ``ins == 'nop'`` and ``ins != 'nop'`` would be true.
        if isinstance(other, str):
            return other != self.mnemonic
        return super().__ne__(other)


class OperandTypes(enum.IntEnum):
    """
    Constants used to determine the "type" of operand on an opcode,
    such as a BRANCH [offset] or a LITERAL [value].

    Members are looked up by name when the bytecode definitions are
    loaded, so the names must match the labels used there.
    """
    LITERAL = 10
    LOCAL_INDEX = 20
    CONSTANT_INDEX = 30
    BRANCH = 40
    PADDING = 50


class OperandFmts(enum.Enum):
    """
    Pre-compiled big-endian `struct.Struct` packers for each fixed-size
    operand encoding. Use ``member.value`` to reach the ``Struct`` (for
    ``pack``, ``unpack`` and ``size``).
    """
    # Unsigned / signed 8-bit.
    UBYTE = Struct('>B')
    BYTE = Struct('>b')
    # Unsigned / signed 16-bit.
    USHORT = Struct('>H')
    SHORT = Struct('>h')
    # Signed 32-bit.
    INTEGER = Struct('>i')


def write_instruction(fout, start_pos, ins):
    """
    Writes the single instruction `ins` to `fout`.

    The instruction is fully encoded before anything is written, so a
    value that cannot be packed raises without leaving a partial
    instruction in the stream.

    :param fout: Any file-like object providing ``write()``.
    :param start_pos: The current position in the stream.
    :param ins: The `Instruction` to write.
    """
    opcode, operands = ins.opcode, ins.operands
    fmt_operands = opcode_table[opcode]['operands']

    if ins.wide:
        # The "WIDE" prefix, the real opcode, then a 16-bit local index.
        chunks = [pack('>BBH', 0xC4, opcode, operands[0].value)]
        if opcode == 0x84:
            # iinc additionally carries a signed 16-bit constant.
            chunks.append(pack('>h', operands[1].value))
    elif fmt_operands:
        # A normal simple opcode with simple operands.
        chunks = [pack('>B', opcode)]
        for i, (fmt, _) in enumerate(fmt_operands):
            chunks.append(fmt.value.pack(operands[i].value))
    elif opcode in (0xAB, 0xAA):
        # Both switch instructions are followed by zero bytes so that
        # their data starts on a 4-byte boundary.
        padding = (4 - (start_pos + 1) % 4) % 4
        chunks = [pack('>B', opcode), b'\x00' * padding]
        if opcode == 0xAB:
            # lookupswitch: operands[0] is a {match: offset} mapping and
            # operands[1] the default branch, e.g.
            # assemble([
            #     ('lookupswitch', {
            #         2: -3,
            #         4: 5
            #     }, <default>)
            # ])
            pairs = operands[0]
            chunks.append(pack('>ii', operands[1].value, len(pairs)))
            # Pairs are emitted in ascending order of their match value.
            for key in sorted(pairs.keys()):
                chunks.append(pack('>ii', key, pairs[key]))
        else:
            # tableswitch: default branch, low, high, then every
            # remaining operand as a jump offset.
            chunks.append(pack(
                f'>iii{len(operands) - 3}i',
                operands[0].value,
                operands[1].value,
                operands[2].value,
                *(o.value for o in operands[3:])
            ))
    else:
        # opcode with no operands.
        chunks = [pack('>B', opcode)]

    fout.write(b''.join(chunks))


def read_instruction(fio, start_pos):
    """
    Reads a single instruction from `fio` and returns it, or ``None`` if
    the stream is empty.

    An instruction carrying the WIDE prefix is returned under the opcode
    and mnemonic of the instruction it modifies, with its 16-bit
    operands, so that it can be inspected and written back like any
    other instruction.

    :param fio: Any file-like object providing ``read()``.
    :param start_pos: The current position in the stream.
    :return: The decoded `Instruction`, positioned at `start_pos`.
    """
    op = fio.read(1)

    if not op:
        return None

    op = ord(op)

    ins = opcode_table[op]
    operands = ins['operands']
    name = ins['mnemonic']

    final_operands = []
    # Most opcodes have simple operands.
    if operands:
        for fmt, type_ in operands:
            raw = fio.read(fmt.value.size)
            final_operands.append(Operand(type_, fmt.value.unpack(raw)[0]))
    # Special case for lookupswitch.
    elif op == 0xAB:
        # Get rid of the alignment padding.
        fio.read((4 - (start_pos + 1) % 4) % 4)

        # Default branch address and branch count.
        default, npairs = unpack('>ii', fio.read(8))

        pairs = {}
        for _ in range(npairs):
            match, offset = unpack('>ii', fio.read(8))
            pairs[match] = offset

        # The match table is stored as a plain dict, not an Operand.
        final_operands.append(pairs)
        final_operands.append(Operand(OperandTypes.BRANCH, default))
    # Special case for tableswitch
    elif op == 0xAA:
        # Get rid of the alignment padding.
        fio.read((4 - (start_pos + 1) % 4) % 4)

        default, low, high = unpack('>iii', fio.read(12))
        final_operands.append(Operand(OperandTypes.BRANCH, default))
        final_operands.append(Operand(OperandTypes.LITERAL, low))
        final_operands.append(Operand(OperandTypes.LITERAL, high))

        # One jump offset for every value in the inclusive low..high range.
        for _ in range(high - low + 1):
            offset = unpack('>i', fio.read(4))[0]
            final_operands.append(Operand(OperandTypes.BRANCH, offset))
    # Special case for the wide prefix
    elif op == 0xC4:
        # Report the instruction being widened rather than the prefix so
        # that the opcode and mnemonic of the result agree.
        op = unpack('>B', fio.read(1))[0]
        name = opcode_table[op]['mnemonic']

        final_operands.append(Operand(
            OperandTypes.LOCAL_INDEX,
            unpack('>H', fio.read(2))[0]
        ))
        # Further special case for iinc, whose constant is signed.
        if op == 0x84:
            final_operands.append(Operand(
                OperandTypes.LITERAL,
                unpack('>h', fio.read(2))[0]
            ))

    return Instruction(name, op, final_operands, start_pos)


def load_bytecode_definitions(*, path=None) -> dict:
    """Load bytecode definitions from JSON file.

    If no path is provided the default bytecode.json will be loaded. When
    that default resource is missing, empty or malformed an empty dict is
    returned; problems with an explicitly given `path` are raised.

    :param path: Either None or a path to a JSON file to load containing
                 bytecode definitions.
    :return: One dict holding every definition twice, keyed by mnemonic
             and by numeric opcode.
    """
    if path is not None:
        with open(path, 'rb') as file_in:
            j = json.load(file_in)
    else:
        # Unfortunately our best way to handle missing/malformed/empty
        # bytecode.json files since it may not actually be backed by a
        # "real" file.
        try:
            raw = pkgutil.get_data('jawa.util', 'bytecode.json')
        except OSError:
            # The loader could not find or read the resource.
            return {}

        # get_data() yields None when the package has no usable loader.
        if not raw:
            return {}

        try:
            j = json.loads(raw)
        except (json.JSONDecodeError, UnicodeDecodeError):
            return {}

    for definition in j.values():
        # If the entry has any operands take the text labels and convert
        # them into pre-cached struct objects and operand types.
        operands = definition['operands']
        if operands:
            definition['operands'] = [
                [OperandFmts[fmt_name], OperandTypes[type_name]]
                for fmt_name, type_name, *_ in operands
            ]

    # Return one dict that contains both mnemonic keys and opcode keys.
    return {**j, **{v['op']: v for v in j.values()}}


# Module-wide opcode lookup table, built once at import time from the
# default definitions. Each definition is reachable under both its
# mnemonic and its numeric opcode.
opcode_table = load_bytecode_definitions()