Front page | perl.perl5.porters |
Postings from October 2014
[perl #122872] [PATCH] fix SV body docs in sv.c & PURIFY, cleanup body API, add U32PBYTE macros
From:
bulk88 via RT
Date:
October 11, 2014 02:37
Subject:
[perl #122872] [PATCH] fix SV body docs in sv.c & PURIFY, cleanup body API, add U32PBYTE macros
Message ID:
rt-4.0.18-15650-1412995031-807.122872-15-0@perl.org
On Sat Oct 04 00:58:36 2014, bulk88 wrote:
> I will revise it so the internals of how to assign a type to an SV
> head is hidden and move the assert to somewhere else.
New patch attached. Details of setting the type of an SV were placed behind an api/macro.
On ARM, setting SV type was 4 instructions previously, now it is 1 instruction after the patch.
Below is the ARM code generated by EVC 4 at -O1 for this area of code. I am not sure whether
"new_type_details = bodies_by_type + new_type;" is in the branch or whether the optimizer moved it somewhere else.
-------------------------------------------------
new_type_details = bodies_by_type + new_type;
SvTYPE_set_mem(sv, new_type);
/* This can't happen, as SVt_NULL is <= all values of new_type, so one of
the return statements above will have triggered. */
assert (new_type != SVt_NULL);
switch (new_type) {
case SVt_IV:
-------------------------------------------------
A */star next to an instruction address marks an instruction related to setting the SV type.
old
-------------------------------------------------
280CAFF0 MOV LR, #0 ; U32 null = 0x00000000;
.........
;start of branch
280CB0E4* STRB LR, [R11,#8] ; *(U8*)(R11+sv_flags) = (U8)null;
280CB0E8 MOV R3, R7,LSL#3
280CB0EC* LDR R0, [R11,#8] ; U32 old_sv_flags = *(U32*)(R11+sv_flags);
280CB0F0 SUB R2, R7, #1
280CB0F4 CMP R2, #0xE
280CB0F8* ORR R1, R0, R7 ; new_sv_flags = old_sv_flags | new_type;
280CB0FC ADD R6, R3, R8
280CB100* STR R1, [R11,#8] ; *(U32*)(R11+sv_flags) = new_sv_flags;
280CB104 BHI loc_280CB428 ; jump if higher than unsigned
280CB108 MOV R0, R2,LSL#1
280CB10C ADD R0, R0, PC
280CB110 LDRH R0, [R0,#4] ; R0 = *(U16*)(R0+4);
280CB114 ADD PC, PC, R0 ;instruction_pointer += R0; //jump table
280CB118 ; an inline in machine code U16 array lives here for the jump table
.............
280CB428 loc_280CB428:
280CB428 LDR R0, ="panic: sv_upgrade to unknown type %lu"
280CB42C MOV R1, R7 ; arg_2 = new_type;
280CB430 BL Perl_croak
-------------------------------------------------
new
-------------------------------------------------
;start of branch
280CB0EC MOV R2, R7,LSL#3
280CB0F0* STRB R7, [R11,#8] ; store byte in R7(new_type) to *(U8*)(R11+sv_flags)
280CB0F4 SUB R1, R7, #1
280CB0F8 AND R0, R7, #0xFF ; PURPOSE UNKNOWN, not in old asm, R0's val never used again compiler bug?
280CB0FC ADD R6, R2, R8
280CB100 CMP R1, #0xE
280CB104 BHI loc_280CB44C ;jump if higher than unsigned
280CB108 MOV R0, R1,LSL#1 ; R0 is wiped
280CB10C ADD R0, R0, PC
280CB110 LDRH R0, [R0,#4] ; R0 = *(U16*)(R0+4);
280CB114 ADD PC, PC, R0 ; instruction_pointer += R0; //jump table
280CB118 ; an inline in machine code U16 array lives here for the jump table
.............
280CB44C loc_280CB44C:
280CB44C LDR R0, ="panic: sv_upgrade to unknown type %lu" ; R0 is wiped
280CB450 MOV R1, R7 ; arg_2 = new_type;
280CB454 BL Perl_croak
-------------------------------------------------
In any case, how is STRB implemented on ARM? Obviously I don't have access to a commercial ARM core's source code, so a FOSS ARM CPU will have to do.
------------------------------------------------
assign store_op = mem_op && !instruction[20];
................................................
// Load & Store instructions
if ( mem_op )
begin
saved_current_instruction_wen = 1'd1; // Save the memory access instruction to refer back to later
pc_wen_nxt = 1'd0; // hold current PC value
data_access_exec_nxt = 1'd1; // indicate that its a data read or write,
// rather than an instruction fetch
alu_out_sel_nxt = 4'd1; // Add
if ( !instruction[23] ) // U: Subtract offset
begin
alu_cin_sel_nxt = 2'd1; // cin = 1
alu_not_sel_nxt = 1'd1; // invert B
end
//bulk88 says bit 20 is difference between load and store, that was set earlier
if ( store_op )
begin
write_data_wen_nxt = 1'd1;
//bulk88 says bit 22 in ARM op is byte or 32 bit flag, if 1, then this is byte ARM op
//bulk88 says TRANS the ARM instruction prefix that says this is a load/store class op
//bulk88 says STR, STRB, LDR, LDRB all have TRANS prefix
if ( type == TRANS && instruction[22] )
byte_enable_sel_nxt = 2'd1; // Save byte
end
// need to update the register holding the address ?
// This is Rn bits [19:16]
//bulk88 says pre-indexed/post-indexed is a feature where in a load/store addr reg operand is changed during the op
//bulk88 says this makes incrementing a array pointer during a loop be part of the array deref op
//bulk88 says can be "reg_dst = *(reg_src = reg_src+const_offset)"
//bulk88 says can be "reg_dst = *((reg_invisible = reg_src), (reg_src = reg_src+const_offset), (reg_invisible))"
//bulk88 says or normal idea of a move "reg_dst = *(reg_src+const_offset)"
if ( mem_op_pre_indexed || mem_op_post_indexed )
begin
// Check is the load destination is the PC
if ( o_rn_sel_nxt == 4'd15 )
pc_sel_nxt = 2'd1;
else
reg_bank_wsel_nxt = o_rn_sel_nxt;
end
// if post-indexed, then use Rn rather than ALU output, as address
if ( mem_op_post_indexed )
address_sel_nxt = 4'd4; // Rn
else
address_sel_nxt = 4'd1; // alu out
if ( instruction[25] && type == TRANS )
barrel_shift_data_sel_nxt = 2'd2; // Shift value from Rm register
if ( type == TRANS && instruction[25] && shift_imm != 5'd0 )
begin
barrel_shift_function_nxt = instruction[6:5];
barrel_shift_amount_sel_nxt = 2'd2; // imm_shift_amount
end
end
.....................................
wire [31:0] write_data_word;
......................................
input [3:0] i_byte_enable,
......................................
wire [CACHE_LINE_WIDTH-1:0] write_hit_wdata;
......................................
wire [31:0] rd;
......................................
// bulk88 says, this takes lowest 2 bits for U8s, or 2nd to lowest bit for U16s, of
// the output address, and picks a concat opcode to use later
// Note, this statement supports U16 internal to CPU opcode, but this CPU doesn't
// support LDRH or STRH machine code instructions since
// this is a ARM V2 CPU, U16 support was added in ARM V4. Orig comments ahead.
// ========================================================
// Byte Enable Select
// ========================================================
assign byte_enable_nxt = i_byte_enable_sel == 2'd0 ? 4'b1111 : // word write
i_byte_enable_sel == 2'd2 ? // halfword write
( o_address_nxt[1] == 1'd0 ? 4'b0011 :
4'b1100 ) :
o_address_nxt[1:0] == 2'd0 ? 4'b0001 : // byte write
o_address_nxt[1:0] == 2'd1 ? 4'b0010 :
o_address_nxt[1:0] == 2'd2 ? 4'b0100 :
4'b1000 ;
//bulk88 says this statement says: if it's a byte op, take the lowest byte and copy it to all 4 bytes of the word.
//bulk88 says this later allows the concatenation to work, in that no matter which of the 4 bytes is selected
//bulk88 says to be spliced into the output U32 word, they will all be the same. Note that this statement
//bulk88 says is not short/U16 compatible.
// ========================================================
// Write Data Select
// ========================================================
assign write_data_nxt = i_byte_enable_sel == 2'd0 ? rd :
{4{rd[ 7:0]}} ;
......................................
//bulk88 says {} is concat operator, this expression picks then concats/fuses
//the U8s, U16s or U32 that make up the output U32, using the opcode from above
//NOTICE it does not use "&" then "|" to get the job done the way Perl was doing it. Orig comments ahead.
// Use Byte Enables
assign write_data_word = i_byte_enable == 4'b0001 ? { o_read_data[31: 8], i_write_data[ 7: 0] } :
i_byte_enable == 4'b0010 ? { o_read_data[31:16], i_write_data[15: 8], o_read_data[ 7:0]} :
i_byte_enable == 4'b0100 ? { o_read_data[31:24], i_write_data[23:16], o_read_data[15:0]} :
i_byte_enable == 4'b1000 ? { i_write_data[31:24], o_read_data[23:0]} :
i_byte_enable == 4'b0011 ? { o_read_data[31:16], i_write_data[15: 0] } :
i_byte_enable == 4'b1100 ? { i_write_data[31:16], o_read_data[15:0]} :
i_write_data ;
.....................................
assign write_hit_wdata = i_address[3:2] == 2'd0 ? {hit_rdata[127:32], write_data_word } :
i_address[3:2] == 2'd1 ? {hit_rdata[127:64], write_data_word, hit_rdata[31:0] } :
i_address[3:2] == 2'd2 ? {hit_rdata[127:96], write_data_word, hit_rdata[63:0] } :
{ write_data_word, hit_rdata[95:0] } ;
.....................................
// Data comes in off the WB bus in wrap4 with the missed data word first
assign data_wdata = write_hit && c_state == CS_IDLE ? write_hit_wdata : read_miss_wdata;
------------------------------------------------
So "STRB" is implemented in hardware, not with extra opcodes.
The algorithm is: if it is a byte write, take the src register, take its low byte, and copy that byte to bytes 1-3 of the word to be written; take the address and
use its low 2 bits to figure out the alignment; then splice the two 32-bit values (the existing word and the new data) together; then splice the resulting 32-bit value into the 16-byte cache line.
--
bulk88 ~ bulk88 at hotmail.com
---
via perlbug: queue: perl5 status: open
https://rt.perl.org/Ticket/Display.html?id=122872
-
[perl #122872] [PATCH] fix SV body docs in sv.c & PURIFY, cleanup body API, add U32PBYTE macros
by bulk88 via RT