develooper Front page | perl.perl5.porters | Postings from October 2014

[perl #122872] [PATCH] fix SV body docs in sv.c & PURIFY, cleanup body API, add U32PBYTE macros

From:
bulk88 via RT
Date:
October 11, 2014 02:37
Subject:
[perl #122872] [PATCH] fix SV body docs in sv.c & PURIFY, cleanup body API, add U32PBYTE macros
Message ID:
rt-4.0.18-15650-1412995031-807.122872-15-0@perl.org
On Sat Oct 04 00:58:36 2014, bulk88 wrote:
> I will revise it so the internals of how to assign a type to an SV
> head is hidden and move the assert to somewhere else.

New patch attached. Details of setting the type of an SV were placed behind an api/macro.

On ARM, setting the SV type previously took 4 instructions; after the patch it takes 1 instruction.
The ARM code below, from EVC 4 -01, is for this area of code. I am not sure if
"new_type_details = bodies_by_type + new_type;" is in the branch or if the optimizer moved it somewhere else.
-------------------------------------------------
    new_type_details = bodies_by_type + new_type;

    SvTYPE_set_mem(sv, new_type);

    /* This can't happen, as SVt_NULL is <= all values of new_type, so one of
       the return statements above will have triggered.  */
    assert (new_type != SVt_NULL);
    switch (new_type) {
    case SVt_IV:
-------------------------------------------------
A */star next to an instruction address marks an instruction related to setting the SV type.

old
-------------------------------------------------
280CAFF0                 MOV     LR, #0          ; U32 null = 0x00000000;
.........
;start of branch
280CB0E4*                 STRB    LR, [R11,#8]    ; *(U8*)(R11+sv_flags) = (U8)null;
280CB0E8                 MOV     R3, R7,LSL#3
280CB0EC*                 LDR     R0, [R11,#8]    ;  U32  old_sv_flags  = *(U32*)(R11+sv_flags);
280CB0F0                 SUB     R2, R7, #1
280CB0F4                 CMP     R2, #0xE
280CB0F8*                 ORR     R1, R0, R7      ; new_sv_flags = old_sv_flags | new_type;
280CB0FC                 ADD     R6, R3, R8
280CB100*                 STR     R1, [R11,#8]    ; *(U32*)(R11+sv_flags) = new_sv_flags;
280CB104                 BHI     loc_280CB428    ; jump if higher than unsigned
280CB108                 MOV     R0, R2,LSL#1
280CB10C                 ADD     R0, R0, PC
280CB110                 LDRH    R0, [R0,#4]     ; R0 = *(U16*)(R0+4);
280CB114                 ADD     PC, PC, R0      ;instruction_pointer += R0; //jump table
280CB118                 ; an inline in machine code U16 array lives here for the jump table
.............
280CB428 loc_280CB428:
280CB428                 LDR     R0, ="panic: sv_upgrade to unknown type %lu"
280CB42C                 MOV     R1, R7          ; arg_2 = new_type;
280CB430                 BL      Perl_croak
-------------------------------------------------
new
-------------------------------------------------
;start of branch
280CB0EC                 MOV     R2, R7,LSL#3
280CB0F0*                 STRB    R7, [R11,#8]    ; store byte in R7(new_type) to *(U8*)(R11+sv_flags)
280CB0F4                 SUB     R1, R7, #1
280CB0F8                 AND     R0, R7, #0xFF   ; PURPOSE UNKNOWN, not in old asm, R0's val never used again compiler bug?
280CB0FC                 ADD     R6, R2, R8
280CB100                 CMP     R1, #0xE
280CB104                 BHI     loc_280CB44C    ;jump if higher than unsigned
280CB108                 MOV     R0, R1,LSL#1    ; R0 is wiped
280CB10C                 ADD     R0, R0, PC
280CB110                 LDRH    R0, [R0,#4]     ; R0 = *(U16*)(R0+4);
280CB114                 ADD     PC, PC, R0      ; instruction_pointer += R0; //jump table
280CB118                 ; an inline in machine code U16 array lives here for the jump table
.............
280CB44C loc_280CB44C:
280CB44C                 LDR     R0, ="panic: sv_upgrade to unknown type %lu" ; R0 is wiped
280CB450                 MOV     R1, R7          ; arg_2 = new_type;
280CB454                 BL      Perl_croak
-------------------------------------------------

In any case, how is STRB implemented on ARM? Obviously I don't have access to a commercial ARM core's source code, so a FOSS ARM CPU will have to do.

------------------------------------------------
assign store_op             = mem_op && !instruction[20];
................................................
        // Load & Store instructions
        if ( mem_op )
            begin
            saved_current_instruction_wen   = 1'd1; // Save the memory access instruction to refer back to later
            pc_wen_nxt                      = 1'd0; // hold current PC value
            data_access_exec_nxt            = 1'd1; // indicate that its a data read or write, 
                                                    // rather than an instruction fetch
            alu_out_sel_nxt                 = 4'd1; // Add
            
            if ( !instruction[23] )  // U: Subtract offset
                begin
                alu_cin_sel_nxt  = 2'd1; // cin = 1
                alu_not_sel_nxt  = 1'd1; // invert B
                end
//bulk88 says bit 20 is difference between load and store, that was set earlier
            if ( store_op )
                begin
                write_data_wen_nxt = 1'd1;
 //bulk88 says bit 22 in ARM op is byte or 32 bit flag, if 1, then this is byte ARM op
//bulk88 says TRANS is the ARM instruction prefix that says this is a load/store class op
//bulk88 says STR, STRB, LDR, LDRB all have TRANS prefix
                if ( type == TRANS && instruction[22] )
                    byte_enable_sel_nxt = 2'd1;         // Save byte
                end
                
                // need to update the register holding the address ?
                // This is Rn bits [19:16]
//bulk88 says pre-indexed/post-indexed is a feature where, in a load/store, the addr reg operand is changed during the op
//bulk88 says this makes incrementing an array pointer during a loop part of the array deref op
//bulk88 says can be "reg_dst = *(reg_src = reg_src+const_offset)"
//bulk88 says can be "reg_dst = *((reg_invisible = reg_src), (reg_src = reg_src+const_offset), (reg_invisible))"
//bulk88 says or normal idea of a move "reg_dst = *(reg_src+const_offset)"
            if ( mem_op_pre_indexed || mem_op_post_indexed )
                begin
                // Check is the load destination is the PC
                if ( o_rn_sel_nxt  == 4'd15 )
                    pc_sel_nxt = 2'd1; 
                else                     
                    reg_bank_wsel_nxt = o_rn_sel_nxt;
                end
                
                // if post-indexed, then use Rn rather than ALU output, as address
            if ( mem_op_post_indexed )
               address_sel_nxt = 4'd4; // Rn
            else   
               address_sel_nxt = 4'd1; // alu out
               
            if ( instruction[25] && type ==  TRANS )
                barrel_shift_data_sel_nxt = 2'd2; // Shift value from Rm register
                
            if ( type == TRANS && instruction[25] && shift_imm != 5'd0 ) 
                begin   
                barrel_shift_function_nxt   = instruction[6:5];
                barrel_shift_amount_sel_nxt = 2'd2; // imm_shift_amount
                end
            end
.....................................
wire [31:0]                 write_data_word;
......................................
input      [3:0]                    i_byte_enable,
......................................
wire [CACHE_LINE_WIDTH-1:0] write_hit_wdata;
......................................
wire [31:0]         rd;
......................................
// bulk88 says, this takes the lowest 2 bits (for U8s), or the 2nd-to-lowest bit (for U16s), of
// the output address, and picks a concat opcode to use later.
// Note, this statement supports a U16 CPU-internal opcode, but this CPU doesn't
// support the LDRH or STRH machine code instructions, since
// this is an ARM V2 CPU; U16 support was added in ARM V4. Orig comments ahead.
// ========================================================
// Byte Enable Select
// ========================================================
assign byte_enable_nxt = i_byte_enable_sel == 2'd0  ? 4'b1111 :  // word write
                         i_byte_enable_sel == 2'd2  ?            // halfword write
                         ( o_address_nxt[1] == 1'd0 ? 4'b0011 : 
                                                      4'b1100  ) :
                           
                         o_address_nxt[1:0] == 2'd0 ? 4'b0001 :  // byte write
                         o_address_nxt[1:0] == 2'd1 ? 4'b0010 :
                         o_address_nxt[1:0] == 2'd2 ? 4'b0100 :
                                                      4'b1000 ;

//bulk88 says this statement says: if it's a byte op, take the lowest byte and copy it to all 4 bytes of the word.
//bulk88 says this later allows the concating to work, in that no matter which of the 4 bytes is selected
//bulk88 says to be spliced into the output U32 word, they will all be the same. Note this statement
//bulk88 says is not short/U16 compatible.
// ========================================================
// Write Data Select
// ========================================================
assign write_data_nxt = i_byte_enable_sel == 2'd0 ? rd            :
                                                    {4{rd[ 7:0]}} ;
......................................
//bulk88 says {} is concat operator, this expression picks then concats/fuses 
//the U8s, U16s or U32 that make up the output U32, using the opcode from above
//NOTICE it does not use "&" then "|" to get the job done the way Perl was doing it. Orig comments ahead.
// Use Byte Enables
assign write_data_word  = i_byte_enable == 4'b0001 ? { o_read_data[31: 8], i_write_data[ 7: 0]                   } :
                          i_byte_enable == 4'b0010 ? { o_read_data[31:16], i_write_data[15: 8], o_read_data[ 7:0]} :
                          i_byte_enable == 4'b0100 ? { o_read_data[31:24], i_write_data[23:16], o_read_data[15:0]} :
                          i_byte_enable == 4'b1000 ? {                     i_write_data[31:24], o_read_data[23:0]} :
                          i_byte_enable == 4'b0011 ? { o_read_data[31:16], i_write_data[15: 0]                   } :
                          i_byte_enable == 4'b1100 ? {                     i_write_data[31:16], o_read_data[15:0]} :
                                                     i_write_data                                                  ;
.....................................
assign write_hit_wdata  = i_address[3:2] == 2'd0 ? {hit_rdata[127:32], write_data_word                   } :
                          i_address[3:2] == 2'd1 ? {hit_rdata[127:64], write_data_word, hit_rdata[31:0]  } :
                          i_address[3:2] == 2'd2 ? {hit_rdata[127:96], write_data_word, hit_rdata[63:0]  } :
                                                   {                   write_data_word, hit_rdata[95:0]  } ;
.....................................
    // Data comes in off the WB bus in wrap4 with the missed data word first
assign data_wdata       = write_hit && c_state == CS_IDLE ? write_hit_wdata : read_miss_wdata;
------------------------------------------------

So "STRB" is implemented in hardware, not with extra opcodes.
The algorithm is: if it is a byte write, take the src register, take its low byte, copy it to bytes 1-3 of the src register's value, take the address,
use its low 2 bits to figure out the alignment, then splice the 2 32-bit values together, then splice the resulting 32-bit value into the 16-byte cache line.

-- 
bulk88 ~ bulk88 at hotmail.com

---
via perlbug:  queue: perl5 status: open
https://rt.perl.org/Ticket/Display.html?id=122872



nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About