Web lists-archives.com

Re: perf Intel x86_64 : BUG: BRANCH_INSTRUCTIONS / BRANCH_MISSES cannot be combined with CACHE_REFERENCES / CACHE_MISSES .




On 13/02/2018, Jason Vas Dias <jason.vas.dias@xxxxxxxxx> wrote:
> Good day -
>
> I'd much appreciate some advice as to why, on my Intel x86_64
> ( DisplayFamily_DisplayModel : 06_3CH ), running either Linux 4.12.10,
> or Linux 3.10.0, any attempt to count all of :
>      PERF_COUNT_HW_BRANCH_INSTRUCTIONS
>           (or raw config 0xC4) , and
>      PERF_COUNT_HW_BRANCH_MISSES
>           (or raw config 0xC5), and
>      combined with
>      PERF_COUNT_HW_CACHE_REFERENCES
>          (or raw config 0x4F2E ), and
>      PERF_COUNT_HW_CACHE_MISSES
>          (or raw config 0x412E) ,
> results in ALL COUNTERS BEING 0 in a read of the Group FD or
> mmap sample area.
>
> This is demonstrated by the example program, which will
> use perf_event_open() to create a Group Leader FD  for the first event,
> and associate all other events with that Event Group , so that it
> will read all events on the group FD .
>
> The perf_event_open() calls and the ioctl(event_fd, PERF_EVENT_IOC_ID, &id)
> calls all return successfully , but if I combine ANY of
> ( PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
>   PERF_COUNT_HW_BRANCH_MISSES
> ) with any of
> ( PERF_COUNT_HW_CACHE_REFERENCES,
>   PERF_COUNT_HW_CACHE_MISSES
> ) in the Event Group, ALL events have '0' event->value.
>
> Demo :
> 1. Compile program to use kernel mapped Generic Events:
>   $ gcc -std=gnu11 -o perf_bug perf_bug.c
>   Running program shows all counters have 0 values, since both
>   CACHE & BRANCH hits+misses are being requested:
>
>   $ ./perf_bug
>   EVENT: Branch Instructions : 0
>   EVENT: Branch Misses : 0
>   EVENT: Instructions : 0
>   EVENT: CPU Cycles : 0
>   EVENT: Ref. CPU Cycles : 0
>   EVENT: Bus Cycles : 0
>   EVENT: Cache References : 0
>   EVENT: Cache Misses : 0
>
>   NOT registering interest in EITHER the BRANCH counters
>   OR the CACHE counters fixes the problem:
>
>   Compile without registering for BRANCH_INSTRUCTIONS
>   or BRANCH_MISSES:
>   $ gcc -std=gnu11 -DNO_BUG_NO_BRANCH  -o perf_bug perf_bug.c
>   $ ./perf_bug
>   EVENT: Instructions : 914
>   EVENT: CPU Cycles : 4110
>   EVENT: Ref. CPU Cycles : 4437
>   EVENT: Bus Cycles : 152
>   EVENT: Cache References : 1
>   EVENT: Cache Misses : 1
>
>   Compile without registering for CACHE_REFERENCES or CACHE_MISSES:
>   $ gcc -std=gnu11 -DNO_BUG_NO_CACHE  -o perf_bug perf_bug.c
>   $ ./perf_bug
> EVENT: Branch Instructions : 106
> EVENT: Branch Misses : 6
> EVENT: Instructions : 914
> EVENT: CPU Cycles : 4132
> EVENT: Ref. CPU Cycles : 8526
> EVENT: Bus Cycles : 295
>
> The same thing happens if I do not use Generic Events, but rather
> "dynamic raw PMU" events, by putting the hex values from
> /sys/bus/event_source/devices/cpu/events/? into the perf_event_attr
> config, OR'ed with (1<<63), and using the PERF_TYPE_RAW perf_event_attr
> type value :
>
> $ gcc -DUSE_RAW_PMU -o perf_bug perf_bug.c
> $ ./perf_bug
> EVENT: Branch Instructions : 0
> EVENT: Branch Misses : 0
> EVENT: Instructions : 0
> EVENT: CPU Cycles : 0
> EVENT: Ref. CPU Cycles : 0
> EVENT: Bus Cycles : 0
> EVENT: Cache References : 0
> EVENT: Cache Misses : 0
>
>
> $ gcc -DUSE_RAW_PMU -DNO_BUG_NO_BRANCH -o perf_bug perf_bug.c
> $ ./perf_bug
> EVENT: Instructions : 914
> EVENT: CPU Cycles : 4102
> EVENT: Ref. CPU Cycles : 4959
> EVENT: Bus Cycles : 171
> EVENT: Cache References : 2
> EVENT: Cache Misses : 2
>
> $ gcc -DUSE_RAW_PMU -DNO_BUG_NO_CACHE -o perf_bug perf_bug.c
> $ ./perf_bug
> EVENT: Branch Instructions : 106
> EVENT: Branch Misses : 6
> EVENT: Instructions : 914
> EVENT: CPU Cycles : 4108
> EVENT: Ref. CPU Cycles : 10817
> EVENT: Bus Cycles : 373
>
>
> The perf tool itself seems to have the same issue:
>
> With CACHE & BRANCH counters does not work :
> $ perf stat -e '{r0c4,r0c5,r0c0,r03c,r0300,r013c,r04F2E,r0412E}:SIu' sleep 1
>
>  Performance counter stats for 'sleep 1':
>
>      <not counted>      r0c4
>                (0.00%)
>      <not counted>      r0c5
>                (0.00%)
>      <not counted>      r0c0
>                (0.00%)
>      <not counted>      r03c
>                (0.00%)
>      <not counted>      r0300
>                (0.00%)
>      <not counted>      r013c
>                (0.00%)
>      <not counted>      r04F2E
>                (0.00%)
>    <not supported>     r0412E
>
>        1.001652932 seconds time elapsed
>
>    Some events weren't counted. Try disabling the NMI watchdog:
> 	echo 0 > /proc/sys/kernel/nmi_watchdog
> 	perf stat ...
> 	echo 1 > /proc/sys/kernel/nmi_watchdog
>
> Disabling the NMI watchdog makes no difference .
>
> It is very strange that perf thinks 'r0412E' is not supported:
>    $ cat /sys/bus/event_source/devices/cpu/events/cache-misses
>    event=0x2e,umask=0x41
>
> The kernel should not be advertising an unsupported event
> in a /sys/bus/event_source/devices/cpu/events/ file, should it?
>
> So perf stat has the same problem - without either Cache or Branch
> counters seems to work fine:
>
> without cache:
> $ perf stat -e '{r0c4,r0c5,r0c0,r03c,r0300,r013c}:SIu' sleep 1
>
>  Performance counter stats for 'sleep 1':
>
>              37740      r0c4
>               3557      r0c5
>             188552      r0c0
>             311684      r03c
>             360963      r0300
>              12461      r013c
>
>        1.001508109 seconds time elapsed
>
> without branch:
> $ perf stat -e '{r0c0,r03c,r0300,r013c,r04F2E,r0412E}:SIu' sleep 1
>
>  Performance counter stats for 'sleep 1':
>
>             188554      r0c0
>             320242      r03c
>             452748      r0300
>              15633      r013c
>               4145      r04F2E
>               3022      r0412E
>
>        1.001810421 seconds time elapsed
>
> proving again that perf's claim that 'r0412E' is not supported is bogus.
> The Intel SDM's table 19-1 Architectural events, which ALL Intel CPUs
> are meant to support, does include  'Event: 2EH | Umask: 4FH : LLC
> Reference '  and  'Event: 2EH | Umask: 41H : LLC Miss' , as well as :
> 'Event : C4H | Umask: 00H : Branch Instructions Retired' and
> 'Event : C5H | Umask: 00H : Branch Misses Retired' .
> So why can't perf count them all in the same group?
>
> Please , can anyone enlighten me as to what is going on here ?
>
> Why can't I count all of
>    ( BRANCH_INSTRUCTIONS , BRANCH_MISSES ,
>      CACHE_REFERENCES, CACHE_MISSES
>   )
> in the same Perf Event Group ?
>
> Thanks in advance for any replies,
> Best Regards,
> Jason
>

Actually, it appears that ONLY the combination of
'BRANCH_MISSES' and 'CACHE_MISSES' makes
all sampled counter values 0 ; if either counter is
not requested, all other counters have non-zero values.
I've updated the program to reflect this.

And nmi_watchdog=1 DOES make a difference  :
if  nmi_watchdog is > 0 , then ANY combination
of {BRANCH,CACHE}_{REFS,MISSES} makes ALL
sampled counter values be 0, but if nmi_watchdog is 0,
then only the combination of BRANCH_MISSES and
CACHE_MISSES makes all sampled values be 0 .

This is a really nasty bug and makes the Linux PERF facility
rather unusable ; because of this bug, Linux PERF provides
no way of measuring cache and branch prediction performance
at the same time for the same instruction sequence.

Can anyone suggest a valid reason for this ?
Or any workarounds ?

If no-one suggests a workaround or valid reason
I guess I should raise this as a serious bug .

Thanks & Regards,
Jason
/* Demonstration of Linux PERF bug:
 *  Linux is unable to count BRANCH_INSTRUCTIONS or BRANCH_MISSES 
 *  at the same time as CACHE_REFERENCES or CACHE_MISSES.
 */

#include <sys/types.h>
#include <stdint.h>
#include <stdbool.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <linux/perf_event.h>

/*
 * Thin wrapper around the raw perf_event_open system call, invoked through
 * syscall() with __NR_perf_event_open.  All five arguments are forwarded
 * unchanged and the syscall's result is returned narrowed to int.
 */
static int
perf_event_open
( struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags )
{
  return (int) syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}

/*
 * Opens every event of the table below as ONE perf event group (the first
 * event becomes the group leader), counts a short user-space division loop,
 * reads the whole group back through the leader FD and prints one
 * "EVENT: <name> : <count>" line per event.
 *
 * Build-time switches (all optional):
 *   USE_RAW_PMU             use PERF_TYPE_RAW codes instead of generic events
 *   NO_BUG_NO_BRANCH        omit both branch events
 *   NO_BUG_NO_BRANCH_INST   omit only "Branch Instructions"
 *   NO_BUG_NO_BRANCH_MISS   omit only "Branch Misses"
 *   NO_BUG_NO_CACHE         omit both cache events
 *   NO_BUG_NO_CACHE_REF     omit only "Cache References"
 *   NO_BUG_NO_CACHE_MISS    omit only "Cache Misses"
 *
 * Returns 0 when at least one counter is non-zero; returns 1 on any failure
 * or when every counter reads 0.
 */
int main( int argc, const char *const* argv)
{
  (void) argc;   /* the command line is not used */
  (void) argv;

  struct perf_event_attr pea = {0};

  struct evcfg
  { uint64_t perf_type;   /* copied into perf_event_attr.type           */
    uint64_t perf_cfg;    /* copied into perf_event_attr.config         */
    const char *name;     /* label printed in the report                */
    int fd;               /* FD returned by perf_event_open(), -1 before */
    uint64_t id;          /* ID obtained with PERF_EVENT_IOC_ID          */
  } pe [] = {
  /*
   * EV() selects, per build, either the generic kernel event mapping or the
   * raw PMU code.  The raw codes are the ones used by the original demo
   * (Intel SDM architectural events).
   * NOTE(review): bit 63 is OR'ed into every raw config exactly as the
   * original demo did - confirm that the raw encoding actually requires it.
   * Every entry carries a trailing comma, so any combination of the
   * NO_BUG_* switches yields a well-formed initializer list.
   */
#ifndef USE_RAW_PMU
#define EV(generic_id, raw_code, label) \
    { PERF_TYPE_HARDWARE, (generic_id), label, -1, 0 }
#else
#define EV(generic_id, raw_code, label) \
    { PERF_TYPE_RAW, (UINT64_C(1) << 63) | (raw_code), label, -1, 0 }
#endif

#ifndef NO_BUG_NO_BRANCH
#ifndef NO_BUG_NO_BRANCH_INST
    EV(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, 0xC4,   "Branch Instructions"),
#endif
#ifndef NO_BUG_NO_BRANCH_MISS
    EV(PERF_COUNT_HW_BRANCH_MISSES,       0xC5,   "Branch Misses"),
#endif
#endif
    EV(PERF_COUNT_HW_INSTRUCTIONS,        0xC0,   "Instructions"),
    EV(PERF_COUNT_HW_CPU_CYCLES,          0x3C,   "CPU Cycles"),
    EV(PERF_COUNT_HW_REF_CPU_CYCLES,      0x0300, "Ref. CPU Cycles"),
    EV(PERF_COUNT_HW_BUS_CYCLES,          0x013C, "Bus Cycles"),
#ifndef NO_BUG_NO_CACHE
#ifndef NO_BUG_NO_CACHE_REF
    EV(PERF_COUNT_HW_CACHE_REFERENCES,    0x4F2E, "Cache References"),
#endif
#ifndef NO_BUG_NO_CACHE_MISS
    EV(PERF_COUNT_HW_CACHE_MISSES,        0x412E, "Cache Misses"),
#endif
#endif
#undef EV
  };
#define N_EV (sizeof(pe)/sizeof(pe[0]))

  int fd = -1;                 /* group leader FD, -1 until the first open */
  const pid_t pid = getpid();

  /* Open every event; the first one opened becomes the group leader. */
  for (size_t i = 0; i < N_EV; i++)
  {
    memset(&pea, 0, sizeof(pea));
    pea.size   = PERF_ATTR_SIZE_VER5;
    pea.type   = pe[i].perf_type;
    pea.config = pe[i].perf_cfg;
    /* One read() on the leader returns every member's value and ID, plus
     * the group's enabled / running times. */
    pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
                      PERF_FORMAT_TOTAL_TIME_ENABLED |
                      PERF_FORMAT_TOTAL_TIME_RUNNING;
    pea.disabled       = 1;   /* started explicitly by the ENABLE ioctl */
    pea.exclude_kernel = 1;
    pea.exclude_idle   = 1;
    pea.exclude_hv     = 1;

    pe[i].fd = perf_event_open(&pea, pid, -1, fd, 0);
    if (pe[i].fd == -1)
    { fprintf(stderr,"perf_event_open failed : %d : '%s'.\n", errno, strerror(errno));
      return 1;
    }

    if (fd == -1)
      fd = pe[i].fd; /* this is the Group Leader FD */

    if (0 != ioctl(pe[i].fd, PERF_EVENT_IOC_ID, &pe[i].id))
    { fprintf(stderr,"ioctl(fd, PERF_EVENT_IOC_ID) failed for #%zu : %d : '%s'.\n", i, errno, strerror(errno));
      return 1;
    }
  }

  if (0 != ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP))
  { fprintf(stderr,"ioctl(fd, PERF_EVENT_IOC_RESET) failed : %d : '%s'.\n", errno, strerror(errno));
    return 1;
  }

  /* Workload to measure: 100 64-bit divisions.
   * NOTE(review): the results are never used, so an optimizing build may
   * drop this loop; the demo commands compile without -O. */
  uint64_t a_num = 0x0102030405060708;
  uint64_t b_num = ~a_num;
  int cnt = 100;

  if (0 != ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP))
  { fprintf(stderr,"ioctl(fd, PERF_EVENT_IOC_ENABLE) failed : %d : '%s'.\n", errno, strerror(errno));
    return 1;
  }

  do
  { a_num = (b_num /= a_num);
  } while (--cnt);

  if (0 != ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP))
  { fprintf(stderr,"ioctl(fd, PERF_EVENT_IOC_DISABLE) failed : %d : '%s'.\n", errno, strerror(errno));
    return 1;
  }

  /* Layout matching the read_format requested above. */
  struct
  { uint64_t nr, time_enabled, time_running;
    struct event
    { uint64_t value, id;
    } ev[N_EV];
  } events;

  const ssize_t n_read = read(fd, &events, sizeof(events));
  if (n_read == -1)
  { fprintf(stderr,"read of event group leader FD failed : %d : '%s'.\n", errno, strerror(errno));
    return 1;
  }
  if ((size_t) n_read != sizeof(events))
  { /* errno is not meaningful for a short read, so it is not printed. */
    fprintf(stderr,"read of event group leader FD was short : got %zd of %zu bytes.\n", n_read, sizeof(events));
    return 1;
  }

  if (events.nr != N_EV)
  { fprintf(stderr,"unexpected number of events read: %llu\n", (unsigned long long) events.nr);
    return 1;
  }

  /* A group that was enabled but never ran reads back as all zeros; say so
   * explicitly instead of leaving the zeros unexplained. */
  if (events.time_running == 0)
  { fprintf(stderr,
            "warning: event group was never scheduled (time_enabled=%llu, time_running=0); "
            "all counts are 0 because the group did not run.\n",
            (unsigned long long) events.time_enabled);
  }
  else if (events.time_running < events.time_enabled)
  { fprintf(stderr,
            "warning: event group ran for only part of the enabled time "
            "(time_enabled=%llu, time_running=%llu).\n",
            (unsigned long long) events.time_enabled,
            (unsigned long long) events.time_running);
  }

  /* Report each returned value under the name of the event whose ID matches. */
  bool non_zero_event = false;
  for (size_t k = 0; k < N_EV; k++)
  {
    const struct event *ev = &events.ev[k];
    const struct evcfg *match = NULL;

    for (size_t i = 0; i < N_EV; i++)
    { if (pe[i].id == ev->id)
      { match = &pe[i];
        break;
      }
    }
    if (match == NULL)
    { fprintf(stderr,"Kernel returned unknown event ID: %llu\n", (unsigned long long) ev->id);
      return 1;
    }

    printf("EVENT: %s : %llu\n", match->name, (unsigned long long) ev->value);
    if (ev->value != 0)
      non_zero_event = true;
  }

  /* Release the event FDs, group members first, leader (index 0) last. */
  for (size_t i = N_EV; i-- > 0; )
    close(pe[i].fd);

  return (non_zero_event ? 0 : 1);
}