drivers/gpu/arm/t6xx/kbase/src/common/mali_kbase_js_affinity.c

   1 /*
   2  *
   3  * (C) COPYRIGHT ARM Limited. All rights reserved.
   4  *
   5  * This program is free software and is provided to you under the terms of the
   6  * GNU General Public License version 2 as published by the Free Software
   7  * Foundation, and any use by you of this program is subject to the terms
   8  * of such GNU licence.
   9  *
  10  * A copy of the licence is included with the program, and can also be obtained
  11  * from Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  12  * Boston, MA  02110-1301, USA.
  13  *
  14  */
  15
  16
  17
  18
  19
  20 /**
  21  * @file mali_kbase_js_affinity.c
  22  * Base kernel affinity manager APIs
  23  */
  24
  25 #include <kbase/src/common/mali_kbase.h>
  26 #include "mali_kbase_js_affinity.h"
  27
  28 #if defined(CONFIG_MALI_DEBUG) && 0     /* disabled to avoid compilation warnings */
  29
  30 STATIC void debug_get_binary_string(const u64 n, char *buff, const int size)
  31 {
  32         unsigned int i;
  33         for (i = 0; i < size; i++)
  34                 buff[i] = ((n >> i) & 1) ? '*' : '-';
  35
  36         buff[size] = '\0';
  37 }
  38
  39 #define N_CORES 8
  40 STATIC void debug_print_affinity_info(const kbase_device *kbdev, const kbase_jd_atom *katom, int js, u64 affinity)
  41 {
  42         char buff[N_CORES + 1];
  43         char buff2[N_CORES + 1];
  44         base_jd_core_req core_req = katom->atom->core_req;
  45         u64 shader_present_bitmap = kbdev->shader_present_bitmap;
  46
  47         debug_get_binary_string(shader_present_bitmap, buff, N_CORES);
  48         debug_get_binary_string(affinity, buff2, N_CORES);
  49
  50         KBASE_DEBUG_PRINT_INFO(KBASE_JM, "Job: COH FS  CS   T  CF   V  JS | GPU:12345678 | AFF:12345678");
  51         KBASE_DEBUG_PRINT_INFO(KBASE_JM, "      %s   %s   %s   %s   %s   %s   %u |     %s |     %s", core_req & BASE_JD_REQ_COHERENT_GROUP ? "*" : "-", core_req & BASE_JD_REQ_FS ? "*" : "-", core_req & BASE_JD_REQ_CS ? "*" : "-", core_req & BASE_JD_REQ_T ? "*" : "-", core_req & BASE_JD_REQ_CF ? "*" : "-", core_req & BASE_JD_REQ_V ? "*" : "-", js, buff, buff2);
  52 }
  53
  54 #endif                          /* CONFIG_MALI_DEBUG */
  55
  56 STATIC INLINE mali_bool affinity_job_uses_high_cores(kbase_device *kbdev, kbase_jd_atom *katom)
  57 {
  58         if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_8987)) {
  59                 kbase_context *kctx;
  60                 kbase_context_flags ctx_flags;
  61
  62                 kctx = katom->kctx;
  63                 ctx_flags = kctx->jctx.sched_info.ctx.flags;
  64
  65                 /* In this HW Workaround, compute-only jobs/contexts use the high cores
  66                  * during a core-split, all other contexts use the low cores. */
  67                 return (mali_bool) ((katom->core_req & BASE_JD_REQ_ONLY_COMPUTE) != 0 || (ctx_flags & KBASE_CTX_FLAG_HINT_ONLY_COMPUTE) != 0);
  68         }
  69         return MALI_FALSE;
  70 }
  71
  72 /**
  73  * @brief Decide whether a split in core affinity is required across job slots
  74  *
  75  * The following locking conditions are made on the caller:
  76  * - it must hold kbasep_js_device_data::runpool_irq::lock
  77  *
  78  * @param kbdev The kbase device structure of the device
  79  * @return MALI_FALSE if a core split is not required
  80  * @return != MALI_FALSE if a core split is required.
  81  */
  82 STATIC INLINE mali_bool kbase_affinity_requires_split(kbase_device *kbdev)
  83 {
  84         KBASE_DEBUG_ASSERT(kbdev != NULL);
  85         lockdep_assert_held(&kbdev->js_data.runpool_irq.lock);
  86
  87         if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_8987)) {
  88                 s8 nr_compute_ctxs = kbasep_js_ctx_attr_count_on_runpool(kbdev, KBASEP_JS_CTX_ATTR_COMPUTE);
  89                 s8 nr_noncompute_ctxs = kbasep_js_ctx_attr_count_on_runpool(kbdev, KBASEP_JS_CTX_ATTR_NON_COMPUTE);
  90
  91                 /* In this case, a mix of Compute+Non-Compute determines whether a
  92                  * core-split is required, to ensure jobs with different numbers of RMUs
  93                  * don't use the same cores.
  94                  *
  95                  * When it's entirely compute, or entirely non-compute, then no split is
  96                  * required.
  97                  *
  98                  * A context can be both Compute and Non-compute, in which case this will
  99                  * correctly decide that a core-split is required. */
 100
 101                 return (mali_bool) (nr_compute_ctxs > 0 && nr_noncompute_ctxs > 0);
 102         }
 103         return MALI_FALSE;
 104 }
 105
 106 mali_bool kbase_js_can_run_job_on_slot_no_lock(kbase_device *kbdev, int js)
 107 {
 108         /*
 109          * Here are the reasons for using job slot 2:
 110          * - BASE_HW_ISSUE_8987 (which is entirely used for that purpose)
 111          * - In absence of the above, then:
 112          *  - Atoms with BASE_JD_REQ_COHERENT_GROUP
 113          *  - But, only when there aren't contexts with
 114          *  KBASEP_JS_CTX_ATTR_COMPUTE_ALL_CORES, because the atoms that run on
 115          *  all cores on slot 1 could be blocked by those using a coherent group
 116          *  on slot 2
 117          *  - And, only when you actually have 2 or more coregroups - if you only
 118          *  have 1 coregroup, then having jobs for slot 2 implies they'd also be
 119          *  for slot 1, meaning you'll get interference from them. Jobs able to
 120          *  run on slot 2 could also block jobs that can only run on slot 1
 121          *  (tiler jobs)
 122          */
 123         if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_8987))
 124                 return MALI_TRUE;
 125
 126         if (js != 2)
 127                 return MALI_TRUE;
 128
 129         /* Only deal with js==2 now: */
 130         if (kbdev->gpu_props.num_core_groups > 1) {
 131                 /* Only use slot 2 in the 2+ coregroup case */
 132                 if (kbasep_js_ctx_attr_is_attr_on_runpool(kbdev, KBASEP_JS_CTX_ATTR_COMPUTE_ALL_CORES) == MALI_FALSE) {
 133                         /* ...But only when we *don't* have atoms that run on all cores */
 134
 135                         /* No specific check for BASE_JD_REQ_COHERENT_GROUP atoms - the policy will sort that out */
 136                         return MALI_TRUE;
 137                 }
 138         }
 139
 140         /* Above checks failed mean we shouldn't use slot 2 */
 141         return MALI_FALSE;
 142 }
 143
 144 #ifdef AFFINITY_MASK_ENABLE
 145 static u64 affinity_mask = 0xFFFF;
 146 static spinlock_t affinity_mask_spinlock;
 147 static int init_once;
 148 void kbase_js_set_affinity_mask(u64 mask)
 149 {
 150         unsigned long flags;
 151         if (!init_once) {
 152                 spin_lock_init(&affinity_mask_spinlock);
 153                 init_once = 1;
 154         }
 155         if (mask) {
 156                 spin_lock_irqsave(&affinity_mask_spinlock, flags);
 157                 affinity_mask = mask;
 158                 spin_unlock_irqrestore(&affinity_mask_spinlock, flags);
 159                 pr_info("affinity mask is set to 0x%x.\n", (u32)mask);
 160         } else {
 161                 pr_info("Invalid mask!!!\n");
 162         }
 163         return;
 164 }
 165
 166 u64 kbase_js_get_affinity_mask(void)
 167 {
 168         u64 val;
 169         unsigned long flags;
 170         if (!init_once) {
 171                 spin_lock_init(&affinity_mask_spinlock);
 172                 init_once = 1;
 173         }
 174         spin_lock_irqsave(&affinity_mask_spinlock, flags);
 175         val = affinity_mask;
 176         spin_unlock_irqrestore(&affinity_mask_spinlock, flags);
 177
 178         return val;
 179 }
 180 #endif
 181
 182
 183 /*
 184  * As long as it has been decided to have a deeper modification of
 185  * what job scheduler, power manager and affinity manager will
 186  * implement, this function is just an intermediate step that
 187  * assumes:
 188  * - all working cores will be powered on when this is called.
 189  * - largest current configuration is a T658 (2x4 cores).
 190  * - It has been decided not to have hardcoded values so the low
 191  *   and high cores in a core split will be evently distributed.
 192  * - Odd combinations of core requirements have been filtered out
 193  *   and do not get to this function (e.g. CS+T+NSS is not
 194  *   supported here).
 195  * - This function is frequently called and can be optimized,
 196  *   (see notes in loops), but as the functionallity will likely
 197  *   be modified, optimization has not been addressed.
 198 */
 199 mali_bool kbase_js_choose_affinity(u64 * const affinity, kbase_device *kbdev, kbase_jd_atom *katom, int js)
 200 {
 201         base_jd_core_req core_req = katom->core_req;
 202         unsigned int num_core_groups = kbdev->gpu_props.num_core_groups;
 203         u64 core_availability_mask;
 204         unsigned long flags;
 205
 206         spin_lock_irqsave(&kbdev->pm.power_change_lock, flags);
 207
 208         core_availability_mask = kbase_pm_ca_get_core_mask(kbdev);
 209
 210         /*
 211          * If no cores are currently available (core availability policy is
 212          * transitioning) then fail.
 213          */
 214         if (0 == core_availability_mask)
 215         {
 216                 spin_unlock_irqrestore(&kbdev->pm.power_change_lock, flags);
 217                 *affinity = 0;
 218                 return MALI_FALSE;
 219         }
 220
 221         KBASE_DEBUG_ASSERT(js >= 0);
 222
 223         if ((core_req & (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T)) == BASE_JD_REQ_T)
 224         {
 225                 spin_unlock_irqrestore(&kbdev->pm.power_change_lock, flags);
 226                 /* Tiler only job, bit 0 needed to enable tiler but no shader cores required */
 227                 *affinity = 1;
 228                 return MALI_TRUE;
 229         }
 230
 231         if (1 == kbdev->gpu_props.num_cores) {
 232                 /* trivial case only one core, nothing to do */
 233                 *affinity = core_availability_mask;
 234         } else if (kbase_affinity_requires_split(kbdev) == MALI_FALSE) {
 235                 if ((core_req & (BASE_JD_REQ_COHERENT_GROUP | BASE_JD_REQ_SPECIFIC_COHERENT_GROUP))) {
 236                         if (js == 0 || num_core_groups == 1) {
 237                                 /* js[0] and single-core-group systems just get the first core group */
 238                                 *affinity = kbdev->gpu_props.props.coherency_info.group[0].core_mask & core_availability_mask;
 239                         } else {
 240                                 /* js[1], js[2] use core groups 0, 1 for dual-core-group systems */
 241                                 u32 core_group_idx = ((u32) js) - 1;
 242                                 KBASE_DEBUG_ASSERT(core_group_idx < num_core_groups);
 243                                 *affinity = kbdev->gpu_props.props.coherency_info.group[core_group_idx].core_mask & core_availability_mask;
 244
 245                                 /* If the job is specifically targeting core group 1 and the core
 246                                  * availability policy is keeping that core group off, then fail */
 247                                 if (*affinity == 0 && core_group_idx == 1 && kbdev->pm.cg1_disabled == MALI_TRUE)
 248                                         katom->event_code = BASE_JD_EVENT_PM_EVENT;
 249                         }
 250                 } else {
 251                         /* All cores are available when no core split is required */
 252                         *affinity = core_availability_mask;
 253                 }
 254         } else {
 255                 /* Core split required - divide cores in two non-overlapping groups */
 256                 u64 low_bitmap, high_bitmap;
 257                 int n_high_cores = kbdev->gpu_props.num_cores >> 1;
 258                 KBASE_DEBUG_ASSERT(1 == num_core_groups);
 259                 KBASE_DEBUG_ASSERT(0 != n_high_cores);
 260
 261                 /* compute the reserved high cores bitmap */
 262                 high_bitmap = ~0;
 263                 /* note: this can take a while, optimization desirable */
 264                 while (n_high_cores != hweight32(high_bitmap & kbdev->shader_present_bitmap))
 265                         high_bitmap = high_bitmap << 1;
 266
 267                 high_bitmap &= core_availability_mask;
 268                 low_bitmap = core_availability_mask ^ high_bitmap;
 269
 270                 if (affinity_job_uses_high_cores(kbdev, katom))
 271                         *affinity = high_bitmap;
 272                 else
 273                         *affinity = low_bitmap;
 274         }
 275
 276         spin_unlock_irqrestore(&kbdev->pm.power_change_lock, flags);
 277
 278         /*
 279          * If no cores are currently available in the desired core group(s)
 280          * (core availability policy is transitioning) then fail.
 281          */
 282         if (*affinity == 0)
 283                 return MALI_FALSE;
 284
 285         /* Enable core 0 if tiler required */
 286         if (core_req & BASE_JD_REQ_T)
 287                 *affinity = *affinity | 1;
 288
 289         return MALI_TRUE;
 290 }
 291
 292 STATIC INLINE mali_bool kbase_js_affinity_is_violating(kbase_device *kbdev, u64 *affinities)
 293 {
 294         /* This implementation checks whether the two slots involved in Generic thread creation
 295          * have intersecting affinity. This is due to micro-architectural issues where a job in
 296          * slot A targetting cores used by slot B could prevent the job in slot B from making
 297          * progress until the job in slot A has completed.
 298          *
 299          * @note It just so happens that this restriction also allows
 300          * BASE_HW_ISSUE_8987 to be worked around by placing on job slot 2 the
 301          * atoms from ctxs with KBASE_CTX_FLAG_HINT_ONLY_COMPUTE flag set
 302          */
 303         u64 affinity_set_left;
 304         u64 affinity_set_right;
 305         u64 intersection;
 306         KBASE_DEBUG_ASSERT(affinities != NULL);
 307
 308         affinity_set_left = affinities[1];
 309
 310         if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_8987)) {
 311                 /* The left set also includes those on the Fragment slot when
 312                  * we are using the HW workaround for BASE_HW_ISSUE_8987 */
 313                 affinity_set_left |= affinities[0];
 314         }
 315
 316         affinity_set_right = affinities[2];
 317
 318         /* A violation occurs when any bit in the left_set is also in the right_set */
 319         intersection = affinity_set_left & affinity_set_right;
 320
 321         return (mali_bool) (intersection != (u64) 0u);
 322 }
 323
 324 mali_bool kbase_js_affinity_would_violate(kbase_device *kbdev, int js, u64 affinity)
 325 {
 326         kbasep_js_device_data *js_devdata;
 327         u64 new_affinities[BASE_JM_MAX_NR_SLOTS];
 328
 329         KBASE_DEBUG_ASSERT(kbdev != NULL);
 330         KBASE_DEBUG_ASSERT(js < BASE_JM_MAX_NR_SLOTS);
 331         js_devdata = &kbdev->js_data;
 332
 333         memcpy(new_affinities, js_devdata->runpool_irq.slot_affinities, sizeof(js_devdata->runpool_irq.slot_affinities));
 334
 335         new_affinities[js] |= affinity;
 336
 337         return kbase_js_affinity_is_violating(kbdev, new_affinities);
 338 }
 339
 340 void kbase_js_affinity_retain_slot_cores(kbase_device *kbdev, int js, u64 affinity)
 341 {
 342         kbasep_js_device_data *js_devdata;
 343         u64 cores;
 344
 345         KBASE_DEBUG_ASSERT(kbdev != NULL);
 346         KBASE_DEBUG_ASSERT(js < BASE_JM_MAX_NR_SLOTS);
 347         js_devdata = &kbdev->js_data;
 348
 349         KBASE_DEBUG_ASSERT(kbase_js_affinity_would_violate(kbdev, js, affinity) == MALI_FALSE);
 350
 351         cores = affinity;
 352         while (cores) {
 353                 int bitnum = fls64(cores) - 1;
 354                 u64 bit = 1ULL << bitnum;
 355                 s8 cnt;
 356
 357                 KBASE_DEBUG_ASSERT(js_devdata->runpool_irq.slot_affinity_refcount[js][bitnum] < BASE_JM_SUBMIT_SLOTS);
 358
 359                 cnt = ++(js_devdata->runpool_irq.slot_affinity_refcount[js][bitnum]);
 360
 361                 if (cnt == 1)
 362                         js_devdata->runpool_irq.slot_affinities[js] |= bit;
 363
 364                 cores &= ~bit;
 365         }
 366
 367 }
 368
 369 void kbase_js_affinity_release_slot_cores(kbase_device *kbdev, int js, u64 affinity)
 370 {
 371         kbasep_js_device_data *js_devdata;
 372         u64 cores;
 373
 374         KBASE_DEBUG_ASSERT(kbdev != NULL);
 375         KBASE_DEBUG_ASSERT(js < BASE_JM_MAX_NR_SLOTS);
 376         js_devdata = &kbdev->js_data;
 377
 378         cores = affinity;
 379         while (cores) {
 380                 int bitnum = fls64(cores) - 1;
 381                 u64 bit = 1ULL << bitnum;
 382                 s8 cnt;
 383
 384                 KBASE_DEBUG_ASSERT(js_devdata->runpool_irq.slot_affinity_refcount[js][bitnum] > 0);
 385
 386                 cnt = --(js_devdata->runpool_irq.slot_affinity_refcount[js][bitnum]);
 387
 388                 if (0 == cnt)
 389                         js_devdata->runpool_irq.slot_affinities[js] &= ~bit;
 390
 391                 cores &= ~bit;
 392         }
 393
 394 }
 395
 396 void kbase_js_affinity_slot_blocked_an_atom(kbase_device *kbdev, int js)
 397 {
 398         kbasep_js_device_data *js_devdata;
 399
 400         KBASE_DEBUG_ASSERT(kbdev != NULL);
 401         KBASE_DEBUG_ASSERT(js < BASE_JM_MAX_NR_SLOTS);
 402         js_devdata = &kbdev->js_data;
 403
 404         js_devdata->runpool_irq.slots_blocked_on_affinity |= 1u << js;
 405 }
 406
 407 void kbase_js_affinity_submit_to_blocked_slots(kbase_device *kbdev)
 408 {
 409         kbasep_js_device_data *js_devdata;
 410         u16 slots;
 411
 412         KBASE_DEBUG_ASSERT(kbdev != NULL);
 413         js_devdata = &kbdev->js_data;
 414
 415         KBASE_DEBUG_ASSERT(js_devdata->nr_user_contexts_running != 0);
 416
 417         /* Must take a copy because submitting jobs will update this member. */
 418         slots = js_devdata->runpool_irq.slots_blocked_on_affinity;
 419
 420         while (slots) {
 421                 int bitnum = fls(slots) - 1;
 422                 u16 bit = 1u << bitnum;
 423                 slots &= ~bit;
 424
 425                 KBASE_TRACE_ADD_SLOT(kbdev, JS_AFFINITY_SUBMIT_TO_BLOCKED, NULL, NULL, 0u, bitnum);
 426
 427                 /* must update this before we submit, incase it's set again */
 428                 js_devdata->runpool_irq.slots_blocked_on_affinity &= ~bit;
 429
 430                 kbasep_js_try_run_next_job_on_slot_nolock(kbdev, bitnum);
 431
 432                 /* Don't re-read slots_blocked_on_affinity after this - it could loop for a long time */
 433         }
 434 }
 435
 436 #if KBASE_TRACE_ENABLE != 0
 437 void kbase_js_debug_log_current_affinities(kbase_device *kbdev)
 438 {
 439         kbasep_js_device_data *js_devdata;
 440         int slot_nr;
 441
 442         KBASE_DEBUG_ASSERT(kbdev != NULL);
 443         js_devdata = &kbdev->js_data;
 444
 445         for (slot_nr = 0; slot_nr < 3; ++slot_nr)
 446                 KBASE_TRACE_ADD_SLOT_INFO(kbdev, JS_AFFINITY_CURRENT, NULL, NULL, 0u, slot_nr, (u32) js_devdata->runpool_irq.slot_affinities[slot_nr]);
 447 }
 448 #endif                          /* KBASE_TRACE_ENABLE != 0 */