diff options
| author | Paolo Bonzini <pbonzini@redhat.com> | 2026-03-27 22:30:44 +0100 |
|---|---|---|
| committer | Paolo Bonzini <pbonzini@redhat.com> | 2026-03-27 22:30:44 +0100 |
| commit | 6c6ba5489586b6458980fc988736d7fb1be44f30 (patch) | |
| tree | fed1a9144a9a62b819a8a7d7ac7716b66dc37f47 | |
| parent | 0138af2472dfdef0d56fc4697416eaa0ff2589bd (diff) | |
| parent | 0a28e06575b3f3b30c1e99fc08fa0907956f35a4 (diff) | |
Merge tag 'kvm-s390-master-7.0-2' of https://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD
KVM: s390: More memory management fixes
Lots of small and not-so-small fixes for the newly rewritten gmap,
mostly affecting the handling of nested guests.
| -rw-r--r-- | arch/s390/kvm/dat.c | 100 | ||||
| -rw-r--r-- | arch/s390/kvm/dat.h | 23 | ||||
| -rw-r--r-- | arch/s390/kvm/gaccess.c | 71 | ||||
| -rw-r--r-- | arch/s390/kvm/gmap.c | 160 | ||||
| -rw-r--r-- | arch/s390/kvm/gmap.h | 33 | ||||
| -rw-r--r-- | arch/s390/kvm/kvm-s390.c | 18 | ||||
| -rw-r--r-- | arch/s390/kvm/vsie.c | 4 |
7 files changed, 231 insertions, 178 deletions
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index 670404d4fa44..7b8d70fe406d 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -135,32 +135,6 @@ int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newt } /** - * dat_crstep_xchg() - Exchange a gmap CRSTE with another. - * @crstep: Pointer to the CRST entry - * @new: Replacement entry. - * @gfn: The affected guest address. - * @asce: The ASCE of the address space. - * - * Context: This function is assumed to be called with kvm->mmu_lock held. - */ -void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce) -{ - if (crstep->h.i) { - WRITE_ONCE(*crstep, new); - return; - } else if (cpu_has_edat2()) { - crdte_crste(crstep, *crstep, new, gfn, asce); - return; - } - - if (machine_has_tlb_guest()) - idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL); - else - idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL); - WRITE_ONCE(*crstep, new); -} - -/** * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another. * @crstep: Pointer to the CRST entry. * @old: Expected old value. @@ -175,8 +149,8 @@ void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce * * Return: %true if the exchange was successful. */ -bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn, - union asce asce) +bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, + gfn_t gfn, union asce asce) { if (old.h.i) return arch_try_cmpxchg((long *)crstep, &old.val, new.val); @@ -292,6 +266,7 @@ static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t g pt->ptes[i].val = init.val | i * PAGE_SIZE; /* No need to take locks as the page table is not installed yet. */ pgste_init.prefix_notif = old.s.fc1.prefix_notif; + pgste_init.vsie_notif = old.s.fc1.vsie_notif; pgste_init.pcl = uses_skeys && init.h.i; dat_init_pgstes(pt, pgste_init.val); } else { @@ -893,7 +868,8 @@ static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct d /* This table entry needs to be updated. */ if (walk->start <= gfn && walk->end >= next) { - dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce); + if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce)) + return -EINVAL; /* A lower level table was present, needs to be freed. */ if (!crste.h.fc && !crste.h.i) { if (is_pmd(crste)) @@ -1021,67 +997,21 @@ bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end) return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0; } -int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level, - bool uses_skeys, struct guest_fault *f) -{ - union crste oldval, newval; - union pte newpte, oldpte; - union pgste pgste; - int rc = 0; - - rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep); - if (rc == -EINVAL || rc == -ENOMEM) - return rc; - if (rc) - return -EAGAIN; - - if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level))) - return -EINVAL; - - if (f->ptep) { - pgste = pgste_get_lock(f->ptep); - oldpte = *f->ptep; - newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page); - newpte.s.sd = oldpte.s.sd; - oldpte.s.sd = 0; - if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) { - pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys); - if (f->callback) - f->callback(f); - } else { - rc = -EAGAIN; - } - pgste_set_unlock(f->ptep, pgste); - } else { - oldval = READ_ONCE(*f->crstep); - newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable, - f->write_attempt | oldval.s.fc1.d); - newval.s.fc1.sd = oldval.s.fc1.sd; - if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val && - crste_origin_large(oldval) != crste_origin_large(newval)) - return -EAGAIN; - if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce)) - return -EAGAIN; - if (f->callback) - f->callback(f); - } - - return rc; -} - static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) { - union crste crste = READ_ONCE(*crstep); + union crste newcrste, oldcrste; int *n = walk->priv; - if (!crste.h.fc || crste.h.i || crste.h.p) - return 0; - + do { + oldcrste = READ_ONCE(*crstep); + if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p) + return 0; + if (oldcrste.s.fc1.prefix_notif) + break; + newcrste = oldcrste; + newcrste.s.fc1.prefix_notif = 1; + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce)); *n = 2; - if (crste.s.fc1.prefix_notif) - return 0; - crste.s.fc1.prefix_notif = 1; - dat_crstep_xchg(crstep, crste, gfn, walk->asce); return 0; } diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h index 123e11dcd70d..874cc962e196 100644 --- a/arch/s390/kvm/dat.h +++ b/arch/s390/kvm/dat.h @@ -160,14 +160,14 @@ union pmd { unsigned long :44; /* HW */ unsigned long : 3; /* Unused */ unsigned long : 1; /* HW */ + unsigned long s : 1; /* Special */ unsigned long w : 1; /* Writable soft-bit */ unsigned long r : 1; /* Readable soft-bit */ unsigned long d : 1; /* Dirty */ unsigned long y : 1; /* Young */ - unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ unsigned long : 3; /* HW */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ unsigned long vsie_notif : 1; /* Referenced in a shadow table */ - unsigned long : 1; /* Unused */ unsigned long : 4; /* HW */ unsigned long sd : 1; /* Soft-Dirty */ unsigned long pr : 1; /* Present */ @@ -183,14 +183,14 @@ union pud { unsigned long :33; /* HW */ unsigned long :14; /* Unused */ unsigned long : 1; /* HW */ + unsigned long s : 1; /* Special */ unsigned long w : 1; /* Writable soft-bit */ unsigned long r : 1; /* Readable soft-bit */ unsigned long d : 1; /* Dirty */ unsigned long y : 1; /* Young */ - unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ unsigned long : 3; /* HW */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ unsigned long vsie_notif : 1; /* Referenced in a shadow table */ - unsigned long : 1; /* Unused */ unsigned long : 4; /* HW */ unsigned long sd : 1; /* Soft-Dirty */ unsigned long pr : 1; /* Present */ @@ -254,14 +254,14 @@ union crste { struct { unsigned long :47; unsigned long : 1; /* HW (should be 0) */ + unsigned long s : 1; /* Special */ unsigned long w : 1; /* Writable */ unsigned long r : 1; /* Readable */ unsigned long d : 1; /* Dirty */ unsigned long y : 1; /* Young */ - unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ unsigned long : 3; /* HW */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ unsigned long vsie_notif : 1; /* Referenced in a shadow table */ - unsigned long : 1; unsigned long : 4; /* HW */ unsigned long sd : 1; /* Soft-Dirty */ unsigned long pr : 1; /* Present */ @@ -540,8 +540,6 @@ int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gf u16 type, u16 param); int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn); bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end); -int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level, - bool uses_skeys, struct guest_fault *f); int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty); long dat_reset_cmma(union asce asce, gfn_t start_gfn); @@ -938,11 +936,14 @@ static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pu return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce); } -static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce) +static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce) { - union crste newcrste = _CRSTE_EMPTY(crstep->h.tt); + union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt); - dat_crstep_xchg(crstep, newcrste, gfn, asce); + do { + oldcrste = READ_ONCE(*crstep); + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce)); + return oldcrste; } static inline int get_level(union crste *crstep, union pte *ptep) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index a9da9390867d..53a8550e7102 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1436,13 +1436,21 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union if (!pgste_get_trylock(ptep_h, &pgste)) return -EAGAIN; - newpte = _pte(f->pfn, f->writable, !p, 0); - newpte.s.d |= ptep->s.d; - newpte.s.sd |= ptep->s.sd; - newpte.h.p &= ptep->h.p; - pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false); - pgste.vsie_notif = 1; + newpte = _pte(f->pfn, f->writable, !p, ptep_h->s.s); + newpte.s.d |= ptep_h->s.d; + newpte.s.sd |= ptep_h->s.sd; + newpte.h.p &= ptep_h->h.p; + if (!newpte.h.p && !f->writable) { + rc = -EOPNOTSUPP; + } else { + pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false); + pgste.vsie_notif = 1; + } pgste_set_unlock(ptep_h, pgste); + if (rc) + return rc; + if (!sg->parent) + return -EAGAIN; newpte = _pte(f->pfn, 0, !p, 0); if (!pgste_get_trylock(ptep, &pgste)) @@ -1456,7 +1464,7 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table, struct guest_fault *f, bool p) { - union crste newcrste; + union crste newcrste, oldcrste; gfn_t gfn; int rc; @@ -1469,16 +1477,28 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni if (rc) return rc; - newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p); - newcrste.s.fc1.d |= host->s.fc1.d; - newcrste.s.fc1.sd |= host->s.fc1.sd; - newcrste.h.p &= host->h.p; - newcrste.s.fc1.vsie_notif = 1; - newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif; - _gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false); + do { + /* _gmap_crstep_xchg_atomic() could have unshadowed this shadow gmap */ + if (!sg->parent) + return -EAGAIN; + oldcrste = READ_ONCE(*host); + newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p); + newcrste.s.fc1.d |= oldcrste.s.fc1.d; + newcrste.s.fc1.sd |= oldcrste.s.fc1.sd; + newcrste.h.p &= oldcrste.h.p; + newcrste.s.fc1.vsie_notif = 1; + newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif; + newcrste.s.fc1.s = oldcrste.s.fc1.s; + if (!newcrste.h.p && !f->writable) + return -EOPNOTSUPP; + } while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false)); + if (!sg->parent) + return -EAGAIN; - newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p); - dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce); + newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p); + gfn = gpa_to_gfn(raddr); + while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce)) + ; return 0; } @@ -1502,21 +1522,31 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg, if (rc) return rc; - /* A race occourred. The shadow mapping is already valid, nothing to do */ - if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table))) + /* A race occurred. The shadow mapping is already valid, nothing to do */ + if ((ptep && !ptep->h.i && ptep->h.p == w->p) || + (!ptep && crste_leaf(*table) && !table->h.i && table->h.p == w->p)) return 0; gl = get_level(table, ptep); + /* In case of a real address space */ + if (w->level <= LEVEL_MEM) { + l = TABLE_TYPE_PAGE_TABLE; + hl = TABLE_TYPE_REGION1; + goto real_address_space; + } + /* * Skip levels that are already protected. For each level, protect * only the page containing the entry, not the whole table. */ for (i = gl ; i >= w->level; i--) { - rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr), - entries[i - 1].pfn, i, entries[i - 1].writable); + rc = gmap_protect_rmap(mc, sg, entries[i].gfn, gpa_to_gfn(saddr), + entries[i].pfn, i + 1, entries[i].writable); if (rc) return rc; + if (!sg->parent) + return -EAGAIN; } rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF, @@ -1528,6 +1558,7 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg, /* Get the smallest granularity */ l = min3(gl, hl, w->level); +real_address_space: flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0); /* If necessary, create the shadow mapping */ if (l < gl) { diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index ef0c6ebfdde2..645c32c767d2 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -313,13 +313,16 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st struct clear_young_pte_priv *priv = walk->priv; union crste crste, new; - crste = READ_ONCE(*crstep); + do { + crste = READ_ONCE(*crstep); + + if (!crste.h.fc) + return 0; + if (!crste.s.fc1.y && crste.h.i) + return 0; + if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end)) + break; - if (!crste.h.fc) - return 0; - if (!crste.s.fc1.y && crste.h.i) - return 0; - if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) { new = crste; new.h.i = 1; new.s.fc1.y = 0; @@ -328,8 +331,8 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st folio_set_dirty(phys_to_folio(crste_origin_large(crste))); new.s.fc1.d = 0; new.h.p = 1; - dat_crstep_xchg(crstep, new, gfn, walk->asce); - } + } while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce)); + priv->young = 1; return 0; } @@ -391,14 +394,18 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct { struct gmap_unmap_priv *priv = walk->priv; struct folio *folio = NULL; + union crste old = *crstep; - if (crstep->h.fc) { - if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) - folio = phys_to_folio(crste_origin_large(*crstep)); - gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn); - if (folio) - uv_convert_from_secure_folio(folio); - } + if (!old.h.fc) + return 0; + + if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) + folio = phys_to_folio(crste_origin_large(old)); + /* No races should happen because kvm->mmu_lock is held in write mode */ + KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn), + priv->gmap->kvm); + if (folio) + uv_convert_from_secure_folio(folio); return 0; } @@ -474,23 +481,24 @@ static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t if (fatal_signal_pending(current)) return 1; - crste = READ_ONCE(*table); - if (!crste.h.fc) - return 0; - if (crste.h.p && !crste.s.fc1.sd) - return 0; + do { + crste = READ_ONCE(*table); + if (!crste.h.fc) + return 0; + if (crste.h.p && !crste.s.fc1.sd) + return 0; - /* - * If this large page contains one or more prefixes of vCPUs that are - * currently running, do not reset the protection, leave it marked as - * dirty. - */ - if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) { + /* + * If this large page contains one or more prefixes of vCPUs that are + * currently running, do not reset the protection, leave it marked as + * dirty. + */ + if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end)) + break; new = crste; new.h.p = 1; new.s.fc1.sd = 0; - gmap_crstep_xchg(gmap, table, new, gfn); - } + } while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn)); for ( ; gfn < end; gfn++) mark_page_dirty(gmap->kvm, gfn); @@ -511,7 +519,7 @@ void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end) _dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap); } -static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f) +static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f) { union crste newcrste, oldcrste = READ_ONCE(*f->crstep); @@ -536,10 +544,8 @@ static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f) newcrste.s.fc1.d = 1; newcrste.s.fc1.sd = 1; } - if (!oldcrste.s.fc1.d && newcrste.s.fc1.d) - SetPageDirty(phys_to_page(crste_origin_large(newcrste))); /* In case of races, let the slow path deal with it. */ - return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce); + return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn); } /* Trying to write on a read-only page, let the slow path deal with it. */ return 1; @@ -568,8 +574,6 @@ static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste, newpte.s.d = 1; newpte.s.sd = 1; } - if (!oldpte.s.d && newpte.s.d) - SetPageDirty(pfn_to_page(newpte.h.pfra)); *pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn); return 0; @@ -606,7 +610,7 @@ int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault) fault->callback(fault); pgste_set_unlock(fault->ptep, pgste); } else { - rc = gmap_handle_minor_crste_fault(gmap->asce, fault); + rc = gmap_handle_minor_crste_fault(gmap, fault); if (!rc && fault->callback) fault->callback(fault); } @@ -623,10 +627,61 @@ static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn) return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags); } +static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level, + struct guest_fault *f) +{ + union crste oldval, newval; + union pte newpte, oldpte; + union pgste pgste; + int rc = 0; + + rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level, + &f->crstep, &f->ptep); + if (rc == -ENOMEM) + return rc; + if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm)) + return rc; + if (rc) + return -EAGAIN; + if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm)) + return -EINVAL; + + if (f->ptep) { + pgste = pgste_get_lock(f->ptep); + oldpte = *f->ptep; + newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page); + newpte.s.sd = oldpte.s.sd; + oldpte.s.sd = 0; + if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) { + pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn); + if (f->callback) + f->callback(f); + } else { + rc = -EAGAIN; + } + pgste_set_unlock(f->ptep, pgste); + } else { + do { + oldval = READ_ONCE(*f->crstep); + newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable, + f->write_attempt | oldval.s.fc1.d); + newval.s.fc1.s = !f->page; + newval.s.fc1.sd = oldval.s.fc1.sd; + if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val && + crste_origin_large(oldval) != crste_origin_large(newval)) + return -EAGAIN; + } while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn)); + if (f->callback) + f->callback(f); + } + + return rc; +} + int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f) { unsigned int order; - int rc, level; + int level; lockdep_assert_held(&gmap->kvm->mmu_lock); @@ -638,16 +693,14 @@ int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fau else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn)) level = TABLE_TYPE_SEGMENT; } - rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f); - KVM_BUG_ON(rc == -EINVAL, gmap->kvm); - return rc; + return _gmap_link(mc, gmap, level, f); } static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, bool force_alloc) { + union crste newcrste, oldcrste; struct page_table *pt; - union crste newcrste; union crste *crstep; union pte *ptep; int rc; @@ -673,7 +726,11 @@ static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, &crstep, &ptep); if (rc) return rc; - dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce); + do { + oldcrste = READ_ONCE(*crstep); + if (oldcrste.val == newcrste.val) + break; + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce)); return 0; } @@ -777,8 +834,10 @@ static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn) int rc; rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep); - if (!rc) - dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce); + if (rc) + return; + while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce)) + ; } void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count) @@ -1017,8 +1076,8 @@ static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level) dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg)); return; } - crste = READ_ONCE(*crstep); - dat_crstep_clear(crstep, r_gfn, sg->asce); + + crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce); if (crste_leaf(crste) || crste.h.i) return; if (is_pmd(crste)) @@ -1101,6 +1160,7 @@ struct gmap_protect_asce_top_level { static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, struct gmap_protect_asce_top_level *context) { + struct gmap *parent; int rc, i; guard(write_lock)(&sg->kvm->mmu_lock); @@ -1108,7 +1168,12 @@ static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, s if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f)) return -EAGAIN; - scoped_guard(spinlock, &sg->parent->children_lock) { + parent = READ_ONCE(sg->parent); + if (!parent) + return -EAGAIN; + scoped_guard(spinlock, &parent->children_lock) { + if (READ_ONCE(sg->parent) != parent) + return -EAGAIN; for (i = 0; i < CRST_TABLE_PAGES; i++) { if (!context->f[i].valid) continue; @@ -1191,6 +1256,9 @@ struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *pare struct gmap *sg, *new; int rc; + if (WARN_ON(!parent)) + return ERR_PTR(-EINVAL); + scoped_guard(spinlock, &parent->children_lock) { sg = gmap_find_shadow(parent, asce, edat_level); if (sg) { diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index ccb5cd751e31..579399ef5480 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -185,6 +185,8 @@ static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, un else _gmap_handle_vsie_unshadow_event(gmap, gfn); } + if (!ptep->s.d && newpte.s.d && !newpte.s.s) + SetPageDirty(pfn_to_page(newpte.h.pfra)); return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap)); } @@ -194,35 +196,42 @@ static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, uni return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true); } -static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne, - gfn_t gfn, bool needs_lock) +static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep, + union crste oldcrste, union crste newcrste, + gfn_t gfn, bool needs_lock) { - unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11); + unsigned long align = is_pmd(newcrste) ? _PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES; + + if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm)) + return true; lockdep_assert_held(&gmap->kvm->mmu_lock); if (!needs_lock) lockdep_assert_held(&gmap->children_lock); gfn = ALIGN_DOWN(gfn, align); - if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) { - ne.s.fc1.prefix_notif = 0; + if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) { + newcrste.s.fc1.prefix_notif = 0; gmap_unmap_prefix(gmap, gfn, gfn + align); } - if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif && - (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) { - ne.s.fc1.vsie_notif = 0; + if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif && + (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) { + newcrste.s.fc1.vsie_notif = 0; if (needs_lock) gmap_handle_vsie_unshadow_event(gmap, gfn); else _gmap_handle_vsie_unshadow_event(gmap, gfn); } - dat_crstep_xchg(crstep, ne, gfn, gmap->asce); + if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s) + SetPageDirty(phys_to_page(crste_origin_large(newcrste))); + return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce); } -static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne, - gfn_t gfn) +static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep, + union crste oldcrste, union crste newcrste, + gfn_t gfn) { - return _gmap_crstep_xchg(gmap, crstep, ne, gfn, true); + return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true); } /** diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b2c01fa7b852..d7838334a338 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5520,9 +5520,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #endif case KVM_S390_VCPU_FAULT: { - idx = srcu_read_lock(&vcpu->kvm->srcu); - r = vcpu_dat_fault_handler(vcpu, arg, 0); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + gpa_t gaddr = arg; + + scoped_guard(srcu, &vcpu->kvm->srcu) { + r = vcpu_ucontrol_translate(vcpu, &gaddr); + if (r) + break; + + r = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(gaddr), false); + if (r == PGM_ADDRESSING) + r = -EFAULT; + if (r <= 0) + break; + r = -EIO; + KVM_BUG_ON(r, vcpu->kvm); + } break; } case KVM_ENABLE_CAP: diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 0330829b4046..72895dddc39a 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1328,7 +1328,7 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu) static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - struct gmap *sg; + struct gmap *sg = NULL; int rc = 0; while (1) { @@ -1368,6 +1368,8 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) sg = gmap_put(sg); cond_resched(); } + if (sg) + sg = gmap_put(sg); if (rc == -EFAULT) { /* |
