From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: AS17314 8.43.84.0/22 X-Spam-Status: No, score=-4.2 required=3.0 tests=AWL,BAYES_00,DKIM_SIGNED, DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,MAILING_LIST_MULTI, MSGID_FROM_MTA_HEADER,RCVD_IN_DNSWL_HI,SPF_HELO_PASS,SPF_PASS, UNPARSEABLE_RELAY shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from sourceware.org (server2.sourceware.org [8.43.85.97]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256) (No client certificate requested) by dcvr.yhbt.net (Postfix) with ESMTPS id D185E1F8C6 for ; Tue, 10 Aug 2021 09:45:57 +0000 (UTC) Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 05EBD395382B for ; Tue, 10 Aug 2021 09:45:57 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 05EBD395382B DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1628588757; bh=x5XkVGd+EAFBDn5q2tDjhZuF+SUSDLGnflhAVhyXOVw=; h=Date:To:Subject:References:In-Reply-To:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To:Cc: From; b=d4qIk2J9oyahY7ZLPHB+AJxT2zvDXHVcAsavHhme9ot7Yv+MCKO9Dx7bNsxSloz5H hQz73IAnNqO1J2NFOrahFt97o5D5TXl3gVVLmIrnABOkn1SknsxcS3OlL8BSWm9MR4 gSdjZMfKc/pxjybsUEyxh6tOhHTzvfnt+nPjLEPs= Received: from EUR01-VE1-obe.outbound.protection.outlook.com (mail-eopbgr140043.outbound.protection.outlook.com [40.107.14.43]) by sourceware.org (Postfix) with ESMTPS id 804A23953C37 for ; Tue, 10 Aug 2021 09:44:40 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 804A23953C37 Received: from DB7PR03CA0077.eurprd03.prod.outlook.com (2603:10a6:10:72::18) by AM7PR08MB5301.eurprd08.prod.outlook.com (2603:10a6:20b:dd::14) with Microsoft SMTP Server (version=TLS1_2, 
cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4394.21; Tue, 10 Aug 2021 09:44:38 +0000 Received: from DB5EUR03FT045.eop-EUR03.prod.protection.outlook.com (2603:10a6:10:72:cafe::63) by DB7PR03CA0077.outlook.office365.com (2603:10a6:10:72::18) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4415.13 via Frontend Transport; Tue, 10 Aug 2021 09:44:38 +0000 X-MS-Exchange-Authentication-Results: spf=pass (sender IP is 63.35.35.123) smtp.mailfrom=arm.com; sourceware.org; dkim=pass (signature was verified) header.d=armh.onmicrosoft.com;sourceware.org; dmarc=pass action=none header.from=arm.com; Received-SPF: Pass (protection.outlook.com: domain of arm.com designates 63.35.35.123 as permitted sender) receiver=protection.outlook.com; client-ip=63.35.35.123; helo=64aa7808-outbound-1.mta.getcheckrecipient.com; Received: from 64aa7808-outbound-1.mta.getcheckrecipient.com (63.35.35.123) by DB5EUR03FT045.mail.protection.outlook.com (10.152.21.164) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4394.16 via Frontend Transport; Tue, 10 Aug 2021 09:44:38 +0000 Received: ("Tessian outbound efa8a7456a86:v101"); Tue, 10 Aug 2021 09:44:38 +0000 X-CheckRecipientChecked: true X-CR-MTA-CID: 42b271e0dbe3f9e4 X-CR-MTA-TID: 64aa7808 Received: from 3a96d47ecb6c.2 by 64aa7808-outbound-1.mta.getcheckrecipient.com id 7F81C20A-07BA-425F-86C8-C5784ADA2065.1; Tue, 10 Aug 2021 09:44:32 +0000 Received: from EUR02-VE1-obe.outbound.protection.outlook.com by 64aa7808-outbound-1.mta.getcheckrecipient.com with ESMTPS id 3a96d47ecb6c.2 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-GCM-SHA384); Tue, 10 Aug 2021 09:44:32 +0000 ARC-Seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; 
b=Dcgv08iBX+bEdHdsp9yZYIKZgX9dT/sdnl0g4j80Mijn0GE7VsAHerOP7uL55WbsUkFu3lG26qcsYdiTbCzHF78lwrgqE24ExBWzy0FeStu/J9lplcO6begV/Lb8PRzSKGvJCCgARwagtc6qWcGzD9K44T7Q37QM/B8XkcWcfGdwWEKDfyMxnVEDxpuFjNvh2i5KKDIpcUqBBcjSXD7Takz0lwaElr9u+vP6jUXuwjZ7OVwf7j7+qygHucHdxRpxOIc8hZUJTRIwhhgAxc0yl9pioPEncYm95HjGlKCML8xj9jpDMbLHMZMoGvjupUUfw+QzCMyLPHtuTTIBznefbg== ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=x5XkVGd+EAFBDn5q2tDjhZuF+SUSDLGnflhAVhyXOVw=; b=W3afO34engL18TsC8HZ8V6kjh+9srVhSgizaHW3XhRD6RT+VrqGydL1anmnOTdKSZTsO3VV+slAcyzOdny0LribthBB60axsspw+lZCI/rYPGxwUDw3ZSTh2Ual2QlOc8GK1v+oyBzOP8ZgXLXfjSH5+u8fXefsubb+S3dOveAy0zow56BMUKpTHuZj9mR4bAR8bksZBqYcX1e1CXhJeSPLSzUQULbYfIKMEZ95NKC+k/Hag6uGt51zwvwMAb3z3OQ9m/jIVw8mEMcFlwVxYxS9hX9o6x+Vo1wSLNlApDSLI+vPt5ZPUVS13RXtrE0F7CPXLJOLhIpdP7uQuizxLmg== ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none Authentication-Results-Original: arm.com; dkim=none (message not signed) header.d=none;arm.com; dmarc=none action=none header.from=arm.com; Received: from PA4PR08MB6320.eurprd08.prod.outlook.com (2603:10a6:102:e5::9) by PAXPR08MB6445.eurprd08.prod.outlook.com (2603:10a6:102:159::13) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4394.15; Tue, 10 Aug 2021 09:44:31 +0000 Received: from PA4PR08MB6320.eurprd08.prod.outlook.com ([fe80::cd22:a583:c97c:72a6]) by PA4PR08MB6320.eurprd08.prod.outlook.com ([fe80::cd22:a583:c97c:72a6%7]) with mapi id 15.20.4415.013; Tue, 10 Aug 2021 09:44:31 +0000 Date: Tue, 10 Aug 2021 10:44:29 +0100 To: Wilco Dijkstra Subject: Re: [PATCH v4 5/5] AArch64: Improve A64FX memset medium loops Message-ID: <20210810094428.GG20410@arm.com> References: Content-Type: text/plain; charset=utf-8 Content-Disposition: inline 
In-Reply-To: User-Agent: Mutt/1.9.4 (2018-02-28) X-ClientProxiedBy: LO2P265CA0254.GBRP265.PROD.OUTLOOK.COM (2603:10a6:600:8a::26) To PA4PR08MB6320.eurprd08.prod.outlook.com (2603:10a6:102:e5::9) MIME-Version: 1.0 X-MS-Exchange-MessageSentRepresentingType: 1 Received: from arm.com (217.140.106.49) by LO2P265CA0254.GBRP265.PROD.OUTLOOK.COM (2603:10a6:600:8a::26) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.20.4394.16 via Frontend Transport; Tue, 10 Aug 2021 09:44:30 +0000 X-MS-PublicTrafficType: Email X-MS-Office365-Filtering-Correlation-Id: 40dc973e-dd7c-48e0-0e50-08d95be3787a X-MS-TrafficTypeDiagnostic: PAXPR08MB6445:|AM7PR08MB5301: X-MS-Exchange-Transport-Forked: True X-Microsoft-Antispam-PRVS: x-checkrecipientrouted: true NoDisclaimer: true X-MS-Oob-TLC-OOBClassifiers: OLM:962;OLM:962; X-MS-Exchange-SenderADCheck: 1 X-MS-Exchange-AntiSpam-Relay: 0 X-Microsoft-Antispam-Untrusted: BCL:0; X-Microsoft-Antispam-Message-Info-Original: M/qHEST+rNG4cClw/ALpiq1GxG8cuM9tctw+QxMdx4FAg6GhBHaf7Q/sD1spNXl1Y9hw4WKMlQJdFv+pse8WJEV7ptnlESJ5bRwWVX4VzAyPkTwA8/2hik3vRtm4syVfS0rzfyX16i5SzqY15XzcjXVV4ewVqhx0WNE9rtaLwueVc3/GIzHkmytNkHVYWG6Qyg29jhJtNuya2qFofe06Egt3grvQ5K10+HLHXys/5rO7s4eOM9DqdKvuIRF7HrVrhcAcq6hq3zHrMGoStU0iXNXziuBgdvEeFaAQecGiCHLly/2Z7aTnqYz58CoVeLp6CU6kk5gEmaiLlDi7NuYEwqdnYXSuRv0j+s0XnlzeyRSXkE5dIIJla8+nqzvdrkBB/AcQC6Phf9SDtxUQftDWnXK90C04OH57VF5Ln8vNJGKpu78RXg6EgDQ9z5j/ldhsdeJhjjsjbheLR3L5dSFSUbyo9TLgXdl65aWByjBv8o5uP7SCyxiP4kcMqMSiOVP0eMzenTSYOfCip/mmQ2p7ivQHav0hxCpSfwDptbgfCNVVT7k1fEDKdSjlCehrBCfav7JuiGGIAV4c88Wa/1amblXNFQ0bBm1o3x9MvxBkBqiQGrfrtfSCYfrp3s9DAch3XPqE9ELUGz9IIicEy7aa5HWH7TRV+xxSp9RCKEjXlrlEF8++UUJtJswB145iEEBSbVZC2zWg7a+mEbceXBlAHYo/V+x08UIwmTsk83u4yAP6DRt9vFD626YmuC/MXKII X-Forefront-Antispam-Report-Untrusted: CIP:255.255.255.255; CTRY:; LANG:en; SCL:1; SRV:; IPV:NLI; SFV:NSPM; H:PA4PR08MB6320.eurprd08.prod.outlook.com; PTR:; CAT:NONE; 
SFS:(4636009)(346002)(39850400004)(396003)(136003)(376002)(366004)(4326008)(478600001)(66476007)(36756003)(6862004)(66946007)(6636002)(66556008)(26005)(2906002)(54906003)(7696005)(52116002)(44832011)(37006003)(86362001)(316002)(2616005)(956004)(1076003)(33656002)(186003)(8676002)(55016002)(5660300002)(38100700002)(8886007)(38350700002)(8936002)(473944003)(357404004); DIR:OUT; SFP:1101; X-MS-Exchange-AntiSpam-MessageData-ChunkCount: 1 X-MS-Exchange-AntiSpam-MessageData-0: =?utf-8?B?Y0s2RHd5TDBidnYwcFd6VUJuZm9SRzRxdGhISWRRSndYcVJ2RjNiY2lqeEpa?= =?utf-8?B?NW1zWjRtVFZFSHc4RjRqTWFVeHY2UzE4anNyZTBsV0tzYytoV2NsNHlUSVU2?= =?utf-8?B?eWdhbkozQ2hXZ2tjbDFnOTRnUlVxOHg1amJYay9TQXgrR3dTcXc5QmRydFJP?= =?utf-8?B?VkZpQTJUYjVSbmZmOFNSYzZVTmJwTlJxVWR0TEVGd05lMkZhaWFvSHdOb1Q3?= =?utf-8?B?aDRDcFJIT0FsVDdodDEvaG8zRUFBZE9hTldhZmJmdjJvMG9EY2h2MUtPR1FT?= =?utf-8?B?aWltUzNlTDJDZTdHMGZ4Y2FPT000U01Edjkyb3pUVURheUtpSzROclVyYW1M?= =?utf-8?B?MEZnNXE2eXVlMDZjNkRMV2lSZTNIbmdIRUNPTE5TQmU1eWxIZnJrWGRMdTBl?= =?utf-8?B?V2NmazMzWk84aWlHZzl5WnJRNXhpRWhEcFdNbUV1OThDT2t0UWsxdHFBMVBK?= =?utf-8?B?dW9EYW5HRDdhU254SEdGVVE3VlFRRkZBOU5pSnJUcFppUW0zNTNQU2NYS2V1?= =?utf-8?B?WXZKcWl0aGdoeEh3WHFGOUNXcjhZeVVBMW13UDB0ZGRWOVFtTm85aDNLd1JK?= =?utf-8?B?Q0R4clNKOEhockNKRCsxVjhubWIydW1ITW5RRmV4dURRM2pkMEs2Y2xmbE9D?= =?utf-8?B?bHNDVG9XbFA3bW1MY2FGL0t5dzE5VDlESkpjL3lveTY3SmJ3YytjNjhkbFV3?= =?utf-8?B?WW00eXBNazNRN1RSK2UyKzVXOTByTEgxTUhXdll3anpvUjlERTN3SERhWXZP?= =?utf-8?B?U2hEZmp0RjBCQWtaM005aTBRVDRRZks4VG9TRXdsZ0h5RXNOcDR3L212Vnpw?= =?utf-8?B?SzlHR1l6T0pkejR1dUNwVS9kNTI3N3hZOXk0bndsdldSY1V2Q0tsVElQcVpF?= =?utf-8?B?NG9xYmtRTm51R0RtZlRpREw0T1pSeW1vd0lZZmdZNU82M3FLV0ZkbGRQc2dR?= =?utf-8?B?anlsdEkrZ0IzcUJVdWs3MTd3NmladDdYMFE2Wlh6OEdRamN5T0NjdGZOQzVq?= =?utf-8?B?M3hLVlJUNXQ5azdhNkl6bjM4NmUwSkdodlRSSitydWZsOUhRMHhNcW91R2xK?= =?utf-8?B?NVoyUnJNOGpwVVprY016ekhoVTFOTXg1YWRDUUU0bkI1TVFDcWJOL0dqaGpS?= =?utf-8?B?emN2eTAvVTBmL0JIYStkTzFTdURybUYrK3VSZFJCdytERDJWSXJVTnJSV0Fo?= 
=?utf-8?B?dzlUeFEyQkRUMmZhczJhQXAzM2NPd3FLbjFJTjR4YU9hN2ovRUh5NFlrYnhV?= =?utf-8?B?OXZmdlRDRlZFQkpzQ042RDRnZWZKaHZCN3cxNGxzR3ZWajM0U3R3MHF3WFR3?= =?utf-8?B?RldrdkpmZW1rekFYenRoZU5CT21IUUJKNk55ZE43cHVNWWtuNVBWYjk5YWhr?= =?utf-8?B?eE5DR2JXRitjcGR5amk5bDBMRjZ4aGFuSGFXdlQ4MEFmMW9ZenRVazZZei92?= =?utf-8?B?dFltYVFpR1FvL1Y3L1gySUMvMzE2SUdwUTNyeGFwWEJ3U3NNUjBtaHdYQWJF?= =?utf-8?B?MGhSOGpJakFEWWg0OGJUSERWaUJFdG8xb3F4UnVaeC9DRjB1aG1Pc1V1K0ow?= =?utf-8?B?ajIzUjUwS0NtRUhXZ3F4ZDV2dWNnT3ZPb2pLZXpnbWVxeVVjWDh1VVZ0eWtN?= =?utf-8?B?eElRV0hDSmpvaGdzMkQvUnVFNkpmVnIxdjVTT0h4ZjkzRzFjaUYvWWVwVVA5?= =?utf-8?B?UDlIL0VvNXBCcjBMMmNQY2FyRU56dG1kTkJYcmlWU0lXeWdvVHBrK3lJZ1A4?= =?utf-8?Q?E2aWvwmStP/ewFC+AMQz8ikE2lyu7uSc+54O0Cv?= X-MS-Exchange-Transport-CrossTenantHeadersStamped: PAXPR08MB6445 Original-Authentication-Results: arm.com; dkim=none (message not signed) header.d=none;arm.com; dmarc=none action=none header.from=arm.com; X-EOPAttributedMessage: 0 X-MS-Exchange-Transport-CrossTenantHeadersStripped: DB5EUR03FT045.eop-EUR03.prod.protection.outlook.com X-MS-Office365-Filtering-Correlation-Id-Prvs: 954e81a2-427c-41e4-8bfd-08d95be373d1 X-Microsoft-Antispam: BCL:0; X-Microsoft-Antispam-Message-Info: ZEPxdcREqgSvWR1PGjPOq28WscBcYepVwY94DF1zAgCNbFxoR7IIm6mToXu/LkhRnfTchCBw9RhXD7ub+bwQXV0hlzI2dvAoKlgVimHKWHBIS+7BhWnZ0sWnX8Awq/iPxeKasRD6xIERufO0N1NElsOAu6UKTngK7aY9uAgfsJdg8fuM1CWzYvC8hxVomBXrbp1IDA2BqCto/XNVOgJj4+ScrtVkwTVrxec6oVlOGSJJ7TVC6LSnaCF6RzliIlQzwC7B6JJfnLS10QyElzzcCOOs+8wxhaxeKL0EYaH8+yGGaKDTZfvXfvlEKtSIH7vjrdPkfA0Vobl8MjXBVr5dmwcVFZTFDIKcoMYlEzFCaNId3ioEeJ3YT91JAigC60JQQOZFoCOEJAragvQfbJdBTRI2zicolo7bXVMeCFvIseiZ8jyuZyGMXBTMx74cQYOfZR50xEw7auSzMwJiSFMbJDGcNncBcx5BiX8dPVBgnchK0hXXQZi6A0C1Www/Lo0txrk/WygwLeKr+qK5gZSzj2FwuL8ZV2CV2JJngNZ+37cQQuepN+r5t0y+t0w5VVmwuF+yGw3P7UrHIzxSz7WjKLcdKZvldWat3DXBMe0PnUA1+fRnJ9vewnGymmnQRfxNMdbgYPHRlgHfMAc981FLS+qiaD4al39eGO26sETUqNhrdeP/uvHDI2eNLyXTUkhz/LF8aSoe5EkqqxorQhlD+euUt3PJL0L1t0oeKgNfCzfxVFsylmD1iWx2W+wxKzxG X-Forefront-Antispam-Report: 
CIP:63.35.35.123; CTRY:IE; LANG:en; SCL:1; SRV:; IPV:CAL; SFV:NSPM; H:64aa7808-outbound-1.mta.getcheckrecipient.com; PTR:ec2-63-35-35-123.eu-west-1.compute.amazonaws.com; CAT:NONE; SFS:(4636009)(346002)(136003)(376002)(39850400004)(396003)(36840700001)(46966006)(8676002)(26005)(36860700001)(36756003)(44832011)(186003)(316002)(2906002)(7696005)(55016002)(356005)(82310400003)(81166007)(70586007)(70206006)(478600001)(6862004)(37006003)(1076003)(82740400003)(336012)(47076005)(8936002)(33656002)(956004)(2616005)(6636002)(4326008)(54906003)(86362001)(5660300002)(8886007)(473944003)(357404004); DIR:OUT; SFP:1101; X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 10 Aug 2021 09:44:38.6504 (UTC) X-MS-Exchange-CrossTenant-Network-Message-Id: 40dc973e-dd7c-48e0-0e50-08d95be3787a X-MS-Exchange-CrossTenant-Id: f34e5979-57d9-4aaa-ad4d-b122a662184d X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp: TenantId=f34e5979-57d9-4aaa-ad4d-b122a662184d; Ip=[63.35.35.123]; Helo=[64aa7808-outbound-1.mta.getcheckrecipient.com] X-MS-Exchange-CrossTenant-AuthSource: DB5EUR03FT045.eop-EUR03.prod.protection.outlook.com X-MS-Exchange-CrossTenant-AuthAs: Anonymous X-MS-Exchange-CrossTenant-FromEntityHeader: HybridOnPrem X-MS-Exchange-Transport-CrossTenantHeadersStamped: AM7PR08MB5301 X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , From: Szabolcs Nagy via Libc-alpha Reply-To: Szabolcs Nagy Cc: 'GNU C Library' Errors-To: libc-alpha-bounces+e=80x24.org@sourceware.org Sender: "Libc-alpha" The 08/09/2021 13:15, Wilco Dijkstra via Libc-alpha wrote: > v4: minor loop change > > Simplify the code for memsets smaller than L1. Improve the unroll8 and L1_prefetch loops. OK to commit, but keep Reviewed-by: Naohiro Tamura (further tweaks can go into follow-up commits.) 
> > --- > > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S > index 89dba912588c243e67a9527a56b4d3a44659d542..318c6350a31e0fad788b5f2139de645ddc51493f 100644 > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S > @@ -30,7 +30,6 @@ > #define L2_SIZE (8*1024*1024) // L2 8MB > #define CACHE_LINE_SIZE 256 > #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1 > -#define rest x2 > #define vector_length x9 > > #if HAVE_AARCH64_SVE_ASM > @@ -89,29 +88,19 @@ ENTRY (MEMSET) > > .p2align 4 > L(vl_agnostic): // VL Agnostic > - mov rest, count > mov dst, dstin > - add dstend, dstin, count > - // if rest >= L2_SIZE && vector_length == 64 then L(L2) > - mov tmp1, 64 > - cmp rest, L2_SIZE > - ccmp vector_length, tmp1, 0, cs > - b.eq L(L2) > - // if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch) > - cmp rest, L1_SIZE > - ccmp vector_length, tmp1, 0, cs > - b.eq L(L1_prefetch) > - > + cmp count, L1_SIZE > + b.hi L(L1_prefetch) > > + // count >= 8 * vector_length > L(unroll8): > - lsl tmp1, vector_length, 3 > - .p2align 3 > -1: cmp rest, tmp1 > - b.cc L(last) > - st1b_unroll > + sub count, count, tmp1 > + .p2align 4 > +1: st1b_unroll 0, 7 > add dst, dst, tmp1 > - sub rest, rest, tmp1 > - b 1b > + subs count, count, tmp1 > + b.hi 1b > + add count, count, tmp1 > > L(last): > cmp count, vector_length, lsl 1 > @@ -129,18 +118,22 @@ L(last): > st1b z0.b, p0, [dstend, -1, mul vl] > ret > > -L(L1_prefetch): // if rest >= L1_SIZE > + // count >= L1_SIZE > .p2align 3 > +L(L1_prefetch): > + cmp count, L2_SIZE > + b.hs L(L2) > + cmp vector_length, 64 > + b.ne L(unroll8) > 1: st1b_unroll 0, 3 > prfm pstl1keep, [dst, PF_DIST_L1] > st1b_unroll 4, 7 > prfm pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE] > add dst, dst, CACHE_LINE_SIZE * 2 > - sub rest, rest, CACHE_LINE_SIZE * 2 > - cmp rest, L1_SIZE > - b.ge 1b > - cbnz rest, L(unroll8) > - ret > + sub count, count, CACHE_LINE_SIZE 
* 2 > + cmp count, PF_DIST_L1 > + b.hs 1b > + b L(unroll8) > > // count >= L2_SIZE > .p2align 3 > --