mirror of https://github.com/sunface/rust-course
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
401 lines
69 KiB
401 lines
69 KiB
3 days ago
<html lang="zh-CN" class="light" dir="ltr">
<!-- Book generated using mdBook -->
<meta charset="UTF-8">
<title>CPU 缓存性能优化 todo - Rust语言圣经(Rust Course)</title>
<!-- Custom HTML head -->
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#ffffff">
<link rel="icon" href="../../favicon.svg">
<link rel="shortcut icon" href="../../favicon.png">
<link rel="stylesheet" href="../../css/variables.css">
<link rel="stylesheet" href="../../css/general.css">
<link rel="stylesheet" href="../../css/chrome.css">
<link rel="stylesheet" href="../../css/print.css" media="print">
<!-- Fonts -->
<link rel="stylesheet" href="../../FontAwesome/css/font-awesome.css">
<link rel="stylesheet" href="../../fonts/fonts.css">
<!-- Highlight.js Stylesheets -->
<link rel="stylesheet" href="../../highlight.css">
<link rel="stylesheet" href="../../tomorrow-night.css">
<link rel="stylesheet" href="../../ayu-highlight.css">
<!-- Custom theme stylesheets -->
<link rel="stylesheet" href="../../theme/style.css">
<body class="sidebar-visible no-js">
<div id="body-container">
<!-- Provide site root to javascript -->
var path_to_root = "../../";
var default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches ? "navy" : "light";
<!-- Work around some values being stored in localStorage wrapped in quotes -->
try {
var theme = localStorage.getItem('mdbook-theme');
var sidebar = localStorage.getItem('mdbook-sidebar');
if (theme.startsWith('"') && theme.endsWith('"')) {
localStorage.setItem('mdbook-theme', theme.slice(1, theme.length - 1));
if (sidebar.startsWith('"') && sidebar.endsWith('"')) {
localStorage.setItem('mdbook-sidebar', sidebar.slice(1, sidebar.length - 1));
} catch (e) { }
<!-- Set the theme before any content is loaded, prevents flash -->
var theme;
try { theme = localStorage.getItem('mdbook-theme'); } catch(e) { }
if (theme === null || theme === undefined) { theme = default_theme; }
var html = document.querySelector('html');
var body = document.querySelector('body');
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
<!-- Hide / unhide sidebar before it is displayed -->
var body = document.querySelector('body');
var sidebar = null;
var sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
if (document.body.clientWidth >= 1080) {
try { sidebar = localStorage.getItem('mdbook-sidebar'); } catch(e) { }
sidebar = sidebar || 'visible';
} else {
sidebar = 'hidden';
sidebar_toggle.checked = sidebar === 'visible';
body.classList.add("sidebar-" + sidebar);
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
<div class="sidebar-scrollbox">
<ol class="chapter"><li class="chapter-item affix "><a href="../../about-book.html">关于本书</a></li><li class="chapter-item affix "><a href="../../into-rust.html">进入 Rust 编程世界</a></li><li class="chapter-item affix "><a href="../../first-try/sth-you-should-not-do.html">避免从入门到放弃</a></li><li class="chapter-item affix "><a href="../../community.html">社区和锈书</a></li><li class="spacer"></li><li class="chapter-item affix "><a href="../../some-thoughts.html">Xobserve: 一切皆可观测</a></li><li class="chapter-item affix "><a href="../../beat-ai.html">BeatAI: 工程师 AI 入门圣经</a></li><li class="chapter-item affix "><li class="part-title">Rust 语言基础学习</li><li class="spacer"></li><li class="chapter-item "><a href="../../first-try/intro.html"><strong aria-hidden="true">1.</strong> 寻找牛刀,以便小试</a><a class="toggle"><div>❱</div></a></li><li><ol class="section"><li class="chapter-item "><a href="../../first-try/installation.html"><strong aria-hidden="true">1.1.</strong> 安装 Rust 环境</a></li><li class="chapter-item "><a href="../../first-try/editor.html"><strong aria-hidden="true">1.2.</strong> 墙推 VSCode!</a></li><li class="chapter-item "><a href="../../first-try/cargo.html"><strong aria-hidden="true">1.3.</strong> 认识 Cargo</a></li><li class="chapter-item "><a href="../../first-try/hello-world.html"><strong aria-hidden="true">1.4.</strong> 不仅仅是 Hello world</a></li><li class="chapter-item "><a href="../../first-try/slowly-downloading.html"><strong aria-hidden="true">1.5.</strong> 下载依赖太慢了?</a></li></ol></li><li class="chapter-item "><a href="../../basic/intro.html"><strong aria-hidden="true">2.</strong> Rust 基础入门</a><a class="toggle"><div>❱</div></a></li><li><ol class="section"><li class="chapter-item "><a href="../../basic/variable.html"><strong aria-hidden="true">2.1.</strong> 变量绑定与解构</a></li><li class="chapter-item "><a href="../../basic/base-type/index.html"><strong aria-hidden="true">2.2.</strong> 基本类型</a><a class="toggle"><div>❱</div></a></li><li><ol class="section"><li class="chapter-item "><a 
href="../../basic/base-type/numbers.html"><strong aria-hidden="true">2.2.1.</strong> 数值类型</a></li><li class="chapter-item "><a href="../../basic/base-type/char-bool.html"><strong aria-hidden="true">2.2.2.</strong> 字符、布尔、单元类型</a></li><li class="chapter-item "><a href="../../basic/base-type/statement-expression.html"><strong aria-hidden="true">2.2.3.</strong> 语句与表达式</a></li><li class="chapter-item "><a href="../../basic/base-type/function.html"><strong aria-hidden="true">2.2.4.</strong> 函数</a></li></ol></li><li class="chapter-item "><a href="../../basic/ownership/index.html"><strong aria-hidden="true">2.3.</strong> 所有权和借用</a><a class="toggle"><div>❱</div></a></li><li><ol class="section"><li class="chapter-item "><a href="../../basic/ownership/ownership.html"><strong aria-hidden="true">2.3.1.</strong> 所有权</a></li><li class="chapter-item "><a href="../../basic/ownership/borrowing.html"><strong aria-hidden="true">2.3.2.</strong> 引用与借用</a></li></ol></li><li class="chapter-item "><a href="../../basic/compound-type/intro.html"><strong aria-hidden="true">2.4.</strong> 复合类型</a><a class="toggle"><div>❱</div></a></li><li><ol class="section"><li class="chapter-item "><a href="../../basic/compound-type/string-slice.html"><strong aria-hidden="true">2.4.1.</strong> 字符串与切片</a></li><li class="chapter-item "><a href="../../basic/compound-type/tuple.html"><strong aria-hidden="true">2.4.2.</strong> 元组</a></li><li class="chapter-item "><a href="../../basic/compound-type/struct.html"><strong aria-hidden="true">2.4.3.</strong> 结构体</a></li><li class="chapter-item "><a href="../../basic/compound-type/enum.html"><strong aria-hidden="true">2.4.4.</strong> 枚举</a></li><li class="chapter-item "><a href="../../basic/compound-type/array.html"><strong aria-hidden="true">2.4.5.</strong> 数组</a></l
<div id="sidebar-resize-handle" class="sidebar-resize-handle">
<div class="sidebar-resize-indicator"></div>
<!-- Track and set sidebar scroll position -->
var sidebarScrollbox = document.querySelector('#sidebar .sidebar-scrollbox');
sidebarScrollbox.addEventListener('click', function(e) {
if (e.target.tagName === 'A') {
sessionStorage.setItem('sidebar-scroll', sidebarScrollbox.scrollTop);
}, { passive: true });
var sidebarScrollTop = sessionStorage.getItem('sidebar-scroll');
if (sidebarScrollTop) {
// preserve sidebar scroll position when navigating via links within sidebar
sidebarScrollbox.scrollTop = sidebarScrollTop;
} else {
// scroll sidebar to current active section when navigating via "next/previous chapter" buttons
var activeSection = document.querySelector('#sidebar .active');
if (activeSection) {
activeSection.scrollIntoView({ block: 'center' });
<div id="page-wrapper" class="page-wrapper">
<div class="page">
<div id="menu-bar-hover-placeholder"></div>
<div id="menu-bar" class="menu-bar sticky">
<div class="left-buttons">
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
<i class="fa fa-bars"></i>
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
<i class="fa fa-paint-brush"></i>
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
<button id="search-toggle" class="icon-button" type="button" title="Search. (Shortkey: s)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="S" aria-controls="searchbar">
<i class="fa fa-search"></i>
<h1 class="menu-title">Rust语言圣经(Rust Course)</h1>
<div class="right-buttons">
<a href="../../print.html" title="Print this book" aria-label="Print this book">
<i id="print-button" class="fa fa-print"></i>
<a href="https://github.com/sunface/rust-course" title="Git repository" aria-label="Git repository">
<i id="git-repository-button" class="fa fa-github"></i>
<a href="https://github.com/sunface/rust-course/edit/main/src/profiling/performance/cpu-cache.md" title="Suggest an edit" aria-label="Suggest an edit">
<i id="git-edit-button" class="fa fa-edit"></i>
<div id="search-wrapper" class="hidden">
<form id="searchbar-outer" class="searchbar-outer">
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
<div id="searchresults-outer" class="searchresults-outer hidden">
<div id="searchresults-header" class="searchresults-header"></div>
<ul id="searchresults">
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebar === 'visible');
document.getElementById('sidebar').setAttribute('aria-hidden', sidebar !== 'visible');
Array.from(document.querySelectorAll('#sidebar a')).forEach(function(link) {
link.setAttribute('tabIndex', sidebar === 'visible' ? 0 : -1);
<div id="content" class="content">
<!-- Page table of contents -->
<div class="sidetoc"><nav class="pagetoc"></nav></div>
<h1 id="cpu缓存性能优化"><a class="header" href="#cpu缓存性能优化">CPU缓存性能优化</a></h1>
<h1 id="on-a-use-of-the-repr-attribute-in-rust"><a class="header" href="#on-a-use-of-the-repr-attribute-in-rust">On a use of the "repr" attribute in Rust</a></h1>
<p>Suppose we are working with the following struct, which represents a counter,</p>
<pre><pre class="playground"><code class="language-rust edition2021"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>struct Counter(u64);
<span class="boring">}</span></code></pre></pre>
<p>and we want to increment it with random <code>u8</code> values with the help of a for loop : </p>
<pre><pre class="playground"><code class="language-rust edition2021">use rand::Rng;
fn main() {
let mut counter = Counter(0);
let mut rng = rand::thread_rng();
for _ in 0..1_000_000 {
counter.0 += rng.gen::<u8>() as u64;
<p>This takes 1.90ms to run on my laptop using <code>cargo run --release</code>. Remember this timing, as it will be our reference value :)
Now suppose we are given this struct instead, holding not one but two counters:</p>
<pre><pre class="playground"><code class="language-rust edition2021"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>struct Counters {
c1 : u64,
c2 : u64
<span class="boring">}</span></code></pre></pre>
<p>Using the same approach, performing the increments for the 2 counters in a single-threaded fashion, we would expect it to be twice as slow (in fact it takes 3.71ms to execute).
Can we do better? Well, as our 2 counters are independent, we could spawn 2 threads, assign one counter to each, and increment them concurrently! Given that I have 4 CPUs on my laptop, I would expect this to be just as fast as the first scenario. Let's see!</p>
<p>First, we could create a local variable in each thread, increment it, and then set the counter to the resulting value (spoiler: good idea). But we could also do without these 2 variables and share the <code>Counters</code> struct between the 2 threads with an <code>Arc</code> (spoiler: definitely not worth it). Let's try this second option! ^^</p>
<p>With the following code,</p>
<pre><pre class="playground"><code class="language-rust edition2021">fn main() {
let counters = Arc::new(Counters{c1:0, c2:0});
let counters_clone = counters.clone();
let handler1 = thread::spawn(move || {
let mut rng = rand::thread_rng();
for _ in 0..1_000_000 {
counters.c1 += rng.gen::<u8>() as u64;
let handler2 = thread::spawn(move || {
let mut rng = rand::thread_rng();
for _ in 0..1_000_000 {
counters_clone.c2 += rng.gen::<u8>() as u64;
handler1.join(); handler2.join();
<p>we end up with an error : </p>
<p><strong>cannot assign to data in an <code>Arc</code></strong>
<strong>cannot assign</strong>
<strong>help: trait <code>DerefMut</code> is required to modify through a dereference, but it is not implemented for <code>std::sync::Arc&lt;Counters&gt;</code> rustc(E0594)</strong></p>
<p>Unlucky. Maybe we could use <strong>atomic types</strong>. These types provide operations that synchronize updates between threads. In fact, as an equivalent of <code>+=</code> we can use the <code>fetch_add</code> method, which has the following signature: <code>pub fn fetch_add(&amp;self, val: u64, order: Ordering) -&gt; u64</code>. What should be highlighted is the <code>&amp;self</code>. We might expect a <code>&amp;mut self</code> given the modification we want to perform, but because an atomic operation is performed without interruption, we don't need exclusive access to the variable to update it safely.
We can fix the error by changing the counters' type to <code>AtomicU64</code>: that way we only need <code>Arc</code> to implement the <code>Deref</code> trait (given the signature of <code>fetch_add</code>), which it does!</p>
<p>So we have to change our struct a bit, to:</p>
<pre><pre class="playground"><code class="language-rust edition2021"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>struct Counters {
c1 : AtomicU64,
c2 : AtomicU64,
<span class="boring">}</span></code></pre></pre>
<p>and our code to :</p>
<pre><pre class="playground"><code class="language-rust edition2021">fn main() {
let counters = Arc::new(Counters{
c1 : AtomicU64::new(0),
c2 : AtomicU64::new(0)
let counters_clone = counters.clone();
let handler1 = thread::spawn(move || {
let mut rng = rand::thread_rng();
for _ in 0..1_000_000 {
counters.c1.fetch_add(rng.gen::<u8>() as u64,Relaxed);
let handler2 = thread::spawn(move || {
let mut rng = rand::thread_rng();
for _ in 0..1_000_000 {
counters_clone.c2.fetch_add(rng.gen::<u8>() as u64,Relaxed);
<p>We would naturally expect operations on atomics to be a bit slower than those on <code>u64</code>, but let's see!
30.22ms... OK... that's terrible ^^
Do atomic operations explain all of this?
I ran a benchmark comparing <code>+=</code> and <code>fetch_add( ,Relaxed)</code> to find out:</p>
<pre><pre class="playground"><code class="language-rust edition2021"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>let mut sum = 0;
let start = Instant::now();
for _ in 0..10_000_000 {
sum += rng.gen::<u8>() as u64;
println!("time spent u64 sum : {:?}", start.elapsed());
let atomic_sum = AtomicU64::new(0);
let start = Instant::now();
for _ in 0..10_000_000 {
atomic_sum.fetch_add(rng.gen::<u8>() as u64, Relaxed);
println!("time spent AtomicU64 sum : {:?}", start.elapsed());
<span class="boring">}</span></code></pre></pre>
<p>The <code>u64</code> sum takes 20.07ms while the <code>AtomicU64</code> one takes 70.28ms. So we should only be about 3.5 times slower than our 2ms reference (around 7ms), but we are 15 times slower. How can that be?</p>
<p>Hint: the CPU cache... but why should we care?
A CPU cache is data storage located close to the CPU, offering fast access to data.
When the CPU needs to read or write a value, it first checks whether that value is present in the cache. If it is, the CPU uses the cached data directly. Otherwise, the cache allocates a new entry and copies the data from main memory; an entry has a fixed size and is called a <em>cache line</em>.
The CPU cache is relatively small compared to RAM but much faster, which is why a program should be designed to use data already lying in the cache as much as possible, following the principle of locality, to avoid expensive accesses to RAM.</p>
<p>If we represent our current situation, it looks like this:
<img src="https://github.com/TC5027/blog/blob/master/pngs/false_sharing.png?raw=true" alt="Diagram of false sharing: the two counters lying in the same cache line"></p>
<p>The red square corresponds to the first counter and the green one to the second. They can potentially lie in the same cache line !</p>
<p>If data is modified by CPU 0 in its L1 cache, we expect the computer to reflect the change both in memory and in the other L1 cache. To ensure this coherency, there exist coherence protocols which can force the <strong>whole cache line</strong> affected by the change to be propagated through the whole system, in order to update the copies of the changed value.</p>
<p>With that in mind, here is what is happening in our code: we suffer from the coherence protocol because our 2 counters lie on the same cache line. Updating the first counter through CPU 0 triggers a system-wide update of the cache line where the (unchanged) second counter potentially lies. During this update, CPU 1 cannot access the second counter, even though it is clearly independent of the change made by CPU 0, and that's why we are slow.
How can we solve this? By making sure that the counters lie on different cache lines, and that's where the <code>repr</code> attribute comes in.</p>
<p>In Rust, we can specify the alignment we want for our type with the <code>repr(align)</code> attribute. We use it like this : </p>
<pre><pre class="playground"><code class="language-rust edition2021"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[repr(align(64))]
struct CachePadded(AtomicU64);
<span class="boring">}</span></code></pre></pre>
<p>A value with alignment X is stored in memory at an address that is a multiple of X. Knowing this, by giving our counters an alignment equal to the size of a cache line, we ensure that the 2 counters won't be stored in the same cache line!</p>
<p>We can get the size of cache lines with command <code>getconf LEVEL1_DCACHE_LINESIZE</code>. On my laptop the output value is 64.</p>
<p>With those changes we now have a timing of 7.16ms, which seems decent given that we are working with atomics. Mission accomplished!</p>
<p>Finally given my remark at the beginning, I wanted to share a potentially better solution, using local variables in the threads, and channels to communicate these local variables back to the main thread :</p>
<pre><pre class="playground"><code class="language-rust edition2021">use std::sync::mpsc::channel;
fn main() {
let (s1,t1) = channel();
let (s2,t2) = channel();
let h1 = thread::spawn(move || {
let mut local_counter = 0;
let mut rng = rand::thread_rng();
for _ in 0..1_000_000 {
local_counter += rng.gen::<u8>() as u64;
let h2 = thread::spawn(move || {
let mut local_counter = 0;
let mut rng = rand::thread_rng();
for _ in 0..1_000_000 {
local_counter += rng.gen::<u8>() as u64;
let counter = Counters{c1: t1.recv().unwrap(),c2: t2.recv().unwrap()};
<p>It takes 2.03 ms to execute :)</p>
<h2 id="动态和静态分发"><a class="header" href="#动态和静态分发">动态和静态分发</a></h2>
<div id="giscus-container"></div>
<nav class="nav-wrapper" aria-label="Page navigation">
<!-- Mobile navigation buttons -->
<a rel="prev" href="../../profiling/performance/runtime-check.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<i class="fa fa-angle-left"></i>
<a rel="next prefetch" href="../../profiling/performance/calculate.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<i class="fa fa-angle-right"></i>
<div style="clear: both"></div>
<nav class="nav-wide-wrapper" aria-label="Page navigation">
<a rel="prev" href="../../profiling/performance/runtime-check.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<i class="fa fa-angle-left"></i>
<a rel="next prefetch" href="../../profiling/performance/calculate.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<i class="fa fa-angle-right"></i>
window.playground_copyable = true;
<script src="../../ace.js"></script>
<script src="../../editor.js"></script>
<script src="../../mode-rust.js"></script>
<script src="../../theme-dawn.js"></script>
<script src="../../theme-tomorrow_night.js"></script>
<script src="../../elasticlunr.min.js"></script>
<script src="../../mark.min.js"></script>
<script src="../../searcher.js"></script>
<script src="../../clipboard.min.js"></script>
<script src="../../highlight.js"></script>
<script src="../../book.js"></script>
<script type="text/javascript" charset="utf-8">
var pagePath = "profiling/performance/cpu-cache.md"
<!-- Custom JS scripts -->
<script src="../../assets/custom.js"></script>
<script src="../../assets/bigPicture.js"></script>