From 0498ad1926750ed6c710e5714068f2be17adff4f Mon Sep 17 00:00:00 2001 From: Alexis Beingessner Date: Wed, 24 Jun 2015 16:21:17 -0700 Subject: [PATCH] vec 1.0 --- vec.md | 520 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 509 insertions(+), 11 deletions(-) diff --git a/vec.md b/vec.md index 71b950b..a2fecf8 100644 --- a/vec.md +++ b/vec.md @@ -1,11 +1,11 @@ % Example: Implementing Vec -TODO: audit for non-ZST offsets from heap::empty - To bring everything together, we're going to write `std::Vec` from scratch. Because all the best tools for writing unsafe code are unstable, this project will only work on nightly (as of Rust 1.2.0). + + # Layout First off, we need to come up with the struct layout. Naively we want this @@ -63,16 +63,19 @@ as `std::rt::heap::EMPTY`. There are quite a few places where we'll want to use `heap::EMPTY` because there's no real allocation to talk about but `null` would make the compiler angry. -All of the `heap` API is totally unstable under the `alloc` feature, though. +All of the `heap` API is totally unstable under the `heap_api` feature, though. We could trivially define `heap::EMPTY` ourselves, but we'll want the rest of the `heap` API anyway, so let's just get that dependency over with. + + + # Allocating Memory So: ```rust -#![feature(alloc)] +#![feature(heap_api)] use std::rt::heap::EMPTY; use std::mem; @@ -184,6 +187,10 @@ fn grow(&mut self) { Nothing particularly tricky here. Just computing sizes and alignments and doing some careful multiplication checks. + + + + # Push and Pop Alright. We can initialize. We can allocate. Let's actually implement some @@ -240,6 +247,10 @@ pub fn pop(&mut self) -> Option<T> { } ``` + + + + # Deallocating Next we should implement Drop so that we don't massively leak tons of resources. @@ -270,6 +281,10 @@ impl<T> Drop for Vec<T> { } ``` + + + + # Deref Alright! We've got a decent minimal ArrayStack implemented.
We can push, we can @@ -311,6 +326,10 @@ impl<T> DerefMut for Vec<T> { Now we have `len`, `first`, `last`, indexing, slicing, sorting, `iter`, `iter_mut`, and all other sorts of bells and whistles provided by slice. Sweet! + + + + # Insert and Remove Something *not* provided by slice is `insert` and `remove`, so let's do those next. @@ -362,6 +381,10 @@ pub fn remove(&mut self, index: usize) -> T { } ``` + + + + # IntoIter Let's move on to writing iterators. `iter` and `iter_mut` have already been @@ -410,7 +433,22 @@ struct IntoIter<T> { } ``` -And initialize it like this: +One last subtle detail: if our Vec is empty, we want to produce an empty iterator. +This will actually technically fall out of doing the naive thing of: + +```text +start = ptr +end = ptr.offset(len) +``` + +However because `offset` is marked as a GEP inbounds instruction, this will tell +LLVM that ptr is allocated and won't alias other allocated memory. This is fine +for zero-sized types, as they can't alias anything. However if we're using +heap::EMPTY as a sentinel for a non-allocation for a *non-zero-sized* type, +this can cause undefined behaviour. Alas, we must therefore special case either +cap or len being 0 to not do the offset. + +So this is what we end up with for initialization: ```rust impl<T> Vec<T> { @@ -428,7 +466,12 @@ impl<T> Vec<T> { buf: ptr, cap: cap, start: *ptr, - end: ptr.offset(len as isize), + end: if cap == 0 { + // can't offset off this pointer, it's not allocated! + *ptr + } else { + ptr.offset(len as isize) + } } } } @@ -635,6 +678,10 @@ impl<T> Vec<T> { Much better. + + + + # Drain Let's move on to Drain.
Drain is largely the same as IntoIter, except that @@ -674,7 +721,11 @@ impl<T> RawValIter<T> { unsafe fn new(slice: &[T]) -> Self { RawValIter { start: slice.as_ptr(), - end: slice.as_ptr().offset(slice.len() as isize), + end: if slice.len() == 0 { + slice.as_ptr() + } else { + slice.as_ptr().offset(slice.len() as isize) + } } } } @@ -771,6 +822,8 @@ impl<T> Vec<T> { ``` + + # Handling Zero-Sized Types It's time. We're going to fight the spectre that is zero-sized types. Safe Rust @@ -781,13 +834,14 @@ zero-sized types. We need to be careful of two things: * The raw allocator API has undefined behaviour if you pass in 0 for an allocation size. * Raw pointer offsets are no-ops for zero-sized types, which will break our - C-style pointer iterator + C-style pointer iterator. Thankfully we abstracted out pointer-iterators and allocation handling into RawValIter and RawVec respectively. How mysteriously convenient. + ## Allocating Zero-Sized Types So if the allocator API doesn't support zero-sized allocations, what on earth @@ -797,13 +851,457 @@ to be considered to store or load them. This actually extends to `ptr::read` and `ptr::write`: they won't actually look at the pointer at all. As such we *never* need to change the pointer. -TODO +Note however that our previous reliance on running out of memory before overflow is +no longer valid with zero-sized types. We must explicitly guard against capacity +overflow for zero-sized types. + +Due to our current architecture, all this means is writing 3 guards, one in each +method of RawVec. + +```rust +impl<T> RawVec<T> { + fn new() -> Self { + unsafe { + // -1 is usize::MAX. This branch should be stripped at compile time.
+ let cap = if mem::size_of::<T>() == 0 { -1 } else { 0 }; + + // heap::EMPTY doubles as "unallocated" and "zero-sized allocation" + RawVec { ptr: Unique::new(heap::EMPTY as *mut T), cap: cap } + } + } + + fn grow(&mut self) { + unsafe { + let elem_size = mem::size_of::<T>(); + + // since we set the capacity to usize::MAX when elem_size is + // 0, getting to here necessarily means the Vec is overfull. + assert!(elem_size != 0, "capacity overflow"); + + let align = mem::min_align_of::<T>(); + + let (new_cap, ptr) = if self.cap == 0 { + let ptr = heap::allocate(elem_size, align); + (1, ptr) + } else { + let new_cap = 2 * self.cap; + let ptr = heap::reallocate(*self.ptr as *mut _, + self.cap * elem_size, + new_cap * elem_size, + align); + (new_cap, ptr) + }; + + // If allocate or reallocate fail, we'll get `null` back + if ptr.is_null() { oom() } + + self.ptr = Unique::new(ptr as *mut _); + self.cap = new_cap; + } + } +} + +impl<T> Drop for RawVec<T> { + fn drop(&mut self) { + let elem_size = mem::size_of::<T>(); + + // don't free zero-sized allocations, as they were never allocated. + if self.cap != 0 && elem_size != 0 { + let align = mem::min_align_of::<T>(); + + let num_bytes = elem_size * self.cap; + unsafe { + heap::deallocate(*self.ptr as *mut _, num_bytes, align); + } + } + } +} +``` + +That's it. We support pushing and popping zero-sized types now. Our iterators +(that aren't provided by slice Deref) are still busted, though. + + + ## Iterating Zero-Sized Types -TODO +Zero-sized offsets are no-ops. This means that our current design will always +initialize `start` and `end` as the same value, and our iterators will yield +nothing.
The current solution to this is to cast the pointers to integers, +increment, and then cast them back: -## Advanced Drain +``` +impl<T> RawValIter<T> { + unsafe fn new(slice: &[T]) -> Self { + RawValIter { + start: slice.as_ptr(), + end: if mem::size_of::<T>() == 0 { + ((slice.as_ptr() as usize) + slice.len()) as *const _ + } else if slice.len() == 0 { + slice.as_ptr() + } else { + slice.as_ptr().offset(slice.len() as isize) + } + } + } +} +``` + +Now we have a different bug. Instead of our iterators not running at all, our +iterators now run *forever*. We need to do the same trick in our iterator impls: + +``` +impl<T> Iterator for RawValIter<T> { + type Item = T; + fn next(&mut self) -> Option<T> { + if self.start == self.end { + None + } else { + unsafe { + let result = ptr::read(self.start); + self.start = if mem::size_of::<T>() == 0 { + (self.start as usize + 1) as *const _ + } else { + self.start.offset(1); + } + Some(result) + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let len = self.end as usize - self.start as usize; + (len, Some(len)) + } +} + +impl<T> DoubleEndedIterator for RawValIter<T> { + fn next_back(&mut self) -> Option<T> { + if self.start == self.end { + None + } else { + unsafe { + self.end = if mem::size_of::<T>() == 0 { + (self.end as usize - 1) as *const _ + } else { + self.end.offset(-1); + } + Some(ptr::read(self.end)) + } + } + } +} +``` + +And that's it. Iteration works! + + + +# Advanced Drain TODO? Not clear if informative + + + + +# The Final Code + +```rust +#![feature(unique)] +#![feature(heap_api)] + +use std::ptr::{Unique, self}; +use std::rt::heap; +use std::mem; +use std::ops::{Deref, DerefMut}; +use std::marker::PhantomData; + +struct RawVec<T> { + ptr: Unique<T>, + cap: usize, +} + +impl<T> RawVec<T> { + fn new() -> Self { + unsafe { + // -1 is usize::MAX. This branch should be stripped at compile time.
+ let cap = if mem::size_of::<T>() == 0 { -1 } else { 0 }; + + // heap::EMPTY doubles as "unallocated" and "zero-sized allocation" + RawVec { ptr: Unique::new(heap::EMPTY as *mut T), cap: cap } + } + } + + fn grow(&mut self) { + unsafe { + let elem_size = mem::size_of::<T>(); + + // since we set the capacity to usize::MAX when elem_size is + // 0, getting to here necessarily means the Vec is overfull. + assert!(elem_size != 0, "capacity overflow"); + + let align = mem::min_align_of::<T>(); + + let (new_cap, ptr) = if self.cap == 0 { + let ptr = heap::allocate(elem_size, align); + (1, ptr) + } else { + let new_cap = 2 * self.cap; + let ptr = heap::reallocate(*self.ptr as *mut _, + self.cap * elem_size, + new_cap * elem_size, + align); + (new_cap, ptr) + }; + + // If allocate or reallocate fail, we'll get `null` back + if ptr.is_null() { oom() } + + self.ptr = Unique::new(ptr as *mut _); + self.cap = new_cap; + } + } +} + +impl<T> Drop for RawVec<T> { + fn drop(&mut self) { + let elem_size = mem::size_of::<T>(); + if self.cap != 0 && elem_size != 0 { + let align = mem::min_align_of::<T>(); + + let num_bytes = elem_size * self.cap; + unsafe { + heap::deallocate(*self.ptr as *mut _, num_bytes, align); + } + } + } +} + +pub struct Vec<T> { + buf: RawVec<T>, + len: usize, +} + +impl<T> Vec<T> { + fn ptr(&self) -> *mut T { *self.buf.ptr } + + fn cap(&self) -> usize { self.buf.cap } + + pub fn new() -> Self { + Vec { buf: RawVec::new(), len: 0 } + } + pub fn push(&mut self, elem: T) { + if self.len == self.cap() { self.buf.grow(); } + + unsafe { + ptr::write(self.ptr().offset(self.len as isize), elem); + } + + // Can't fail, we'll OOM first.
+ self.len += 1; + } + + pub fn pop(&mut self) -> Option<T> { + if self.len == 0 { + None + } else { + self.len -= 1; + unsafe { + Some(ptr::read(self.ptr().offset(self.len as isize))) + } + } + } + + pub fn insert(&mut self, index: usize, elem: T) { + assert!(index <= self.len, "index out of bounds"); + if self.cap() == self.len { self.buf.grow(); } + + unsafe { + if index < self.len { + ptr::copy(self.ptr().offset(index as isize), + self.ptr().offset(index as isize + 1), + self.len - index); + } + ptr::write(self.ptr().offset(index as isize), elem); + self.len += 1; + } + } + + pub fn remove(&mut self, index: usize) -> T { + assert!(index < self.len, "index out of bounds"); + unsafe { + self.len -= 1; + let result = ptr::read(self.ptr().offset(index as isize)); + ptr::copy(self.ptr().offset(index as isize + 1), + self.ptr().offset(index as isize), + self.len - index); + result + } + } + + pub fn into_iter(self) -> IntoIter<T> { + unsafe { + let iter = RawValIter::new(&self); + let buf = ptr::read(&self.buf); + mem::forget(self); + + IntoIter { + iter: iter, + _buf: buf, + } + } + } + + pub fn drain(&mut self) -> Drain<T> { + // this is a mem::forget safety thing. If this is forgotten, we just + // leak the whole Vec's contents. Also we need to do this *eventually* + // anyway, so why not do it now?
+ self.len = 0; + unsafe { + Drain { + iter: RawValIter::new(&self), + vec: PhantomData, + } + } + } +} + +impl<T> Drop for Vec<T> { + fn drop(&mut self) { + while let Some(_) = self.pop() {} + // allocation is handled by RawVec + } +} + +impl<T> Deref for Vec<T> { + type Target = [T]; + fn deref(&self) -> &[T] { + unsafe { + ::std::slice::from_raw_parts(self.ptr(), self.len) + } + } +} + +impl<T> DerefMut for Vec<T> { + fn deref_mut(&mut self) -> &mut [T] { + unsafe { + ::std::slice::from_raw_parts_mut(self.ptr(), self.len) + } + } +} + + + + + +struct RawValIter<T> { + start: *const T, + end: *const T, +} + +impl<T> RawValIter<T> { + unsafe fn new(slice: &[T]) -> Self { + RawValIter { + start: slice.as_ptr(), + end: if mem::size_of::<T>() == 0 { + ((slice.as_ptr() as usize) + slice.len()) as *const _ + } else if slice.len() == 0 { + slice.as_ptr() + } else { + slice.as_ptr().offset(slice.len() as isize) + } + } + } +} + +impl<T> Iterator for RawValIter<T> { + type Item = T; + fn next(&mut self) -> Option<T> { + if self.start == self.end { + None + } else { + unsafe { + let result = ptr::read(self.start); + self.start = self.start.offset(1); + Some(result) + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let len = self.end as usize - self.start as usize; + (len, Some(len)) + } +} + +impl<T> DoubleEndedIterator for RawValIter<T> { + fn next_back(&mut self) -> Option<T> { + if self.start == self.end { + None + } else { + unsafe { + self.end = self.end.offset(-1); + Some(ptr::read(self.end)) + } + } + } +} + + + + +pub struct IntoIter<T> { + _buf: RawVec<T>, // we don't actually care about this. Just need it to live.
+ iter: RawValIter<T>, +} + +impl<T> Iterator for IntoIter<T> { + type Item = T; + fn next(&mut self) -> Option<T> { self.iter.next() } + fn size_hint(&self) -> (usize, Option<usize>) { self.iter.size_hint() } +} + +impl<T> DoubleEndedIterator for IntoIter<T> { + fn next_back(&mut self) -> Option<T> { self.iter.next_back() } +} + +impl<T> Drop for IntoIter<T> { + fn drop(&mut self) { + for _ in &mut *self {} + } +} + + + + +pub struct Drain<'a, T: 'a> { + vec: PhantomData<&'a mut Vec<T>>, + iter: RawValIter<T>, +} + +impl<'a, T> Iterator for Drain<'a, T> { + type Item = T; + fn next(&mut self) -> Option<T> { self.iter.next_back() } + fn size_hint(&self) -> (usize, Option<usize>) { self.iter.size_hint() } +} + +impl<'a, T> DoubleEndedIterator for Drain<'a, T> { + fn next_back(&mut self) -> Option<T> { self.iter.next_back() } +} + +impl<'a, T> Drop for Drain<'a, T> { + fn drop(&mut self) { + // pre-drain the iter + for _ in &mut self.iter {} + } +} + +/// Abort the process, we're out of memory! +/// +/// In practice this is probably dead code on most OSes +fn oom() { + ::std::process::exit(-1); +} +```