awslabs · andrewhop · Dec 8, 2018 · Dec 10, 2018 · Dec 11, 2018 · Feb 21, 2022
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,6 +2,7 @@
 name = "pseudoprimes"
 version = "0.1.0"
 authors = ["Andrew Hopkins <[email protected]>"]
+edition = "2018"
 
 [dependencies]
 modinverse = "0.1.0"
@@ -10,6 +11,7 @@ threadpool = "1.7"
 lazy_static = "1.1"
 rand = "0.5.5"
 itertools = "0.7.8"
+hashbrown = "0.1"
 
 [dependencies.nix]
 version = "0.9"

diff --git a/LICENSE-THIRD-PARTY.txt b/LICENSE-THIRD-PARTY.txt
@@ -198,6 +198,7 @@ Library.
 ** lazy_static
 ** rand
 ** itertools
+** hashbrown
                              Apache License
                         Version 2.0, January 2004
                      http://www.apache.org/licenses/

diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@ However, it is hard to parallelize hashmap insertion. If only a single thread is
 The solution to all of these problems is to use a three-phase approach with a Bloom filter. Insertion in a Bloom filter is guaranteed constant time. It is trivial to have multiple threads operating at once, and merging Bloom filters is linear time. The Bloom filter only stores the SSP itself, not the corresponding mask, which makes this a three phase approach: we insert the SSPs for the first half of R into the filter, and then compute the SSPs for the second half of R, checking for membership in the Bloom filter and storing the SSP and the mask in a (small) hashmap. In the third phase, we *recompute* the SSPs for the first half of R, checking the hashmap for membership. In more detail, the three phases are:
 
 ## PHASE 1
-In this stage, we need to record 2^32 64-bit subset products (SSPs) for the first half of R. We build a Bloom filter for this, since parallel insertion is easy, and it is easy to combine the results of two Bloom filters. Since we are inserting 2^32 items, a large Bloom filter is needed (approximately 2^39 bits to get a reasonable false-positive rate). Runtime of the algorithm is bound by memory, so we can increase perforamce by creating two Bloom filters, each of size 2^39 bits, for each of the NUMA nodes on a m5d.24xlarge EC2 instance, and inserting the SSPs into one of the two filters (depending on the NUMA node of the thread running computing the SSP). The resulting Bloom filters are then OR'd together to form two identical Bloom filters, both containing all 2^32 SSPs.
+In this stage, we need to record 2^32 64-bit subset products (SSPs) for the first half of R. We build a Bloom filter for this, since parallel insertion is easy, and it is easy to combine the results of two Bloom filters. Since we are inserting 2^32 items, a large Bloom filter is needed (approximately 2^39 bits to get a reasonable false-positive rate). Runtime of the algorithm is bound by memory, so we can increase performance by creating two Bloom filters, each of size 2^39 bits, one for each of the NUMA nodes on an m5d.24xlarge EC2 instance, and inserting each SSP into one of the two filters (depending on the NUMA node of the thread computing the SSP). The resulting Bloom filters are then OR'd together to form two identical Bloom filters, both containing all 2^32 SSPs.
 
 ## PHASE 2
 The next phase of the algorithm computes all 2^32 SSPs of the (inverse mod M of the) other half of the set. We check the closest Bloom filter to see if the SSP may be present. If so, we record this in a map from SSP values to the SSP mask (from this phase) which created the SSP.
@@ -41,7 +41,7 @@ git clone https://github.com/awslabs/fast-pseudoprimes.git
 cd fast-pseudoprimes
 cargo +nightly run --features numa,unstable --release
 ```
-The code takes about 24 seconds to run from start to finish.
+The code takes about 21.9 seconds to run from start to finish.
 
 ## Status of this code
 This code is released as-is, and we have no plans to maintain it. We are happy to accept pull requests.

diff --git a/src/bitset/mod.rs → src/bitset.rs b/src/bitset/mod.rs → src/bitset.rs
@@ -1,4 +1,4 @@
-// mod.rs Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// bitset.rs Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 mod stable;
@@ -14,5 +14,5 @@ pub use self::stable::*;
 
 #[cfg(not(all(feature = "unstable", feature = "numa")))]
 impl BitSet {
-    pub fn on_node(self, node_id: u32) -> Self { self }
+    pub fn on_node(self, _node_id: u32) -> Self { self }
 }
diff --git a/src/bloomfilter/mod.rs → src/bloomfilter.rs b/src/bloomfilter/mod.rs → src/bloomfilter.rs
@@ -1,20 +1,21 @@
-// mod.rs Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// bloomfilter.rs Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-use gray_prod_iter::*;
-use progress;
-use numa_threadpool::ThreadPool;
+use crate::gray_prod_iter::*;
+use crate::progress;
+use crate::numa_threadpool::ThreadPool;
 
 use std::sync::{Mutex, Arc, mpsc::channel};
 use std::sync::atomic::{Ordering, AtomicUsize};
-use std::collections::HashMap;
+use hashbrown::HashMap;
 use std::time::Instant;
 
 mod conc_bloom;
-use bloomfilter::conc_bloom::*;
+use crate::bloomfilter::conc_bloom::*;
 
-use magic_numbers::*;
-use modulus::*;
+use crate::magic_numbers::*;
+use crate::modulus::*;
+use crate::time::get_elapsed_time;
 
 const FILTER_SIZE : usize = 1usize << 39;
 const FILTER_HASHES : usize = 2;
@@ -31,8 +32,6 @@ pub fn bloom_t1_kernel<M: Modulus>(
 ) {
     let mut handle = progress.handle();
 
-    let filter = &filter as &BloomFilter<u64>;
-
     for (_k, v) in ProductIter::new(&product_set, start, end) {
         filter.put(&v);
         handle.report(1);
@@ -45,7 +44,7 @@ pub fn bloom_t1_kernel<M: Modulus>(
 /// Details: we create a bloom filter for each NUMA node,
 /// then divide the work up into lots of chunks.
 /// The subset products for each chunk are inserted into one of the two bloom filters,
-/// and at the end of the comptutation, we "cross_or" the two filters together so that
+/// and at the end of the computation, we "cross_or" the two filters together so that
 /// each bloom filter contains *all* of the subset products.
 /// We output a map from NUMA node ID to a bloom filter,
 /// where each bloom filter contains all subset products in t1.
@@ -142,7 +141,7 @@ pub fn build_t2(
 
     let per_task = total_work / N_TASKS;
 
-    let pool : ThreadPool<Arc<BloomFilter<u64>>> = ThreadPool::new(|node_id| 
+    let pool : ThreadPool<Arc<BloomFilter<u64>>> = ThreadPool::new(|node_id|
         filters.get(&node_id).unwrap_or_else(|| {
             println!("Warning: Couldn't find a T1 for node {}, falling back to arbitrary node", node_id);
             filters.iter().next().unwrap().1
@@ -230,6 +229,7 @@ pub fn final_sieve(
     t1: &[u64],
     t2: &[u64]
 ) -> Vec<Pseudoprime> {
+    let total = Instant::now();
     let t2map = Arc::new(t2map);
     let pool = ThreadPool::new(|_| ());
     let t1_product_set = Arc::new(ProductSet::new(t1_forward, MODULUS));
@@ -264,6 +264,8 @@ pub fn final_sieve(
 
     let t3_misses = t3_misses.load(Ordering::SeqCst);
 
+    println!("[final_sieve] Completed in {}", get_elapsed_time(total));
+
     println!("Found {} pseudoprimes, with {} T3 misses, {} T2 false positives",
         results.len(), t3_misses, t2map.len() - t3_misses - results.len());
 

diff --git a/src/bloomfilter/conc_bloom.rs b/src/bloomfilter/conc_bloom.rs
@@ -5,7 +5,7 @@ use std::hash::{Hasher, Hash, BuildHasher};
 use std::collections::hash_map::RandomState;
 use std::marker::PhantomData;
 
-use bitset::BitSet;
+use crate::bitset::BitSet;
 
 pub struct Builder<T: Hash> {
     hash_states: Vec<RandomState>,

diff --git a/src/gray_prod_iter.rs b/src/gray_prod_iter.rs
@@ -1,7 +1,8 @@
 // gray_prod_iter.rs Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-use modulus::*;
+
+use crate::modulus::*;
 
 pub struct ProductSet<M: Modulus + 'static> {
     elems: Vec<u64>,

diff --git a/src/lib.rs b/src/lib.rs
@@ -2,24 +2,17 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #![allow(dead_code)]
+#![cfg_attr(feature = "unstable", feature(duration_as_u128))]
 #![cfg_attr(feature = "unstable", feature(asm))]
 #![cfg_attr(feature = "unstable", feature(core_intrinsics))]
 #![cfg_attr(feature = "unstable", feature(avx512_target_feature))]
 
-#[macro_use]
-extern crate lazy_static;
-extern crate modinverse;
-extern crate rug;
-extern crate threadpool;
-#[cfg(feature="unstable")]
-extern crate libc;
-extern crate itertools;
-
 pub mod mulmod;
 pub mod bloomfilter;
 pub mod progress;
 pub mod gray_prod_iter;
 pub mod magic_numbers;
 pub mod bitset;
 pub mod modulus;
-pub mod numa_threadpool;
+pub mod numa_threadpool;
+pub mod time;
diff --git a/src/magic_numbers.rs b/src/magic_numbers.rs
@@ -1,10 +1,11 @@
 // magic_numbers.rs Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
-
+use lazy_static::*;
 use rug::Integer;
 use rug::integer::IsPrime;
 use itertools::iproduct;
-use modulus::*;
+use crate::modulus::*;
+
 
 pub const M: u64 = 11908862398227544750;
 pub const MAX_R: u64 = 1152921504606846976;

diff --git a/src/main.rs b/src/main.rs
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #![cfg_attr(feature = "unstable", feature(asm))]
+#![cfg_attr(feature = "unstable", feature(duration_as_u128))]
 
 extern crate pseudoprimes;
 extern crate rug;
@@ -10,14 +11,15 @@ extern crate itertools;
 
 use pseudoprimes::*;
 
-use magic_numbers::*;
-use bloomfilter::*;
+use crate::magic_numbers::*;
+use crate::time::get_elapsed_time;
+use crate::bloomfilter::*;
 
 use std::time::Instant;
 
 
 fn main() {
-    let total = Instant::now();
+    let start = Instant::now();
 
     // fp p<=0.001, 32GiB, k=2
     let filter = bloom_t1(&T1_INVERSE);
@@ -32,5 +34,5 @@ fn main() {
         println!("Found passing prime {}, vector {:?}", result.pseudoprime, result.factors);
     }
 
-    println!("Total time: {} seconds, primes found: {}", total.elapsed().as_secs(), results.len());
+    println!("Completed in: {}, primes found: {}", get_elapsed_time(start), results.len());
 }
diff --git a/src/modulus.rs b/src/modulus.rs
@@ -1,7 +1,7 @@
 // modulus.rs Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-use magic_numbers::M;
+use crate::magic_numbers::M;
 use modinverse::modinverse;
 
 pub const MODULUS : OptiM = OptiM{};

diff --git a/src/mulmod/mod.rs → src/mulmod.rs b/src/mulmod/mod.rs → src/mulmod.rs
@@ -1,4 +1,4 @@
-// mod.rs Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// mulmod.rs Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 #[cfg(all(feature = "unstable", target_arch = "x86_64"))]

diff --git a/src/numa_threadpool.rs b/src/numa_threadpool.rs
@@ -49,12 +49,12 @@ pub use self::numa::*;
 mod numa {
     extern crate nix;
 
-    use libc::{c_char, c_uint, c_int, c_ulong};
     use std::ffi::CString;
     use std::sync::{Arc, Mutex, Condvar};
     use std::thread::{JoinHandle, self};
     use std::collections::VecDeque;
     use std::marker::Send;
+    use std::os::raw::{c_ulong, c_uint, c_int, c_char};
 
     #[repr(C)]
     struct bitmask { 
@@ -148,7 +148,7 @@ mod numa {
 
     fn worker<Context: 'static>(node: &NodeInfo<Context>, cpu_id: c_uint, queue: &WorkQueue<Context>) {
         use self::nix::unistd::gettid;
-        use libc::pid_t;
+        use crate::numa_threadpool::numa::nix::libc::pid_t;
 
         let tid = pid_t::from(gettid());
         unsafe {
@@ -182,7 +182,7 @@ mod numa {
             if unsafe { numa_bitmask_isbitset(numa_all_nodes_ptr, i) } {
                 let mut info = NodeInfo { node_id: (i as c_uint), cpuset: Vec::new(), context: context_ctor(i) };
 
-                let mut bitmask = unsafe { numa_allocate_cpumask() };
+                let bitmask = unsafe { numa_allocate_cpumask() };
                 if unsafe { numa_node_to_cpus(i, bitmask) } != 0 {
                     panic!("numa_node_to_cpus");
                 }