diff --git a/binarytrees/binarytree-fast.jl b/binarytrees/binarytrees-fast.jl
similarity index 100%
rename from binarytrees/binarytree-fast.jl
rename to binarytrees/binarytrees-fast.jl
diff --git a/binarytrees/cmain.jl b/binarytrees/cmain.jl
new file mode 100644
index 0000000..2334a11
--- /dev/null
+++ b/binarytrees/cmain.jl
@@ -0,0 +1,65 @@
+# The Computer Language Benchmarks Game
+# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+# contributed by Simon Danisch
+# based on the [C++ implementation](https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/binarytrees-gpp-9.html)
+
+using Printf
+struct Node
+    left::UInt32
+    right::UInt32
+end
+function alloc!(pool, left, right)
+    push!(pool, Node(left, right))
+    return length(pool)
+end
+function make(pool, d)
+    d == 0 && return 0
+    alloc!(pool, make(pool, d - 1), make(pool, d - 1))
+end
+check(pool, t::Node) = 1 + check(pool, t.left) + check(pool, t.right)
+function check(pool, node::Integer)
+    node == 0 && return 1
+    @inbounds return check(pool, pool[node])
+end
+function threads_inner(pool, d, min_depth, max_depth)
+    niter = 1 << (max_depth - d + min_depth)
+    c = 0
+    for j = 1:niter
+        c += check(pool, make(pool, d))
+        empty!(pool)
+    end
+    return (niter, d, c)
+end
+
+function loop_depths(io, d, min_depth, max_depth, ::Val{N}) where N
+    threadstore = ntuple(x-> NTuple{3, Int}[], N)
+    Threads.@threads for d in min_depth:2:max_depth
+        pool = Node[]
+        push!(threadstore[Threads.threadid()], threads_inner(pool, d, min_depth, max_depth))
+    end
+    for results in threadstore, result in results
+        @printf(io, "%i\t trees of depth %i\t check: %i\n", result...)
+    end
+end
+function perf_binary_trees(io, N::Int=10)
+    min_depth = 4
+    max_depth = N
+    stretch_depth = max_depth + 1
+    pool = Node[]
+    # create and check stretch tree
+    c = check(pool, make(pool, stretch_depth))
+    @printf(io, "stretch tree of depth %i\t check: %i\n", stretch_depth, c)
+
+    long_lived_tree = make(pool, max_depth)
+
+    loop_depths(io, min_depth, min_depth, max_depth, Val(Threads.nthreads()))
+    @printf(io, "long lived tree of depth %i\t check: %i\n", max_depth, check(pool, long_lived_tree))
+end
+Base.@ccallable function julia_main(ARGS::Vector{String})::Cint
+
+    perf_binary_trees(stdout, parse(Int, ARGS[1]))
+    return 0
+end
+
+julia_main(["21"])
diff --git a/c_vs_julia.jl b/c_vs_julia.jl
new file mode 100644
index 0000000..4549e5f
--- /dev/null
+++ b/c_vs_julia.jl
@@ -0,0 +1,36 @@
+cd(@__DIR__)
+fasta_input = "fasta.txt"
+fasta_gen = joinpath("fasta", "fasta.jl")
+run(pipeline(`$(Base.julia_cmd()) $fasta_gen 25000000` ;stdout = fasta_input))
+
+benchmarks = [
+    ("binarytrees", 21),
+    ("fannkuchredux", 12),
+    ("fasta", 25000000),
+    ("knucleotide", fasta_input),
+    ("mandelbrot", 16000),
+    ("nbody", 50000000),
+    ("pidigits", 10000),
+    ("regexredux", fasta_input),
+    ("revcomp", fasta_input),
+    ("spectralnorm", 5500),
+]
+
+timings = map(benchmarks) do (bench, arg)
+  println(bench)
+  isfile("result.bin") && rm("result.bin")
+  args = [:stdout => "result.bin"]
+  argcmd = ``
+  cmd = if arg isa String
+    @assert isfile(arg)
+    push!(args, :stdin => arg)
+  else
+    argcmd = `$arg`
+  end
+  jltime = withenv("JULIA_NUM_THREADS" => 16) do
+    exe = joinpath("jl_build", bench, "cmain")
+     @elapsed run(pipeline(`$() $argcmd`; args...))
+  end
+  ctime = @elapsed run(pipeline(`./$bench $argcmd`; args...))
+  (julia = jltime, cpp = ctime)
+end
diff --git a/cpp_src/.gitignore b/cpp_src/.gitignore
new file mode 100644
index 0000000..4537490
--- /dev/null
+++ b/cpp_src/.gitignore
@@ -0,0 +1,11 @@
+binarytrees
+fannkuchredux
+fasta
+knucleotide
+mandelbrot
+nbody
+pidigits
+regexredux
+revcomp
+spectralnorm
+*.o
diff --git a/cpp_src/binarytrees.c++ b/cpp_src/binarytrees.c++
new file mode 100644
index 0000000..5e5ed28
--- /dev/null
+++ b/cpp_src/binarytrees.c++
@@ -0,0 +1,139 @@
+/* The Computer Language Benchmarks Game
+ * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+ *
+ * contributed by Jon Harrop
+ * modified by Alex Mizrahi
+ * modified by Andreas Schäfer
+ * very minor omp tweak by The Anh Tran
+ * modified to use apr_pools by Dave Compton
+ *  *reset*
+ */
+
+#include <iostream>
+#include <stdlib.h>
+#include <stdio.h>
+#include <apr_pools.h>
+
+
+const size_t    LINE_SIZE = 64;
+
+class Apr
+{
+public:
+    Apr() 
+    {
+        apr_initialize();
+    }
+
+    ~Apr() 
+    {
+        apr_terminate();
+    }
+};
+
+struct Node 
+{
+    Node *l, *r;
+    
+    int check() const 
+    {
+        if (l)
+            return l->check() + 1 + r->check();
+        else return 1;
+    }
+};
+
+class NodePool
+{
+public:
+    NodePool() 
+    {
+        apr_pool_create_unmanaged(&pool);
+    }
+
+    ~NodePool() 
+    {
+        apr_pool_destroy(pool);
+    }
+
+    Node* alloc()
+    {
+        return (Node *)apr_palloc(pool, sizeof(Node));
+    }
+
+    void clear()
+    {
+        apr_pool_clear(pool);
+    }
+
+private:
+    apr_pool_t* pool;
+};
+
+Node *make(int d, NodePool &store)
+{
+    Node* root = store.alloc();
+
+    if(d>0){
+        root->l=make(d-1, store);
+        root->r=make(d-1, store);
+    }else{
+        root->l=root->r=0;
+    }
+
+    return root;
+}
+
+int main(int argc, char *argv[]) 
+{
+    Apr apr;
+    int min_depth = 4;
+    int max_depth = std::max(min_depth+2,
+                             (argc == 2 ? atoi(argv[1]) : 10));
+    int stretch_depth = max_depth+1;
+
+    // Alloc then dealloc stretchdepth tree
+    {
+        NodePool store;
+        Node *c = make(stretch_depth, store);
+        std::cout << "stretch tree of depth " << stretch_depth << "\t "
+                  << "check: " << c->check() << std::endl;
+    }
+
+    NodePool long_lived_store;
+    Node *long_lived_tree = make(max_depth, long_lived_store);
+
+    // buffer to store output of each thread
+    char *outputstr = (char*)malloc(LINE_SIZE * (max_depth +1) * sizeof(char));
+
+    #pragma omp parallel for 
+    for (int d = min_depth; d <= max_depth; d += 2) 
+    {
+        int iterations = 1 << (max_depth - d + min_depth);
+        int c = 0;
+
+        // Create a memory pool for this thread to use.
+        NodePool store;
+
+        for (int i = 1; i <= iterations; ++i) 
+        {
+            Node *a = make(d, store);
+            c += a->check();
+            store.clear();
+        }
+
+        // each thread write to separate location
+        sprintf(outputstr + LINE_SIZE * d, "%d\t trees of depth %d\t check: %d\n",
+           iterations, d, c);
+    }
+
+    // print all results
+    for (int d = min_depth; d <= max_depth; d += 2) 
+        printf("%s", outputstr + (d * LINE_SIZE) );
+    free(outputstr);
+
+    std::cout << "long lived tree of depth " << max_depth << "\t "
+              << "check: " << (long_lived_tree->check()) << "\n";
+
+    return 0;
+}
diff --git a/cpp_src/fannkuchredux.c++ b/cpp_src/fannkuchredux.c++
new file mode 100644
index 0000000..301e6e6
--- /dev/null
+++ b/cpp_src/fannkuchredux.c++
@@ -0,0 +1,198 @@
+// The Computer Language Benchmarks Game
+// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+//
+// Contributed by Dave Compton
+// Based on "fannkuch-redux C gcc #5", contributed by Jeremy Zerfas
+// which in turn was based on the Ada program by Jonathan Parker and 
+// Georg Bauhaus which in turn was based on code by Dave Fladebo, 
+// Eckehard Berns, Heiner Marxen, Hongwei Xi, and The Anh Tran and 
+// also the Java program by Oleg Mazurov.
+
+#include <iostream>
+#include <vector>
+#include <algorithm>
+
+using namespace std;
+
+static int64_t fact[32];
+
+void initializeFact(int n)
+{
+    fact[0] = 1;
+    for (auto i = 1; i <= n; ++i)
+        fact[i] = i * fact[i - 1];
+}
+
+class Permutation 
+{
+  public:
+    Permutation(int n, int64_t start);
+    void advance();
+    int64_t countFlips() const;
+
+  private:
+     vector <int> count;
+     vector <int8_t> current;
+
+};
+
+// 
+// Initialize the current value of a permutation
+// and the cycle count values used to advance .
+// 
+Permutation::Permutation(int n, int64_t start)
+{
+    count.resize(n);
+    current.resize(n);
+
+    // Initialize count 
+    for (auto i = n - 1; i >= 0; --i) 
+    {
+        auto d = start / fact[i];
+        start = start % fact[i];
+        count[i] = d;
+    }
+
+    // Initialize current.
+    for (auto i = 0; i < n; ++i)
+        current[i] = i;
+
+    for (auto i = n - 1; i >= 0; --i) 
+    {
+        auto d = count[i];
+        auto b = current.begin();
+        rotate(b, b + d, b + i + 1);
+    }
+}
+
+//
+// Advance the current permutation to the next in sequence.
+// 
+void Permutation::advance()
+{
+    for (auto i = 1; ;++i) 
+    {
+        // Tried using std::rotate here but that was slower.
+        auto first = current[0];
+        for (auto j = 0; j < i; ++j)
+            current[j] = current[j + 1];
+        current[i] = first;
+
+        ++(count[i]);
+        if (count[i] <= i)
+            break;
+        count[i] = 0;
+    }
+}
+
+//
+// Count the flips required to flip 0 to the front of the vector.
+//
+// Other than minor cosmetic changes, the following routine is
+// basically lifted from "fannkuch-redux C gcc #5"
+//
+inline int64_t Permutation::countFlips() const
+{
+    const auto n = current.size();
+    auto flips = 0;
+    auto first = current[0];
+    if (first > 0) 
+    {
+        flips = 1;
+
+        int8_t temp[n];
+        // Make a copy of current to work on. 
+        for (size_t i = 0; i < n; ++i)
+            temp[i] = current[i];
+
+
+        // Flip temp until the element at the first index is 0
+        for (; temp[first] > 0; ++flips) 
+        {
+            // Record the newFirst and restore the old
+            // first at its new flipped position.
+            const int8_t newFirst = temp[first];
+            temp[first] = first;
+
+            if (first > 2) 
+            {
+                int64_t low = 1, high = first - 1;
+                do 
+                {
+                    swap(temp[low], temp[high]);
+                    if (!(low + 3 <= high && low < 16))
+                        break;
+                    ++low;
+                    --high;
+                } while (1);
+            }
+            // Update first to newFirst that we recorded earlier.
+            first = newFirst;
+        }
+    }
+    return flips;
+}
+
+int main(int argc, char **argv)
+{
+    const auto n = atoi(argv[1]);
+
+    // Compute some factorials for later use.
+    initializeFact(n);
+
+    // blockCount works best if it is set to a multiple of the number
+    // of CPUs so that the same number of blocks gets distributed to
+    // each cpu.  The computer used for development (Intel i7-4700MQ)
+    // had 8 "CPU"s (4 cores with hyperthreading) so 8, 16 and 24 
+    // all worked well.
+
+    auto blockCount = 24;
+    if (blockCount > fact[n])
+        blockCount = 1;
+    const int64_t blockLength = fact[n] / blockCount;
+
+    int64_t maxFlips = 0, checksum = 0;
+
+    // Iterate over each block.
+    #pragma omp parallel for \
+        reduction(max:maxFlips) \
+        reduction(+:checksum)
+
+    for (int64_t blockStart = 0;
+         blockStart < fact[n]; 
+         blockStart += blockLength) 
+    {
+        // first permutation for this block.
+        Permutation permutation(n, blockStart);
+
+        // Iterate over each permutation in the block.
+        auto index = blockStart;
+        while (1) 
+        {
+            const auto flips = permutation.countFlips();
+
+            if (flips) 
+            {
+                if (index % 2 == 0)
+                    checksum += flips;
+                else
+                    checksum -= flips;
+
+                if (flips > maxFlips)
+                    maxFlips = flips;
+            }
+
+            if (++index == blockStart + blockLength)
+                break;
+
+            // next permutation for this block.
+            permutation.advance();
+        }
+    }
+
+    // Output the results to stdout.
+    cout << checksum << endl;
+    cout << "Pfannkuchen(" << n << ") = " << maxFlips << endl;
+
+    return 0;
+}
diff --git a/cpp_src/fasta.c++ b/cpp_src/fasta.c++
new file mode 100644
index 0000000..3587583
--- /dev/null
+++ b/cpp_src/fasta.c++
@@ -0,0 +1,291 @@
+/* The Computer Language Benchmarks Game
+https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+contributed by Sylvester Saguban
+inspired from C++ G++ #7 from Rafal Rusin and contributors
+- uses more features of C++17 and TS concept's 'auto' function parameters
+- more efficient multi-threading, which sadly is the reason why this
+ this has more code than other implementations.
+- data structure from array of struct to struct of arrays for better
+  cache locality and higher chance of vectorization
+
+compile with g++ 7.2
+with these flags -std=c++17 -march=native -O3 -msse -msse2 -msse3
+*/
+
+#include <array>
+#include <cstdio>
+#include <thread>
+#include <algorithm>
+#include <mutex>
+#include <condition_variable>
+#include <atomic>
+
+struct Config
+{
+    static constexpr unsigned max_threads = 8;
+    static constexpr unsigned min_threads = 1;
+    static constexpr unsigned chars_per_line = 60;
+    static constexpr unsigned max_proc = 4;
+    static constexpr unsigned min_thread_work = (chars_per_line + 1) * 84;
+    static constexpr unsigned max_thread_buff = (chars_per_line + 1) * 1024;
+
+    static constexpr unsigned im = 139968, ia = 3877, ic = 29573;
+    static constexpr float reciprocal = 1.0f / im;
+    static inline FILE* output = stdout;
+};
+
+class ThreadMgr
+{
+    struct Proc {
+        std::atomic<bool>   once;
+        std::mutex          mtx;
+        unsigned            index, index_reset;
+    };
+    std::atomic<unsigned>       _out_sequence = 0;
+    std::mutex                  _gen_mutex, _out_mutex;
+    std::condition_variable     _cv;
+    Proc                        _proc[ Config::max_proc ];
+    unsigned                    _id_ctr = 0, _threads = 1, _proc_ctr = 0;
+public:
+    struct Context {
+        unsigned        index;
+        union {
+            alignas(16) std::array< int, Config::max_thread_buff > ibuff;
+            alignas(16) std::array< char, Config::max_thread_buff > cbuff;
+        };
+    };
+
+    void SetThreadCount( unsigned count ) {
+        _threads = std::clamp( count, Config::min_threads,
+                               Config::max_threads );
+    }
+
+    inline void SyncOutput( unsigned index, const auto& function ) {
+        std::unique_lock lock(_out_mutex);
+        _cv.wait( lock, [=]{ return index == _out_sequence; });
+        function();
+        ++_out_sequence;
+        _cv.notify_all();
+    }
+
+    auto Make( const std::string_view& text, unsigned size, auto output ) {
+        unsigned id = _id_ctr++;
+        auto& proc = _proc[ _proc_ctr++ ];
+        return [=, &proc]( ThreadMgr::Context& tdata ) {
+            if( proc.once.exchange(false) )
+                this->SyncOutput( id, [&]{
+                std::fwrite( text.data(), 1, text.size(), Config::output );
+                output( size, tdata );
+            });
+        };
+    }
+
+    auto Make( const std::string_view& text, unsigned size,
+               auto generate, auto convert, auto output ) {
+        unsigned work_count = std::clamp( size / _threads,
+                                          Config::min_thread_work,
+                                          Config::max_thread_buff );
+        unsigned id = _id_ctr++;
+        unsigned limit = id + (size / work_count) + (size % work_count != 0);
+        _id_ctr = limit + 1;
+        auto& proc = _proc[ _proc_ctr++ ];
+        proc.index_reset = id + 1;
+
+        return [=, &proc]( ThreadMgr::Context& tdata ) {
+            if( proc.once.exchange(false) )
+                this->SyncOutput( id, [&] {
+                std::fwrite( text.data(), 1, text.size(), Config::output );
+            });
+
+            unsigned start_index = id + 1;
+            while(true) {
+                unsigned cvsize;
+                {
+                    std::scoped_lock lock(_gen_mutex);
+                    {
+                        std::scoped_lock lock(proc.mtx);
+                        if( proc.index > limit )
+                            break;
+                        tdata.index = proc.index++;
+                    }
+                    cvsize = std::min( work_count, size -
+                                       (((tdata.index - start_index)
+                                         * work_count)) );
+                    generate( cvsize, tdata );
+                }
+
+                unsigned char_count = convert( cvsize, start_index,
+                                               work_count, tdata );
+
+                this->SyncOutput( tdata.index, [&] {
+                    if( tdata.index == limit &&
+                            tdata.cbuff[char_count-1] != '\n' )
+                        tdata.cbuff[char_count++] = '\n';
+                    output( char_count, tdata );
+                });
+            }
+        };
+    }
+
+    void RunSequence( auto... f ){
+        static_assert( sizeof...(f) <= Config::max_proc,
+                       "function chain is too large, "
+                       "increase Config::max_proc." );
+        std::thread threads[ Config::max_threads ];
+
+        _out_sequence = 0;
+        _proc_ctr = 0;
+        for( auto& i : _proc ) {
+            i.once = true;
+            i.index = i.index_reset;
+        }
+
+        for( unsigned i = 0; i < _threads; ++i )
+            threads[i] = std::thread( [=]{
+                Context tdata = {0};
+                (f( tdata ), ...);
+            });
+
+        for( unsigned i = 0; i < _threads; ++i )
+            threads[i].join();
+    }
+} gdata;
+
+int main(int argc, char *argv[])
+{
+    static constexpr char alu[] =
+        "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAG"
+        "GTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGC"
+        "CGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCC"
+        "GGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTC"
+        "CGTCTCAAAAA";
+
+    static constexpr std::array< std::pair< float, char >, 15> iub = {{
+         { 0.27f, 'a' }, { 0.12f, 'c' }, { 0.12f, 'g' }, { 0.27f, 't' },
+         { 0.02f, 'B' }, { 0.02f, 'D' }, { 0.02f, 'H' }, { 0.02f, 'K' },
+         { 0.02f, 'M' }, { 0.02f, 'N' }, { 0.02f, 'R' }, { 0.02f, 'S' },
+         { 0.02f, 'V' }, { 0.02f, 'W' }, { 0.02f, 'Y' }
+    }};
+
+    static constexpr std::array< std::pair< float, char >, 4> homosapiens = {{
+         { 0.3029549426680f, 'a' },
+         { 0.1979883004921f, 'c' },
+         { 0.1975473066391f, 'g' },
+         { 0.3015094502008f, 't' }
+    }};
+
+    static auto make_commulative = []( const auto& p ) {
+        struct Ret {
+            alignas(16) std::array< float, p.size() >  real;
+            alignas(16) std::array< char, p.size() >  ch;
+        } ret{{p[0].first}, {p[0].second}};
+
+        for(unsigned i = 1; i < p.size(); ++i ) {
+            ret.real[i] = ret.real[i-1] + p[i].first;
+            ret.ch[i] = p[i].second;
+        }
+        return ret;
+    };
+
+    static auto get_random = [] {
+        static unsigned last = 42;
+        return (last = (last * Config::ia + Config::ic) % Config::im);
+    };
+
+    static auto convert = []( const auto& data, unsigned size,
+            unsigned start_index, unsigned max_work,
+            ThreadMgr::Context& tdata ) {
+        auto iitr = tdata.ibuff.begin(), iend = iitr + size;
+        auto citr = tdata.cbuff.begin();
+        auto inner_loop = [&]( auto end ) {
+            while( iitr != end ) {
+                auto f = *iitr++ * Config::reciprocal;
+                auto i = std::find_if( data.real.begin(),
+                                       data.real.end(),
+                                       [f]( auto t ){ return f <= t; });
+                *citr++ = data.ch[ i - data.real.begin() ];
+            }
+            *citr++ = '\n';
+        };
+
+        unsigned aligner_count = Config::chars_per_line -
+                ((max_work * (tdata.index - start_index))
+                 % Config::chars_per_line);
+
+        if( aligner_count > 0 )
+            inner_loop( iitr + aligner_count );
+
+        unsigned odd_count = (iend - iitr) % Config::chars_per_line;
+        auto iend_blocks = iend - odd_count;
+        while( iitr < iend_blocks )
+            inner_loop( iitr + Config::chars_per_line );
+
+        if( odd_count ) {
+            inner_loop( iitr + odd_count );
+            citr -= 1;
+        }
+        return citr - tdata.cbuff.begin();
+    };
+
+    auto generate = []( unsigned size, ThreadMgr::Context& tdata ) {
+        for( auto i = tdata.ibuff.begin(), end = i + size; i != end; ++i )
+            *i = get_random();
+    };
+
+    auto output = []( unsigned size, ThreadMgr::Context& tdata ) {
+        std::fwrite( tdata.cbuff.data(), 1, size, Config::output );
+    };
+
+    auto alu_out = [&]( unsigned size, ThreadMgr::Context& ) {
+        static constexpr auto alu_data = [&] {
+            constexpr unsigned char_count = (Config::chars_per_line + 1)
+                    * (sizeof(alu) - 1);
+
+            std::array< char, char_count > ret{};
+            for(unsigned i = 0, j = 0; i < char_count; ++i, ++j) {
+                unsigned ul = std::min(i + Config::chars_per_line, char_count);
+                for(; i < ul; ++i )
+                    ret[i] = alu[ (i - j) % (sizeof(alu) - 1)];
+                ret[i] = '\n';
+            }
+            return ret;
+        }();
+
+        constexpr unsigned char_count = Config::chars_per_line *
+                (sizeof(alu) - 1);
+        for(; size >= char_count; size -= char_count )
+            std::fwrite( alu_data.data(), 1, alu_data.size(), Config::output );
+        std::fwrite( alu_data.data(), 1, size +
+                     (size / Config::chars_per_line), Config::output );
+        std::putc('\n', Config::output );
+    };
+
+    auto iub_convert = []( unsigned size, unsigned start_index,
+            unsigned max_work, ThreadMgr::Context& tdata ) {
+        static constexpr auto iub_data = make_commulative( iub );
+        return convert( iub_data, size, start_index, max_work, tdata  );
+    };
+
+    auto homo_convert = []( unsigned size, unsigned start_index,
+            unsigned max_work, ThreadMgr::Context& tdata ) {
+        static constexpr auto hom_data = make_commulative( homosapiens );
+        return convert( hom_data, size, start_index, max_work, tdata  );
+    };
+
+    gdata.SetThreadCount( std::thread::hardware_concurrency() );
+
+    unsigned n = std::max( 1000, std::atoi( argv[1] ) );
+
+    const auto alu_proc = gdata.Make( ">ONE Homo sapiens alu\n",
+                                      n * 2, alu_out );
+
+    const auto iub_proc = gdata.Make( ">TWO IUB ambiguity codes\n",
+                                      n * 3, generate, iub_convert, output );
+
+    const auto homo_proc = gdata.Make( ">THREE Homo sapiens frequency\n",
+                                       n * 5, generate, homo_convert, output );
+
+    gdata.RunSequence( alu_proc, iub_proc, homo_proc );
+    return 0;
+}
diff --git a/cpp_src/knucleotide.c++ b/cpp_src/knucleotide.c++
new file mode 100644
index 0000000..8d9633d
--- /dev/null
+++ b/cpp_src/knucleotide.c++
@@ -0,0 +1,220 @@
+// The Computer Language Benchmarks Game
+// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+//
+// Contributed by Sylvester Saguban
+// taken some inspirations from C++ G++ #3 from Branimir Maksimovic
+//
+// Improvements to algorithm of C++ G++ #3:
+// - Only doing incremental update to key instead of
+//   recomputing it for every insert to hash table,
+//   this aligns it to the other fast implementations.
+//   Notably C GCC, Rust #6, Rust #4.
+// - Returning the hash object by moving instead of
+//   returning by copy.
+// - Using std::thread instead of std::async so routines
+//   are guaranteed to run on their own threads.
+
+// Improvements aimed at better compiler optimizations:
+// - Passing the count/string length as a function
+//   template argument so it is known in advance by
+//   the compiler. For programming languages without
+//   value generics, the same optimization can done
+//   by making those values as constants.
+// - 'Key' class uses this template value so 'size'
+//   member variable is not needed inside the class,
+//   this also reduced the memory usage against the
+//   original implementation.
+
+// compile with these flags:
+// -std=c++17 -march=native -msse -msse2 -msse3 -O3
+
+#include <iostream>
+#include <iomanip>
+#include <cstdint>
+#include <string>
+#include <algorithm>
+#include <map>
+#include <thread>
+#include <type_traits>
+#include <cstring>
+#include <vector>
+#include <cassert>
+#include <ext/pb_ds/assoc_container.hpp>
+
+struct Cfg {
+    static constexpr size_t thread_count = 4;
+    static constexpr unsigned to_char[4] = {'A', 'C', 'T', 'G'};
+    static inline unsigned char to_num[128];
+    using Data = std::vector<unsigned char>;
+
+    Cfg() {
+        to_num['A'] = to_num['a'] = 0;
+        to_num['C'] = to_num['c'] = 1;
+        to_num['T'] = to_num['t'] = 2;
+        to_num['G'] = to_num['g'] = 3;
+    }
+} const cfg;
+
+template <size_t size>
+struct Key
+{
+    // select type to use for 'data', if hash key can fit on 32-bit integer
+    // then use uint32_t else use uint64_t.
+    using Data = typename std::conditional<size<=16, uint32_t, uint64_t>::type;
+
+    struct Hash {
+        Data operator()(const Key& t)const{ return t._data; }
+    };
+
+    Key(Data data) : _data(data) {
+    }
+
+    // uses std::string_view instead of std::string because std::string always
+    // allocates a copy from the heap. while std::string_view is only a wrapper
+    // of a pointer and a size
+    Key(const std::string_view& str) {
+        _data = 0;
+        for(unsigned i = 0; i < size; ++i){
+            _data <<= 2;
+            _data |= cfg.to_num[unsigned(str[i])];
+        }
+    }
+
+    // initialize hash from input data
+    void InitKey(auto data){
+        for(unsigned i = 0; i < size; ++i){
+            _data <<= 2;
+            _data |= data[i];
+        }
+    }
+
+    // updates the key with 1 byte
+    void UpdateKey(auto data){
+        _data <<= 2;
+        _data |= data;
+    }
+
+    // masks out excess information
+    void MaskKey(){
+        _data &= _mask;
+    }
+
+    // implicit casting operator to string
+    operator std::string() const {
+        std::string tmp;
+        Data data = _data;
+        for(size_t i = 0; i != size; ++i, data >>= 2)
+            tmp += cfg.to_char[data & 3ull];
+        std::reverse(tmp.begin(), tmp.end());
+        return std::move(tmp);
+    }
+
+    bool operator== (const Key& in) const {
+        return _data == in._data;
+    }
+private:
+    static constexpr Data _mask = ~(Data(-1) << (2 * size));
+    Data _data;
+};
+
+template <size_t size, typename K = Key<size> >
+using HashTable = __gnu_pbds::cc_hash_table<K, unsigned, typename K::Hash>;
+
+template <size_t size>
+void Calculate(const Cfg::Data& input, size_t begin, HashTable<size>& table)
+{
+    // original implementation fully recomputes the hash key for each
+    // insert to the hash table. This implementation only partially
+    // updates the hash, this is the same with C GCC, Rust #6 and Rust #4
+    Key<size> key(0);
+    // initialize key
+    key.InitKey(input.data() + begin);
+    // use key to increment value
+    ++table[key];
+
+    auto itr_begin = input.data() + begin + cfg.thread_count;
+    auto itr_end = (input.data() + input.size() + 1) - size;
+    for(;itr_begin < itr_end; itr_begin += cfg.thread_count) {
+        // update the key 1 byte at a time
+        constexpr size_t nsize = std::min(size, cfg.thread_count);
+        for(unsigned i = 0; i < nsize; ++i)
+            key.UpdateKey( itr_begin[i] );
+        // then finally mask out excess information
+        key.MaskKey();
+        // then use key to increment value
+        ++table[key];
+    }
+}
+
+template <size_t size>
+auto CalculateInThreads(const Cfg::Data& input)
+{
+    HashTable<size> hash_tables[cfg.thread_count];
+    std::thread threads[cfg.thread_count];
+
+    auto invoke = [&](unsigned begin) {
+        Calculate<size>(input, begin, hash_tables[begin]);
+    };
+
+    for(unsigned i = 0; i < cfg.thread_count; ++i)
+        threads[i] = std::thread(invoke, i);
+
+    for(auto& i : threads)
+        i.join();
+
+    auto& frequencies = hash_tables[0];
+    for(unsigned i = 1 ; i < cfg.thread_count; ++i)
+        for(auto& j : hash_tables[i])
+            frequencies[j.first] += j.second;
+    // return the 'frequency' by move instead of copy.
+    return std::move(frequencies);
+}
+
+template <unsigned size>
+void WriteFrequencies(const Cfg::Data& input)
+{
+    // we "receive" the returned object by move instead of copy.
+    auto&& frequencies = CalculateInThreads<size>(input);
+    std::map<unsigned, std::string, std::greater<unsigned>> freq;
+    for(const auto& i: frequencies)
+        freq.insert({i.second, i.first});
+
+    const unsigned sum = input.size() + 1 - size;
+    for(const auto& i : freq)
+        std::cout << i.second << ' ' << (sum ? double(100 * i.first) / sum : 0.0) << '\n';
+    std::cout << '\n';
+}
+
+template <unsigned size>
+void WriteCount( const Cfg::Data& input, const std::string& text ) {
+    // we "receive" the returned object by move instead of copy.
+    auto&& frequencies = CalculateInThreads<size>(input);
+    std::cout << frequencies[Key<size>(text)] << '\t' << text << '\n';
+}
+
+int main()
+{
+    Cfg::Data data;
+    std::array<char, 256> buf;
+
+    while(fgets(buf.data(), buf.size(), stdin) && memcmp(">THREE", buf.data(), 6));
+    while(fgets(buf.data(), buf.size(), stdin) && buf.front() != '>') {
+        if(buf.front() != ';'){
+            auto i = std::find(buf.begin(), buf.end(), '\n');
+            data.insert(data.end(), buf.begin(), i);
+        }
+    }
+    std::transform(data.begin(), data.end(), data.begin(), [](auto c){
+        return cfg.to_num[c];
+    });
+    std::cout << std::setprecision(3) << std::setiosflags(std::ios::fixed);
+
+    WriteFrequencies<1>(data);
+    WriteFrequencies<2>(data);
+    // value at left is the length of the passed string.
+    WriteCount<3>(data, "GGT");
+    WriteCount<4>(data, "GGTA");
+    WriteCount<6>(data, "GGTATT");
+    WriteCount<12>(data, "GGTATTTTAATT");
+    WriteCount<18>(data, "GGTATTTTAATTTATAGT");
+}
diff --git a/cpp_src/make.jl b/cpp_src/make.jl
new file mode 100644
index 0000000..5ddd2b1
--- /dev/null
+++ b/cpp_src/make.jl
@@ -0,0 +1,10 @@
+cd(@__DIR__)
+include("make_flags.jl")
+for cmd in make_cmds
+  try
+    run(`g++ $cmd`)
+    @info "success"
+  catch e
+    @warn "failed command" exception = e
+  end
+end
diff --git a/cpp_src/make_flags.jl b/cpp_src/make_flags.jl
new file mode 100644
index 0000000..df1e723
--- /dev/null
+++ b/cpp_src/make_flags.jl
@@ -0,0 +1,22 @@
+make_cmds = [
+   ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-fopenmp", "-I/usr/include/apr-1.0", "binarytrees.c++", "-o", "binarytrees.c++.o"],
+   ["binarytrees.c++.o", "-o", "binarytrees", "-fopenmp", "-lapr-1"],
+   ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-std=c++11", "-fopenmp", "fannkuchredux.c++", "-o", "fannkuchredux.c++.o"],
+   ["fannkuchredux.c++.o", "-o", "fannkuchredux", "-fopenmp"],
+   ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-mfpmath=sse", "-msse3", "-std=c++17", "fasta.c++", "-o", "fasta.c++.o"],
+   ["fasta.c++.o", "-o", "fasta", "-lpthread"],
+   ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-std=c++17", "knucleotide.c++", "-o", "knucleotide.c++.o"],
+   ["knucleotide.c++.o", "-o", "knucleotide", "-lpthread"],
+   ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-mfpmath=sse", "-msse3", "-fopenmp", "mandelbrot.c++", "-o", "mandelbrot.c++.o"],
+   ["mandelbrot.c++.o", "-o", "mandelbrot", "-fopenmp"],
+   ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-mfpmath=sse", "-msse3", "nbody.c++", "-o", "nbody.c++.o"],
+   ["nbody.c++.o", "-o", "nbody"],
+   ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-std=c++14", "-g", "pidigits.c++", "-o", "pidigits.c++.o"],
+   ["pidigits.c++.o", "-o", "pidigits", "-lgmp", "-lgmpxx"],
+   ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-std=c++17", "-fopenmp", "-flto", "regexredux.c++", "-o", "regexredux.c++.o"],
+   ["regexredux.c++.o", "-o", "regexredux", "-fopenmp", "-lpcre"],
+   ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-std=c++11", "-mtune=native", "-mfpmath=sse", "-msse2", "revcomp.c++", "-o", "revcomp.c++.o"],
+   ["revcomp.c++.o", "-o", "revcomp", "-pthread"],
+   ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-mfpmath=sse", "-msse3", "-fopenmp", "spectralnorm.c++", "-o", "spectralnorm.c++.o"],
+   ["spectralnorm.c++.o", "-o", "spectralnorm", "-fopenmp"],
+]
diff --git a/cpp_src/mandelbrot.c++ b/cpp_src/mandelbrot.c++
new file mode 100644
index 0000000..17a13b8
--- /dev/null
+++ b/cpp_src/mandelbrot.c++
@@ -0,0 +1,207 @@
+// The Computer Language Benchmarks Game
+// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+//
+// Contributed by Kevin Miller ( as C code )
+//
+// Ported to C++ with minor changes by Dave Compton
+// Optimized to x86 by Kenta Yoshimura
+//
+// Compile with following g++ flags
+// Use '-O3 -ffp-contract=off -fno-expensive-optimizations' instead of '-Ofast',
+// because FMA is fast, but different precision to original version
+//   -Wall -O3 -ffp-contract=off -fno-expensive-optimizations -march=native -fopenmp --std=c++14 mandelbrot.cpp
+
+#include <immintrin.h>
+#include <stdio.h>
+#include <stdint.h>
+
+using namespace std;
+
+namespace {
+
+#if defined(__AVX512BW__)
+    typedef __m512d Vec;
+    Vec vec_init(double value)       { return _mm512_set1_pd(value); }
+    bool vec_is_any_le(Vec v, Vec f) { return bool(_mm512_cmp_pd_mask(v, f, _CMP_LE_OS)); }
+    int vec_is_le(Vec v1, Vec v2)    { return _mm512_cmp_pd_mask(v1, v2, _CMP_LE_OS); }
+    const uint8_t k_bit_rev[] =
+    {
+        0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
+        0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
+        0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
+        0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
+        0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
+        0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
+        0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
+        0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
+        0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
+        0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
+        0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
+        0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
+        0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
+        0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
+        0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
+        0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
+    };
+#elif defined(__AVX__)
+    typedef __m256d Vec;
+    Vec vec_init(double value)       { return _mm256_set1_pd(value); }
+    bool vec_is_any_le(Vec v, Vec f) { Vec m = v<=f; return ! _mm256_testz_pd(m, m); }
+    int vec_is_le(Vec v1, Vec v2)    { return _mm256_movemask_pd(v1 <= v2); }
+    const uint8_t k_bit_rev[] =
+    {
+        0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
+        0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111
+    };
+#elif defined(__SSE4_1__)
+    typedef __m128d Vec;
+    Vec vec_init(double value)       { return _mm_set1_pd(value); }
+    bool vec_is_any_le(Vec v, Vec f) { __m128i m = __m128i(v<=f); return ! _mm_testz_si128(m, m); }
+    int vec_is_le(Vec v1, Vec v2)    { return _mm_movemask_pd(v1 <= v2); }
+    const uint8_t k_bit_rev[] = { 0b00, 0b10, 0b01, 0b11 };
+#elif defined(__SSSE3__)
+    typedef __m128d Vec;
+    Vec vec_init(double value)       { return _mm_set1_pd(value); }
+    bool vec_is_any_le(Vec v, Vec f) { return bool(_mm_movemask_pd(v<=f)); }
+    int vec_is_le(Vec v1, Vec v2)    { return _mm_movemask_pd(v1 <= v2); }
+    const uint8_t k_bit_rev[] = { 0b00, 0b10, 0b01, 0b11 };
+#endif
+
+    constexpr int k_vec_size = sizeof(Vec) / sizeof(double);
+
+    // Return true iff all of 8 members of vector v1 is
+    // NOT less than or equal to v2.
+    bool vec_all_nle(const Vec* v1, Vec v2)
+    {
+        for ( auto i = 0; i < 8/k_vec_size; i++ ) {
+            if ( vec_is_any_le(v1[i], v2) ) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // Return 8 bit value with bits set iff cooresponding
+    // member of vector value is less than or equal to limit.
+    unsigned pixels(const Vec* value, Vec limit)
+    {
+        unsigned res = 0;
+        for ( auto i = 0; i < 8/k_vec_size; i++ ) {
+            res <<= k_vec_size;
+            res |= k_bit_rev[vec_is_le(value[i], limit)];
+        }
+        return res;
+    }
+
+    //
+    // Do one iteration of mandelbrot calculation for a vector of eight
+    // complex values.  Using Vec to work with groups of doubles speeds
+    // up computations.
+    //
+    void calcSum(Vec* real, Vec* imag, Vec* sum, const Vec* init_real, Vec init_imag)
+    {
+        for ( auto vec = 0; vec < 8/k_vec_size; vec++ ) {
+            auto r2 = real[vec] * real[vec];
+            auto i2 = imag[vec] * imag[vec];
+            auto ri = real[vec] * imag[vec];
+
+            sum[vec] = r2 + i2;
+
+            real[vec]=r2 - i2 + init_real[vec];
+            imag[vec]=ri + ri + init_imag;
+        }
+    }
+
+    //
+    // Do 50 iterations of mandelbrot calculation for a vector of eight
+    // complex values.  Check occasionally to see if the iterated results
+    // have wandered beyond the point of no return (> 4.0).
+    //
+    unsigned mand8(bool to_prune, const Vec* init_real, Vec init_imag)
+    {
+        Vec k4_0 = vec_init(4.0);
+        Vec real[8 / k_vec_size];
+        Vec imag[8 / k_vec_size];
+        Vec sum[8 / k_vec_size];
+        for ( auto k = 0; k < 8/k_vec_size; k++ ) {
+            real[k] = init_real[k];
+            imag[k] = init_imag;
+        }
+
+        if ( to_prune ) {
+            // 4*12 + 2 = 50
+            for ( auto j = 0; j < 12; j++ ) {
+                for ( auto k = 0; k < 4; k++ ) {
+                    calcSum(real, imag, sum, init_real, init_imag);
+                }
+                if ( vec_all_nle(sum, k4_0) ) {
+                    return 0; // prune
+                }
+            }
+            calcSum(real, imag, sum, init_real, init_imag);
+            calcSum(real, imag, sum, init_real, init_imag);
+        } else {
+            // 6*8 + 2 = 50
+            for ( auto j = 0; j < 8; j++ ) {
+                for ( auto k = 0; k < 6; k++ ) {
+                    calcSum(real, imag, sum, init_real, init_imag);
+                }
+            }
+            calcSum(real, imag, sum, init_real, init_imag);
+            calcSum(real, imag, sum, init_real, init_imag);
+        }
+
+        return pixels(sum, k4_0);
+    }
+
+} // namespace
+
+int main(int argc, char ** argv)
+{
+    // get width/height from arguments
+
+    auto wid_ht = 16000;
+    if ( argc >= 2 ) {
+        wid_ht = atoi(argv[1]);
+    }
+
+    // round up to multiple of 8
+    wid_ht = -(-wid_ht & -8);
+    auto width = wid_ht;
+    auto height = wid_ht;
+
+    // allocate memory for pixels.
+    auto dataLength = height*(width>>3);
+    auto pixels = new uint8_t[dataLength];
+
+    // calculate initial x values, store in r0
+    Vec r0[width / k_vec_size];
+    double* r0_ = reinterpret_cast<double*>(r0);
+    for ( auto x = 0; x < width; x++ ) {
+        r0_[x] = 2.0 / width * x - 1.5;
+    }
+
+    // generate the bitmap
+
+    // process 8 pixels (one byte) at a time
+    #pragma omp parallel for schedule(guided)
+    for ( auto y = 0; y < height; y++ ) {
+        // all 8 pixels have same y value (iy).
+        auto iy = 2.0 / height *  y - 1.0;
+        Vec init_imag = vec_init(iy);
+        auto rowstart = y*width/8;
+        bool to_prune = false;
+        for ( auto x = 0; x < width; x += 8 ) {
+            auto res = mand8(to_prune, &r0[x/k_vec_size], init_imag);
+            pixels[rowstart + x/8] = res;
+            to_prune = ! res;
+        }
+    }
+
+    // write the data
+    printf("P4\n%d %d\n", width, height);
+    fwrite(pixels, 1, dataLength, stdout);
+    delete[] pixels;
+
+    return 0;
+}
diff --git a/cpp_src/nbody.c++ b/cpp_src/nbody.c++
new file mode 100644
index 0000000..a364ea7
--- /dev/null
+++ b/cpp_src/nbody.c++
@@ -0,0 +1,182 @@
+/* The Computer Language Benchmarks Game
+   https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+   contributed by Mark C. Lewis
+   modified slightly by Chad Whipkey
+   converted from java to c++,added sse support, by Branimir Maksimovic
+   converted from c++ to c, by Alexey Medvedchikov 
+   converted from c to c++11, by Walter Landry
+*/
+
+#include <stdio.h>
+#include <cmath>
+#include <stdlib.h>
+#include <immintrin.h>
+#include <array>
+
+constexpr double PI(3.141592653589793);
+constexpr double SOLAR_MASS ( 4 * PI * PI );
+constexpr double DAYS_PER_YEAR(365.24);
+
+struct body {
+  double x[3], fill, v[3], mass;
+  constexpr body(double x0, double x1, double x2, double v0, double v1, double v2,  double Mass):
+    x{x0,x1,x2}, fill(0), v{v0,v1,v2}, mass(Mass) {}
+};
+
+class N_Body_System
+{
+  static std::array<body,5> bodies;
+
+  void offset_momentum()
+  {
+    unsigned int k;
+    for(auto &body: bodies)
+      for(k = 0; k < 3; ++k)
+        bodies[0].v[k] -= body.v[k] * body.mass / SOLAR_MASS;
+  }
+
+public:
+  N_Body_System()
+  {
+    offset_momentum();
+  }
+  void advance(double dt)
+  {
+    constexpr unsigned int N = ((bodies.size() - 1) * bodies.size()) / 2;
+
+    static double r[N][4];
+    static double mag[N];
+
+    unsigned int i, m;
+    __m128d dx[3], dsquared, distance, dmag;
+
+    i=0;
+    for(auto bi(bodies.begin()); bi!=bodies.end(); ++bi)
+      {
+        auto bj(bi);
+        for(++bj; bj!=bodies.end(); ++bj, ++i)
+          for (m=0; m<3; ++m)
+            r[i][m] = bi->x[m] - bj->x[m];
+      }
+
+    for (i=0; i<N; i+=2)
+      {
+        for (m=0; m<3; ++m)
+          {
+            dx[m] = _mm_loadl_pd(dx[m], &r[i][m]);
+            dx[m] = _mm_loadh_pd(dx[m], &r[i+1][m]);
+          }
+
+        dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
+        distance = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dsquared)));
+
+        for (m=0; m<2; ++m)
+          distance = distance * _mm_set1_pd(1.5)
+            - ((_mm_set1_pd(0.5) * dsquared) * distance)
+            * (distance * distance);
+
+        dmag = _mm_set1_pd(dt) / (dsquared) * distance;
+        _mm_store_pd(&mag[i], dmag);
+      }
+
+    i=0;
+    for(auto bi(bodies.begin()); bi!=bodies.end(); ++bi)
+      {
+        auto bj(bi);
+        for(++bj; bj!=bodies.end(); ++bj, ++i)
+          for(m=0; m<3; ++m)
+            {
+              bi->v[m] -= r[i][m] * bj->mass * mag[i];
+              bj->v[m] += r[i][m] * bi->mass * mag[i];
+            }
+      }
+
+    for(auto &body: bodies)
+      for(m=0; m<3; ++m)
+        body.x[m] += dt * body.v[m];
+  }
+
+  double energy()
+  {
+    double e(0.0);
+    for(auto bi(bodies.begin()); bi!=bodies.end(); ++bi)
+      {
+        e += bi->mass * ( bi->v[0] * bi->v[0]
+                          + bi->v[1] * bi->v[1]
+                          + bi->v[2] * bi->v[2] ) / 2.;
+
+        auto bj(bi);
+        for(++bj; bj!=bodies.end(); ++bj)
+          {
+            std::array<double,3> dx;
+            for(auto k=0; k<3; ++k)
+              dx[k] = bi->x[k] - bj->x[k];
+
+            double distance(0);
+            for(auto &d: dx)
+              distance+=d*d;
+            distance=std::sqrt(distance);
+            e -= (bi->mass * bj->mass) / distance;
+          }
+      }
+    return e;
+  }
+};
+
+
+std::array<body,5> N_Body_System::bodies{{
+    /* sun */
+    body(0., 0., 0. ,
+         0., 0., 0. ,
+         SOLAR_MASS),
+    /* jupiter */
+    body(4.84143144246472090e+00,
+         -1.16032004402742839e+00,
+         -1.03622044471123109e-01 ,
+         1.66007664274403694e-03 * DAYS_PER_YEAR,
+         7.69901118419740425e-03 * DAYS_PER_YEAR,
+         -6.90460016972063023e-05 * DAYS_PER_YEAR ,
+         9.54791938424326609e-04 * SOLAR_MASS
+         ),
+    /* saturn */
+    body(8.34336671824457987e+00,
+         4.12479856412430479e+00,
+         -4.03523417114321381e-01 ,
+         -2.76742510726862411e-03 * DAYS_PER_YEAR,
+         4.99852801234917238e-03 * DAYS_PER_YEAR,
+         2.30417297573763929e-05 * DAYS_PER_YEAR ,
+         2.85885980666130812e-04 * SOLAR_MASS
+         ),
+    /* uranus */
+    body(1.28943695621391310e+01,
+         -1.51111514016986312e+01,
+         -2.23307578892655734e-01 ,
+         2.96460137564761618e-03 * DAYS_PER_YEAR,
+         2.37847173959480950e-03 * DAYS_PER_YEAR,
+         -2.96589568540237556e-05 * DAYS_PER_YEAR ,
+         4.36624404335156298e-05 * SOLAR_MASS
+         ),
+    /* neptune */
+    body(1.53796971148509165e+01,
+         -2.59193146099879641e+01,
+         1.79258772950371181e-01 ,
+         2.68067772490389322e-03 * DAYS_PER_YEAR,
+         1.62824170038242295e-03 * DAYS_PER_YEAR,
+         -9.51592254519715870e-05 * DAYS_PER_YEAR ,
+         5.15138902046611451e-05 * SOLAR_MASS
+         )
+  }};
+
+int main(int , char** argv)
+{
+  int i, n = atoi(argv[1]);
+  N_Body_System system;
+
+  printf("%.9f\n", system.energy());
+  for (i = 0; i < n; ++i)
+    system.advance(0.01);
+  printf("%.9f\n", system.energy());
+
+  return 0;
+}
diff --git a/cpp_src/pidigits.c++ b/cpp_src/pidigits.c++
new file mode 100644
index 0000000..43798de
--- /dev/null
+++ b/cpp_src/pidigits.c++
@@ -0,0 +1,68 @@
+/* The Computer Language Benchmarks Game
+ * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+ *
+ * contributed by Alessandro Power
+ */
+
+#include <gmpxx.h>
+#include <cstdlib>
+#include <iostream>
+
+class LFT {
+public:
+    mpz_class q;
+    mpz_class r;
+    mpz_class t;
+    unsigned k;
+
+public:
+    LFT() : q(1), r(0), t(1), k(0){};
+
+    void next() {
+        ++k;
+        r = (2 * k + 1) * (2 * q + r);
+        t = (2 * k + 1) * t;
+        q = q * k;
+    }
+
+    unsigned extract(unsigned x) const {
+        static mpz_class tmp0, tmp1;
+        tmp0 = q * x + r;
+        tmp1 = tmp0 / t;
+        return tmp1.get_ui();
+    }
+
+    void produce(unsigned n) {
+        q = 10 * q;
+        r = 10 * (r - n * t);
+    }
+};
+
+int main(int, char** argv) {
+    std::ios_base::sync_with_stdio(false);
+
+    const std::size_t TOTAL_DIGITS = std::atol(argv[1]);
+
+    LFT lft;
+    std::size_t n_digits = 0;
+    while (n_digits < TOTAL_DIGITS) {
+        std::size_t i = 0;
+        while (i < 10 and n_digits < TOTAL_DIGITS) {
+            lft.next();
+            if (lft.q > lft.r) continue;
+
+            auto digit = lft.extract(3);
+            if (digit == lft.extract(4)) {
+                std::cout << digit;
+                lft.produce(digit);
+                ++i;
+                ++n_digits;
+            }
+        }
+
+        // Pad digits with extra spaces if TOTAL_DIGITS was not a
+        // multiple of 10.
+        for (; i < 10; ++i) std::cout << ' ';
+        std::cout << "\t:" << n_digits << '\n';
+    }
+}
diff --git a/cpp_src/regexredux.c++ b/cpp_src/regexredux.c++
new file mode 100644
index 0000000..5bf82f3
--- /dev/null
+++ b/cpp_src/regexredux.c++
@@ -0,0 +1,141 @@
+/* The Computer Language Benchmarks Game
+   https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+   contributed by Filip Sajdak
+*/
+
+#include <pcre.h>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <string_view>
+
+template <typename T>
+auto reserve(size_t size) {
+   T out;
+   out.reserve(size);
+   return out;
+}
+
+template <auto initial_size = 16384, auto buffer_size = initial_size>
+auto load(std::istream& in) {
+      auto str = reserve<std::string>(initial_size);
+      auto buffer = std::array<char, buffer_size>();
+
+      while (in) {
+         in.read(buffer.data(), buffer.size());
+         str.append(buffer.cbegin(), buffer.cbegin()+in.gcount());
+      }
+      
+      return str;
+}
+
+template<typename T, typename Deleter>
+auto make_unique_with_deleter(T* ptr, Deleter&& deleter)
+{
+    return std::unique_ptr<T, Deleter>(ptr, std::forward<Deleter>(deleter));
+} 
+
+   auto make_regex(const std::string_view pattern) {
+      char const* error;
+      int offset;
+      return make_unique_with_deleter(
+         pcre_compile(pattern.data(), 0, &error, &offset, NULL), pcre_free);
+   }
+   
+   auto make_aid(const pcre* regex) {
+      char const* error;
+      return make_unique_with_deleter(
+         pcre_study(regex, PCRE_STUDY_JIT_COMPILE, &error), pcre_free_study);
+   }
+
+static void replace(const std::string_view pattern, const std::string_view replacement,
+       const std::string_view source, std::string& output, pcre_jit_stack* const stack){
+
+   const auto regex = make_regex(pattern);
+   const auto aid   = make_aid(regex.get());
+
+   int pos = 0;
+   
+   for(int match[3]; pcre_jit_exec(regex.get(), aid.get(), source.data(),
+         source.size(), pos, 0, match, 3, stack) >= 0; pos = match[1]){
+      output.append(std::cbegin(source) + pos, std::cbegin(source) + match[0]);
+      output.append(std::cbegin(replacement), std::cend(replacement));
+    }
+
+   output.append(std::cbegin(source) + pos, std::cend(source));
+}
+
+
+int main(void){
+   std::ios::sync_with_stdio(false);
+   
+    char const * const count_Info[]={
+        "agggtaaa|tttaccct",
+        "[cgt]gggtaaa|tttaccc[acg]",
+        "a[act]ggtaaa|tttacc[agt]t",
+        "ag[act]gtaaa|tttac[agt]ct",
+        "agg[act]taaa|ttta[agt]cct",
+        "aggg[acg]aaa|ttt[cgt]ccct",
+        "agggt[cgt]aa|tt[acg]accct",
+        "agggta[cgt]a|t[acg]taccct",
+        "agggtaa[cgt]|[acg]ttaccct"
+      }, * const replace_Info[][2]={
+        {"tHa[Nt]", "<4>"},
+        {"aND|caN|Ha[DS]|WaS", "<3>"},
+        {"a[NSt]|BY", "<2>"},
+        {"<[^>]*>", "|"},
+        {"\\|[^|][^|]*\\|", "-"}
+      };
+
+   auto sequences = reserve<std::string>(16384);
+    size_t postreplace_Size = 0;
+
+   std::string input = load(std::cin);
+
+    #pragma omp parallel
+    {
+      auto stack = make_unique_with_deleter(
+         pcre_jit_stack_alloc(16384, 16384), pcre_jit_stack_free);
+
+        #pragma omp single
+        {
+            replace(">.*\\n|\\n", "", input, sequences, stack.get());
+        }
+
+        #pragma omp single nowait
+        {
+            auto prereplace_String = sequences;
+            auto postreplace_String = reserve<std::string>(sequences.capacity());
+         
+         for ( const auto& [pattern, replacement] : replace_Info ) {
+            postreplace_String.clear();
+            replace(pattern, replacement, 
+               prereplace_String, postreplace_String, stack.get());
+            std::swap(prereplace_String, postreplace_String);
+            }
+
+            postreplace_Size = prereplace_String.size();
+        }
+
+        #pragma omp for schedule(dynamic) ordered
+        for(auto i=0u; i < std::size(count_Info); i++){
+
+         auto regex = make_regex(count_Info[i]);
+         auto aid = make_aid(regex.get());
+         
+         int count = 0;
+         for(int pos = 0, match[3]; pcre_jit_exec(regex.get(), aid.get(),
+               sequences.data(), sequences.size(), pos, 0, match, 3,
+               stack.get()) >= 0; pos=match[1]) {
+            ++count;
+            }
+
+            #pragma omp ordered
+            printf("%s %d\n", count_Info[i], count);
+        }
+    }
+
+    printf("\n%zu\n%zu\n%zu\n", input.size(), sequences.size(), postreplace_Size);
+    return 0;
+}
diff --git a/cpp_src/revcomp.c++ b/cpp_src/revcomp.c++
new file mode 100644
index 0000000..275d96b
--- /dev/null
+++ b/cpp_src/revcomp.c++
@@ -0,0 +1,282 @@
+/* The Computer Language Benchmarks Game
+https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+Contributed by Andrew Moon
+*/
+
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string.h>
+
+struct CPUs {
+   CPUs() {
+      cpu_set_t cs;
+      CPU_ZERO( &cs );
+      sched_getaffinity( 0, sizeof(cs), &cs );
+      count = 0;
+      for ( size_t i = 0; i < CPU_SETSIZE; i++ )
+         count += CPU_ISSET( i, &cs ) ? 1 : 0;
+      count = std::max( count, size_t(1) );
+   }
+
+   size_t count;
+} cpus;
+
+struct ReverseLookup {
+   ReverseLookup( const char *from, const char *to ) {
+      for ( int i = 0; i < 256; i++ )
+         byteLookup[i] = i;
+      for ( ; *from && *to; from++, to++ ) {
+         byteLookup[toupper(*from)] = *to;
+         byteLookup[tolower(*from)] = *to;
+      }
+
+      for ( int i = 0; i < 256; i++ )
+         for ( int j = 0; j < 256; j++ )
+            wordLookup[(i << 8) | j] = ( byteLookup[j] << 8 ) | byteLookup[i];
+   }
+
+   char operator[]( const char &c ) { return (char )byteLookup[(unsigned char )c]; }
+   short operator[]( const short &s ) { return (short )wordLookup[(unsigned short )s]; }
+
+protected:
+   unsigned char byteLookup[256];
+   unsigned short wordLookup[256*256];
+} lookup( "acbdghkmnsrutwvy", "TGVHCDMKNSYAAWBR" );
+
+template< class type >
+struct vector2 : public std::vector<type> {
+   type &last() { return this->operator[]( std::vector<type>::size() -1 ); }
+};
+
+struct Chunker {
+   enum { lineLength = 60, chunkSize = 65536, };
+
+   Chunker( int seq ) : id(seq) {}
+
+   struct Chunk {
+      Chunk() {}
+      Chunk( char *in, size_t amt ) : data(in), size(amt) {}
+      char *data;
+      size_t size;
+   };
+
+   void NewChunk() {
+      size_t cur = mark - chunkBase;
+      chunks.push_back( Chunk( chunkBase, cur ) );
+      chunkBase += ( cur + ( cur & 1 ) ); // keep it word aligned
+      mark = chunkBase;
+   }
+
+   template< int N >
+   struct LinePrinter {
+      LinePrinter() : lineFill(0) {}
+      void endofblock() { if ( lineFill ) newline(); }
+      void emit( const char *str, size_t amt ) { 
+         fwrite_unlocked( str, 1, amt, stdout );
+      }
+      void emit( char c ) { fputc_unlocked( c, stdout ); }
+      void emitnewline() { emit( '\n' ); }
+      void emitlines( char *data, size_t size ) {
+         if ( lineFill ) {
+            size_t toprint = std::min( size, lineLength - lineFill );
+            emit( data, toprint );
+            size -= toprint;
+            data += toprint;
+            lineFill += toprint;
+            if ( lineFill == lineLength )
+               newline();
+         }
+
+         while ( size >= lineLength ) {
+            emit( data, lineLength );
+            emitnewline();
+            size -= lineLength;
+            data += lineLength;
+         }
+
+         if ( size ) {
+            lineFill = size;
+            emit( data, size );
+         }
+      }
+      void newline() { lineFill = 0; emitnewline(); }
+      void reset() { lineFill = 0; }
+   protected:
+      size_t lineFill;
+   };
+
+   void Print() {
+      int prevId = -( id - 1 );
+      while ( __sync_val_compare_and_swap( &printQueue, prevId, id ) != prevId )
+         sched_yield();
+
+      fwrite_unlocked( name, 1, strlen( name ), stdout );
+      static LinePrinter<65536*2> line;
+      line.reset();
+      for ( int i = int(chunks.size()) - 1; i >= 0; i-- )
+         line.emitlines( chunks[i].data, chunks[i].size );
+      line.endofblock();
+
+      __sync_val_compare_and_swap( &printQueue, id, -id );
+   }
+
+   // fseek on stdin seems flaky so this hack. not called often
+   void Backup() {
+      while ( true ) {
+         if ( fgetc_unlocked( stdin ) == '>' ) {
+            fseek( stdin, -1, SEEK_CUR );
+            return;
+         }
+         fseek( stdin, -2, SEEK_CUR );
+      }
+   }
+
+   // input buffer can hold all of stdin, so no size checking
+   size_t Read( char *data ) {
+      if ( feof( stdin ) )
+         return 0;
+
+      name = data;
+      fgets_unlocked( name, 128, stdin );
+      mark = chunkBase = name + strlen( name ) + 1;
+      mark[lineLength] = -1;
+
+      while ( fgets_unlocked( mark, 128, stdin ) ) {
+         if ( *mark == '>' ) {
+            Backup();
+            break;
+         }
+
+         // mark trick should keep us from calling strlen
+         mark += ( mark[lineLength] != 0xa ) ? strlen( mark ) - 1 : lineLength;
+         if ( mark - chunkBase > chunkSize )
+            NewChunk();
+
+         mark[lineLength] = -1;
+      }
+
+      if ( mark - chunkBase )
+         NewChunk();
+      return ( chunkBase - data );
+   }
+
+   struct WorkerState {
+      Chunker *chunker;
+      size_t offset, count;
+      pthread_t handle;
+   };
+
+   static void *ReverseWorker( void *arg ) {
+      WorkerState *state = (WorkerState *)arg;
+      Chunker &chunker = *state->chunker;
+      for ( size_t i = 0; i < state->count; i++ ) {
+         Chunk &chunk = chunker[state->offset + i];
+         short *w = (short *)chunk.data, *bot = w, *top = w + ( chunk.size / 2 ) - 1;
+         for ( ; bot < top; bot++, top-- ) {
+            short tmp = lookup[*bot];
+            *bot = lookup[*top];
+            *top = tmp;
+         }
+         // if size is odd, final byte would reverse to the start (skip it)
+         if ( chunk.size & 1 )
+            chunk.data++;
+      }
+      return 0;
+   }
+
+   void Reverse() {
+      if ( !chunks.size() )
+         return;
+
+      // this takes so little time it's almost not worth parallelizing
+      vector2<WorkerState> threads;
+      threads.reserve( cpus.count );
+      size_t divs = chunks.size() / cpus.count;
+      for ( size_t i = 0, offset = 0; i < cpus.count; i++, offset += divs ) {
+         threads.push_back( WorkerState() );
+         WorkerState &ws = threads.last();
+         ws.chunker = this;
+         ws.count = ( i < cpus.count - 1 ) ? divs : chunks.size() - offset;
+         ws.offset = offset;
+         pthread_create( &ws.handle, 0, ReverseWorker, &ws );
+      }
+
+      for ( size_t i = 0; i < cpus.count; i++ )
+         pthread_join( threads[i].handle, 0 );
+   }
+
+   Chunk &operator[] ( size_t i ) { return chunks[i]; }
+
+protected:
+   vector2<Chunk> chunks;
+   char *name, *chunkBase, *mark;
+   int id;
+   static volatile int printQueue;
+};
+
+// used to order chunk printing
+volatile int Chunker::printQueue = 0;
+
+struct ReverseComplement {
+   ReverseComplement() {
+      // get stdin file size
+      long start = ftell( stdin );
+      fseek( stdin, 0, SEEK_END );
+      size = ftell( stdin ) - start;
+      fseek( stdin, start, SEEK_SET );
+
+      data = new char[size + 3];
+   }
+
+   ~ReverseComplement() {
+      delete[] data;
+   }
+
+   static void *ChunkerThread( void *arg ) {
+      Chunker *chunker = (Chunker *)arg;
+      chunker->Reverse();
+      chunker->Print();
+      return 0;
+   }
+
+   void Run() {
+      vector2<Chunker *> chunkers;
+      vector2<pthread_t> threads;
+
+      size_t cur = 0;
+      for ( int id = 1; true; id++ ) {
+         chunkers.push_back( new Chunker( id ) );
+
+         size_t read = chunkers.last()->Read( data + cur );
+         cur += read;
+         if ( !read )
+            break;
+
+         // spawn off a thread to finish this guy up while we read another chunk in
+         threads.push_back( 0 );
+         pthread_create( &threads.last(), 0, ChunkerThread, chunkers.last() );
+      }
+
+      for ( size_t i = 0; i < threads.size(); i++ )
+         pthread_join( threads[i], 0 );
+
+      for ( size_t i = 0; i < chunkers.size(); i++ )
+         delete chunkers[i];
+   }
+
+
+protected:
+   size_t size;
+   char *data;
+};
+
+
+int main( int argc, const char *argv[] ) {
+   ReverseComplement revcom;
+   revcom.Run();
+   return 0;
+}
diff --git a/cpp_src/spectralnorm.c++ b/cpp_src/spectralnorm.c++
new file mode 100644
index 0000000..36f9bfa
--- /dev/null
+++ b/cpp_src/spectralnorm.c++
@@ -0,0 +1,136 @@
+// The Computer Language Benchmarks Game
+// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+//
+// Original C contributed by Sebastien Loisel
+// Conversion to C++ by Jon Harrop
+// OpenMP parallelize by The Anh Tran
+// Add SSE by The Anh Tran
+// Additional SSE optimization by Krzysztof Jakubowski
+
+// g++ -pipe -O3 -march=native -fopenmp -mfpmath=sse -msse2 \
+//     ./spec.c++ -o ./spec.run
+
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <sched.h>
+#include <omp.h>
+#include <emmintrin.h>
+
+template <bool modei> int Index(int i, int j) {
+    return (((i + j) * (i + j + 1)) >> 1) + (modei? i : j) + 1;
+}
+
+template <bool modei>
+void EvalPart(double *__restrict__ src, double *__restrict__ dst,
+                int begin, int end, int length) {
+    int i = begin;
+
+    for(; i + 1 < end; i += 2) {
+        __m128d sum = _mm_set_pd(
+			src[0] / double(Index<modei>(i + 1, 0)),
+			src[0] / double(Index<modei>(i + 0, 0)));
+    
+		__m128d ti = modei?
+			_mm_set_pd(i + 1, i + 0) :
+			_mm_set_pd(i + 2, i + 1);	
+		__m128d last = _mm_set_pd(
+			Index<modei>(i + 1, 0),
+			Index<modei>(i + 0, 0));
+
+        for(int j = 1; j < length; j++) {
+			__m128d idx = last + ti + _mm_set1_pd(j);
+			last = idx;
+            sum = sum + _mm_set1_pd(src[j]) / idx;
+        }
+
+        _mm_storeu_pd(dst + i + 0, sum);
+    }
+    for(; i < end; i++) {
+        double sum = 0;
+        for (int j = 0; j < length; j++)
+            sum += src[j] / double(Index<modei>(i, j));
+        dst[i] = sum;
+    }
+
+}
+
+void EvalATimesU(double *src, double *dst, int begin, int end, int N) {
+    EvalPart<1>(src, dst, begin, end, N);
+}
+
+void EvalAtTimesU(double *src, double *dst, int begin, int end, int N) {
+    EvalPart<0>(src, dst, begin, end, N);
+}
+
+void EvalAtATimesU(double *src, double *dst, double *tmp,
+                   int begin, int end, int N) {
+    EvalATimesU (src, tmp, begin, end, N);
+    #pragma omp barrier
+    EvalAtTimesU(tmp, dst, begin, end, N);
+    #pragma omp barrier
+}
+
+int GetThreadCount() {
+    cpu_set_t cs;
+    CPU_ZERO(&cs);
+    sched_getaffinity(0, sizeof(cs), &cs);
+
+    int count = 0;
+    for (int i = 0; i < CPU_SETSIZE; ++i)
+        if (CPU_ISSET(i, &cs))
+            ++count;
+
+    return count;
+}
+
+double spectral_game(int N) {
+    __attribute__((aligned(16))) double u[N];
+    __attribute__((aligned(16))) double v[N], tmp[N];
+
+    double vBv = 0.0;
+    double vv = 0.0;
+
+    #pragma omp parallel default(shared) num_threads(GetThreadCount())
+    {
+        // this block will be executed by NUM_THREADS
+        // variable declared in this block is private for each thread
+        int threadid = omp_get_thread_num();
+        int threadcount = omp_get_num_threads();
+        int chunk = N / threadcount;
+
+        // calculate each thread's working range [r1 .. r2) => static schedule
+        int begin = threadid * chunk;
+        int end = (threadid < (threadcount -1)) ? (begin + chunk) : N;
+
+        for(int i = begin; i < end; i++)
+            u[i] = 1.0;
+        #pragma omp barrier
+
+        for (int ite = 0; ite < 10; ++ite) {
+            EvalAtATimesU(u, v, tmp, begin, end, N);
+            EvalAtATimesU(v, u, tmp, begin, end, N);
+        }
+    
+        double sumvb = 0.0, sumvv = 0.0;
+        for (int i = begin; i < end; i++) {
+            sumvv += v[i] * v[i];
+            sumvb += u[i] * v[i];
+        }
+
+        #pragma omp critical
+        {
+            vBv += sumvb;
+            vv += sumvv;
+        }
+    }
+
+    return sqrt(vBv / vv);
+}
+
+int main(int argc, char *argv[]) {
+    int N = ((argc >= 2) ? atoi(argv[1]) : 2000);
+    printf("%.9f\n", spectral_game(N));
+    return 0;
+}
+
diff --git a/fannkuchredux/cmain.jl b/fannkuchredux/cmain.jl
new file mode 100644
index 0000000..40b8553
--- /dev/null
+++ b/fannkuchredux/cmain.jl
@@ -0,0 +1,180 @@
+# The Computer Language Benchmarks Game
+# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+# based on Oleg Mazurov's Java Implementation and Jeremy Zerfas' C implementation
+# transliterated by Hamza Yusuf Çakır
+
+global const preferred_num_blocks = 24
+
+struct Fannkuch
+    n::Int64
+    blocksz::Int64
+    maxflips::Vector{Int32}
+    chksums::Vector{Int32}
+
+    function Fannkuch(n, nthreads)
+        nfact = factorial(n)
+
+        blocksz = nfact ÷ (nfact < preferred_num_blocks ? 1 : preferred_num_blocks)
+        maxflips = zeros(Int32, nthreads)
+        chksums = zeros(Int32, nthreads)
+
+        new(n, blocksz, maxflips, chksums)
+    end
+end
+
+struct Perm
+    p::Vector{Int8}
+    pp::Vector{Int8}
+    count::Vector{Int32}
+
+    function Perm(n)
+        p = zeros(Int8, n)
+        pp = zeros(Int8, n)
+        count = zeros(Int32, n)
+
+        new(p, pp, count)
+    end
+end
+
+Base.@propagate_inbounds @inline function first_permutation(perm::Perm, idx)
+    p = perm.p
+    pp = perm.pp
+
+    for i = 2:length(p)
+        p[i] = i - 1
+    end
+
+    for i = length(p):-1:2
+        ifact = factorial(i-1)
+        d = idx ÷ ifact
+        perm.count[i] = d
+        idx = idx % ifact
+
+        for j = 1:i
+            pp[j] = p[j]
+        end
+
+        for j = 1:i
+            p[j] = j+d <= i ? pp[j+d] : pp[j+d-i]
+        end
+    end
+end
+
+Base.@propagate_inbounds @inline function next_permutation(perm::Perm)
+    p = perm.p
+    count = perm.count
+
+    first = p[2]
+    p[2]  = p[1]
+    p[1]  = first
+
+    i = 2
+    while count[i] >= i - 1
+        count[i] = 0
+
+        next = p[1] = p[2]
+
+        for j = 1:i
+            p[j] = p[j+1]
+        end
+
+        i += 1
+        p[i] = first
+        first = next
+    end
+    count[i] += 1
+    nothing
+end
+
+Base.@propagate_inbounds @inline function count_flips(perm::Perm)
+    p = perm.p
+    pp = perm.pp
+
+    flips = Int32(1)
+
+    first = p[1] + 1
+
+    if p[first] != 0
+
+        unsafe_copyto!(pp, 2, p, 2, length(p) - 1)
+
+        while true
+            flips += one(flips)
+            new_first = pp[first]
+            pp[first] = first - 1
+
+            if first > 3
+                lo = 2; hi = first - 1
+                # see the note in Jeremy Zerfas' C implementation for
+                # this loop
+                for k = 0:13
+                    t = pp[lo]
+                    pp[lo] = pp[hi]
+                    pp[hi] = t
+                    (hi < lo + 3) && break
+                    lo += 1
+                    hi -= 1
+                end
+            end
+
+            first = new_first + 1
+            pp[first] == 0 && break
+        end
+    end
+
+    return flips
+end
+
+Base.@propagate_inbounds function run_task(f::Fannkuch, perm::Perm, idxmin, idxmax)
+    maxflips = Int32(0)
+    chksum = Int32(0)
+
+    i = idxmin
+    while true
+        if perm.p[1] != 0
+            flips = count_flips(perm)
+            maxflips = max(maxflips, flips)
+            chksum += iseven(i) ? flips : -flips
+        end
+        i != idxmax || break
+        i += 1
+        next_permutation(perm)
+    end
+
+    id = Threads.threadid()
+    f.maxflips[id] = max(f.maxflips[id], maxflips)
+    f.chksums[id] += chksum
+    nothing
+end
+
+function runf(f::Fannkuch)
+    factn = factorial(f.n)
+
+    Threads.@threads for idxmin = 0:f.blocksz:factn-1
+        perm = Perm(f.n)
+        @inbounds first_permutation(perm, idxmin)
+        idxmax = idxmin + f.blocksz - 1
+        @inbounds run_task(f, perm, idxmin, idxmax)
+    end
+end
+
+function fannkuchredux(n)
+    f = Fannkuch(n, Threads.nthreads())
+
+    runf(f)
+
+    # reduce results
+    chk = sum(f.chksums)
+    res = maximum(f.maxflips)
+
+    println(chk, "\nPfannkuchen(", n, ") = ", res)
+end
+Base.@ccallable function julia_main(ARGS::Vector{String})::Cint
+
+    n = parse(Int, ARGS[1])
+    fannkuchredux(n)
+    return 0
+end
+
+julia_main(["12"])
diff --git a/fasta/cmain.jl b/fasta/cmain.jl
new file mode 100644
index 0000000..a33dc7e
--- /dev/null
+++ b/fasta/cmain.jl
@@ -0,0 +1,95 @@
+# The Computer Language Benchmarks Game
+# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+const line_width = 60
+
+const alu = string(
+   "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG",
+   "GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA",
+   "CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT",
+   "ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA",
+   "GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG",
+   "AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC",
+   "AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA")
+
+const iub1 = b"acgtBDHKMNRSVWY"
+const iub2 = [0.27, 0.12, 0.12, 0.27, 0.02,0.02, 0.02, 0.02, 0.02, 0.02,0.02, 0.02, 0.02, 0.02, 0.02]
+
+const homosapiens1 = b"acgt"
+const homosapiens2 = [0.3029549426680, 0.1979883004921,0.1975473066391, 0.3015094502008]
+
+const IM   = Int32(139968)
+const IA   = Int32(3877)
+const IC   = Int32(29573)
+const state = Ref(Int32(42))
+gen_random() = (state[] = ((state[] * IA) + IC) % IM)
+
+function repeat_fasta(io, src, n)
+    k = length(src)
+    s = string(src, src, src[1:(n % k)])
+    I = Iterators.cycle(src)
+    col = 1
+    count = 1
+    c, state = iterate(I)
+    write(io, c % UInt8)
+    while count < n
+        col += 1
+        c, state = iterate(I, state)
+        write(io, c % UInt8)
+        if col == line_width
+            write(io, '\n')
+            col = 0
+        end
+        count += 1
+    end
+    write(io, '\n')
+    return
+end
+
+function choose_char(cs)
+    k = length(cs)
+    r = gen_random() / IM
+    r < cs[1] && return 1
+    a = 1
+    b = k
+    while b > a + 1
+        c = fld(a + b, 2)
+        if r < cs[c]
+            b = c
+        else
+            a = c
+        end
+    end
+    return b
+end
+
+function random_fasta(io, symb, pr, n)
+    cs = cumsum(pr)
+    k = n
+    while k > 0
+        m = min(k, line_width)
+        @inbounds for i = 1:m
+            write(io, symb[choose_char(cs)])
+        end
+        write(io, '\n'%UInt8)
+        k -= line_width
+    end
+    return
+end
+
+function perf_fasta(n=25000000, io = stdout)
+  write(io, ">ONE Homo sapiens alu\n")
+  repeat_fasta(io, alu, 2n)
+
+  write(io, ">TWO IUB ambiguity codes\n")
+  random_fasta(io, iub1, iub2, 3n)
+  write(io, ">THREE Homo sapiens frequency\n")
+  random_fasta(io, homosapiens1, homosapiens2, 5n)
+end
+Base.@ccallable function julia_main(ARGS::Vector{String})::Cint
+    n = parse(Int, ARGS[1])
+    perf_fasta(n)
+    return 0
+end
+
+perf_fasta(25000, IOBuffer())
diff --git a/fasta/fasta-fast.jl b/fasta/fasta-fast.jl
index c7c1822..def6b92 100644
--- a/fasta/fasta-fast.jl
+++ b/fasta/fasta-fast.jl
@@ -3,14 +3,7 @@
 
 const line_width = 60
 
-const alu = string(
-   "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG",
-   "GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA",
-   "CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT",
-   "ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA",
-   "GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG",
-   "AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC",
-   "AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA")
+const alu = b"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
 
 const iub1 = b"acgtBDHKMNRSVWY"
 const iub2 = [0.27, 0.12, 0.12, 0.27, 0.02,0.02, 0.02, 0.02, 0.02, 0.02,0.02, 0.02, 0.02, 0.02, 0.02]
@@ -31,18 +24,18 @@ function repeat_fasta(io, src, n)
     col = 1
     count = 1
     c, state = iterate(I)
-    write(io, c % UInt8)
+    write(io, c)
     while count < n
         col += 1
         c, state = iterate(I, state)
-        write(io, c % UInt8)
+        write(io, c)
         if col == line_width
-            write(io, '\n')
+            write(io, '\n' % UInt8)
             col = 0
         end
         count += 1
     end
-    write(io, '\n')
+    write(io, '\n' % UInt8)
     return
 end
 
@@ -77,7 +70,7 @@ function random_fasta(io, symb, pr, n)
     return
 end
 
-function perf_fasta(_n=25000000, io = stdout)
+function perf_fasta(n=25000000, io = stdout)
   write(io, ">ONE Homo sapiens alu\n")
   repeat_fasta(io, alu, 2n)
 
@@ -87,5 +80,5 @@ function perf_fasta(_n=25000000, io = stdout)
   random_fasta(io, homosapiens1, homosapiens2, 5n)
 end
 
-n = parse(Int,ARGS[1])
-perf_fasta(n)
+# n = parse(Int,ARGS[1])
+@time perf_fasta(25000000, IOBuffer())
diff --git a/jl_build/.gitignore b/jl_build/.gitignore
new file mode 100644
index 0000000..2a928ab
--- /dev/null
+++ b/jl_build/.gitignore
@@ -0,0 +1,13 @@
+image
+image.jl
+*-input.txt
+cdriver/binarytrees.c
+cdriver/fannkuchredux.c
+cdriver/fasta.c
+cdriver/knucleotide.c
+cdriver/mandelbrot.c
+cdriver/nbody.c
+cdriver/pidigits.c
+cdriver/regexredux.c
+cdriver/revcomp.c
+cdriver/spectralnorm.c
diff --git a/jl_build/cdriver/program.c b/jl_build/cdriver/program.c
new file mode 100644
index 0000000..67a6d0a
--- /dev/null
+++ b/jl_build/cdriver/program.c
@@ -0,0 +1,54 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
+
+// Standard headers
+#include <string.h>
+#include <stdint.h>
+
+// Julia headers (for initialization and gc commands)
+#include "uv.h"
+#include "julia.h"
+
+#ifdef JULIA_DEFINE_FAST_TLS // only available in Julia v0.7 and above
+JULIA_DEFINE_FAST_TLS()
+#endif
+
+// Declare C prototype of a function defined in Julia
+extern int julia_main(jl_array_t*);
+
+// main function (windows UTF16 -> UTF8 argument conversion code copied from julia's ui/repl.c)
+int main(int argc, char *argv[])
+{
+    int retcode;
+    int i;
+    uv_setup_args(argc, argv); // no-op on Windows
+
+    // initialization
+    libsupport_init();
+
+    // jl_options.compile_enabled = JL_OPTIONS_COMPILE_OFF;
+    // JULIAC_PROGRAM_LIBNAME defined on command-line for compilation
+    jl_options.image_file = JULIAC_PROGRAM_LIBNAME;
+    julia_init(JL_IMAGE_JULIA_HOME);
+
+    // Initialize Core.ARGS with the full argv.
+    jl_set_ARGS(argc, argv);
+
+    // Set PROGRAM_FILE to argv[0].
+    jl_set_global(jl_base_module,
+        jl_symbol("PROGRAM_FILE"), (jl_value_t*)jl_cstr_to_string(argv[0]));
+
+    // Set Base.ARGS to `String[ unsafe_string(argv[i]) for i = 1:argc ]`
+    jl_array_t *ARGS = (jl_array_t*)jl_get_global(jl_base_module, jl_symbol("ARGS"));
+    jl_array_grow_end(ARGS, argc - 1);
+    for (i = 1; i < argc; i++) {
+        jl_value_t *s = (jl_value_t*)jl_cstr_to_string(argv[i]);
+        jl_arrayset(ARGS, s, i - 1);
+    }
+
+    // call the work function, and get back a value
+    retcode = julia_main(ARGS);
+
+    // Cleanup and gracefully exit
+    jl_atexit_hook(retcode);
+    return retcode;
+}
diff --git a/jl_build/make.jl b/jl_build/make.jl
new file mode 100644
index 0000000..676097c
--- /dev/null
+++ b/jl_build/make.jl
@@ -0,0 +1,60 @@
+using PackageCompiler
+using Libdl
+
+benchmarks = [
+    ("binarytrees"),
+    ("fannkuchredux"),
+    ("fasta"),
+    ("knucleotide"),
+    ("mandelbrot"),
+    ("nbody"),
+    ("pidigits"),
+    ("regexredux"),
+    ("revcomp"),
+    ("spectralnorm"),
+]
+
+cc_flags = String[]
+if Sys.iswindows()
+    using WinRPM
+    push!(cc_flags, "-I" * joinpath(WinRPM.installdir, "usr", "$(Sys.ARCH)-w64-mingw32", "sys-root", "mingw", "include"))
+end
+
+# Concat all cmains into one shared image (speeds up compilation a lot)!
+@info "creating one big image file"
+open(joinpath(@__DIR__, "image.jl"), "w") do io
+    for bench in benchmarks
+        input = joinpath(@__DIR__, "..", bench, string(bench, "-input.txt"))
+        isfile(input) && cp(input, joinpath(@__DIR__, string(bench, "-input.txt")), force = true)
+        jlfile = joinpath(@__DIR__, "..", bench, "cmain.jl")
+        println(io, "module ", titlecase(bench))
+        print(io, replace(read(jlfile, String), "julia_main" => string(bench, "_main")))
+        println(io, "export ", string(bench, "_main"))
+        println(io, "end\nusing .", titlecase(bench))
+        println(io)
+    end
+end
+# build the image/shared library
+@info "building shared image file"
+
+build_shared_lib(
+    joinpath(@__DIR__, "image.jl"), "bench_image", # Julia script containing a `julia_main` function, e.g. like `examples/hello.jl`
+    builddir = joinpath(@__DIR__, "image"), # that's where the compiled artifacts will end up [optional]
+)
+
+# Create an executable for each benchmark, linking into the image
+@info "building executables"
+mkpath(joinpath(@__DIR__, "cdriver"))
+for bench in benchmarks
+    cprog = joinpath(@__DIR__, "cdriver", "program.c")
+    cfile = joinpath(@__DIR__, "cdriver", string(bench, ".c"))
+    open(cfile, "w") do io
+        print(io, replace(read(cprog, String), "julia_main" => string(bench, "_main")))
+    end
+    PackageCompiler.build_exec(
+        bench, cfile,
+        escape_string(joinpath(@__DIR__, "image", "bench_image.$(Libdl.dlext)")),
+        joinpath(@__DIR__, "image"),
+        true, "3", false, PackageCompiler.system_compiler, cc_flags
+    )
+end
diff --git a/knucleotide/cmain.jl b/knucleotide/cmain.jl
new file mode 100644
index 0000000..e38558d
--- /dev/null
+++ b/knucleotide/cmain.jl
@@ -0,0 +1,161 @@
+# Th# The Computer Language Benchmarks Game
+# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+#
+# contributed by David Campbell
+# based on the Go version
+# modified by Jarrett Revels, Alex Arslan, Yichao Yu
+#
+# Bit-twiddle optimizations added by Kristoffer Carlsson
+
+using Printf
+
+const NucleotideLUT = zeros(UInt8, 256)
+NucleotideLUT['A'%UInt8] = 0
+NucleotideLUT['C'%UInt8] = 1
+NucleotideLUT['G'%UInt8] = 2
+NucleotideLUT['T'%UInt8] = 3
+NucleotideLUT['a'%UInt8] = 0
+NucleotideLUT['c'%UInt8] = 1
+NucleotideLUT['g'%UInt8] = 2
+NucleotideLUT['t'%UInt8] = 3
+
+struct KNucleotides{L, T}
+    i::T
+end
+Base.hash(kn::KNucleotides, h::UInt64) = hash(kn.i, h)
+Base.isequal(kn1::KNucleotides, kn2::KNucleotides) = kn1.i == kn2.i
+Base.show(io::IO, kn::KNucleotides) = print(io, '[', string(kn), ']')
+function determine_inttype(l)
+    l <= 4 && return UInt8
+    l <= 8 && return UInt16
+    l <= 16 && return UInt32
+    l <= 32 && return UInt64
+    error("invalid length")
+end
+
+function KNucleotides{L, T}(str::String) where {L, T}
+    i = T(0)
+    @inbounds for j in 1:L
+        b = codeunit(str, j)
+        i = (i << 2) | NucleotideLUT[b]
+    end
+    return KNucleotides{L, T}(i)
+end
+
+@inline function shift(kn::KNucleotides{L, T}, c::UInt8) where {L, T}
+    i = kn.i
+    i &= (~(3 << 2(L-1)) % T)
+    KNucleotides{L, T}((i << T(2)) | @inbounds NucleotideLUT[c])
+end
+
+function Base.string(kn::KNucleotides{L}) where {L}
+    sprint() do io
+        for j in 1:L
+            mask = 3 << (2(L-j))
+            z = (kn.i & mask) >> 2(L-j)
+            write(io,
+                z == 0 ? 'A' :
+                z == 1 ? 'C' :
+                z == 2 ? 'G' :
+                z == 3 ? 'T' :
+                error())
+        end
+    end
+end
+
+function count_data(data::String, ::Type{KNucleotides{L, T}}) where {L, T}
+    counts = Dict{KNucleotides{L, T}, Int}()
+    kn = KNucleotides{L, T}(data)
+    counts[kn] = 1
+    @inbounds for offset = (L+1):sizeof(data)
+        c = codeunit(data, offset)
+        kn = shift(kn, c)
+        token = Base.ht_keyindex2!(counts, kn)
+        if token > 0
+            counts.vals[token] += 1
+        else
+            Base._setindex!(counts, 1, kn, -token)
+        end
+    end
+    return counts
+end
+
+function count_one(data::String, s::String)
+    L = length(s)
+    K = KNucleotides{L, determine_inttype(L)}
+    d = count_data(data, K)
+    return get(d, K(s), 0)
+end
+
+struct KNuc
+    name::String
+    count::Int
+end
+
+# sort down
+function Base.isless(x::KNuc, y::KNuc)
+    if x.count == y.count
+        return x.name > y.name
+    end
+    x.count > y.count
+end
+
+function sorted_array(m)
+    kn = Vector{KNuc}(undef, length(m))
+    i = 1
+    for (k, v) in m
+        kn[i] = KNuc(string(k), v)
+        i += 1
+    end
+    sort!(kn)
+end
+
+do_work(str::String, i::Int) = sorted_array(count_data(str, KNucleotides{i, determine_inttype(i)}))
+do_work(str::String, i::String) = count_one(str, i)
+
+function print_knucs(io, a::Array{KNuc, 1})
+    sum = 0
+    for kn in a
+        sum += kn.count
+    end
+    for kn in a
+        @printf(io, "%s %.3f\n", kn.name, 100.0kn.count/sum)
+    end
+    println(io)
+end
+
+function perf_k_nucleotide(io = stdin, output = stdout)
+    three = ">THREE "
+    while true
+        line = readline(io)
+        if startswith(line, three)
+            break
+        end
+    end
+    str = sprint() do sio
+        foreach(line-> write(sio, line), eachline(io))
+    end
+    vs = (1, 2, "GGT", "GGTA", "GGTATT", "GGTATTTTAATT", "GGTATTTTAATTTATAGT")
+    results = Vector{Any}(undef, length(vs))
+    Threads.@threads for i in length(vs):-1:1
+        results[i] = do_work(str, vs[i])
+    end
+
+    for (v, result) in zip(vs, results)
+        if result isa Array
+            print_knucs(output, result)
+        end
+        if result isa Int
+            @printf(output, "%d\t%s\n", result, v)
+        end
+    end
+end
+
+Base.@ccallable function julia_main(ARGS::Vector{String})::Cint
+    perf_k_nucleotide()
+    return 0
+end
+
+open(joinpath(@__DIR__, "knucleotide-input.txt")) do io
+    perf_k_nucleotide(io, IOBuffer())
+end
diff --git a/knucleotide/knucleotide-fast.jl b/knucleotide/knucleotide-fast.jl
index 12df400..fa93d84 100644
--- a/knucleotide/knucleotide-fast.jl
+++ b/knucleotide/knucleotide-fast.jl
@@ -10,9 +10,6 @@
 using Distributed
 using Printf
 
-addprocs(4)
-
-@everywhere begin
 const NucleotideLUT = zeros(UInt8, 256)
 NucleotideLUT['A'%UInt8] = 0
 NucleotideLUT['C'%UInt8] = 1
@@ -117,8 +114,6 @@ end
 do_work(str::String, i::Int) = sorted_array(count_data(str, KNucleotides{i,determine_inttype(i)}))
 do_work(str::String, i::String) = count_one(str, i)
 
-end # @everywhere
-
 function print_knucs(a::Array{KNuc, 1})
     sum = 0
     for kn in a
@@ -141,16 +136,10 @@ function perf_k_nucleotide(io = stdin)
     data = read(io, String)
     str = filter(!isequal('\n'), data)
 
-    vs = [1, 2, "GGT", "GGTA", "GGTATT", "GGTATTTTAATT", "GGTATTTTAATTTATAGT"]
+    vs = (1, 2, "GGT", "GGTA", "GGTATT", "GGTATTTTAATT", "GGTATTTTAATTTATAGT")
     results = Vector{Any}(undef, length(vs))
-    order = collect(enumerate(vs))
-
-    @sync for w in workers()
-        @async while !isempty(order)
-            v = pop!(order)
-            r = remotecall_fetch(do_work, w, str, v[2])
-            results[v[1]] = r
-        end
+    for (i, v) in enumerate(vs)
+        results[i] = do_work(w, str, v[2])
     end
 
     for (v, result) in zip(vs, results)
diff --git a/mandelbrot/cmain.jl b/mandelbrot/cmain.jl
new file mode 100644
index 0000000..e814b6a
--- /dev/null
+++ b/mandelbrot/cmain.jl
@@ -0,0 +1,80 @@
+#=
+The Computer Language Benchmarks Game
+ https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+ direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn:
+ https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html
+ modified for Julia 1.0 by Simon Danisch
+=#
+const zerov8 = ntuple(x-> 0f0, 8)
+
+@inline function step_mandel(Zr,Zi,Tr,Ti,cr,ci)
+    Zi = 2f0 .* Zr .* Zi .+ ci
+    Zr = Tr .- Ti .+ cr
+    Tr = Zr .* Zr
+    Ti = Zi .* Zi
+    return Zr,Zi,Tr,Ti
+end
+
+# Calculate mandelbrot set for one Vec8 into one byte
+Base.@propagate_inbounds function mand8(cr, ci)
+    Zr = zerov8
+    Zi = zerov8
+    Tr = zerov8
+    Ti = zerov8
+    t = zerov8
+    i = 0
+
+    while i<50
+        for _ in 1:5
+            Zr,Zi,Tr,Ti = step_mandel(Zr,Zi,Tr,Ti,cr,ci)
+            i += 1
+        end
+        t = Tr .+ Ti
+        all(x-> x > 4f0, t) && (return 0x00)
+    end
+    byte = 0xff
+    t[1] <= 4.0 || (byte &= 0b01111111)
+    t[2] <= 4.0 || (byte &= 0b10111111)
+    t[3] <= 4.0 || (byte &= 0b11011111)
+    t[4] <= 4.0 || (byte &= 0b11101111)
+    t[5] <= 4.0 || (byte &= 0b11110111)
+    t[6] <= 4.0 || (byte &= 0b11111011)
+    t[7] <= 4.0 || (byte &= 0b11111101)
+    t[8] <= 4.0 || (byte &= 0b11111110)
+    return byte
+end
+
+function mandel_inner(rows, ci, y, N, xvals)
+    @simd for x in 1:8:N
+        @inbounds begin
+            cr = ntuple(i-> xvals[x + (i - 1)], 8)
+            rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci)
+        end
+    end
+end
+
+function mandelbrot(n = 200, io = stdout)
+    inv_ = 2.0 / n
+    N = n
+    xvals = zeros(Float32, n)
+    yvals = zeros(Float32, n)
+    Threads.@threads for i in 0:(N-1)
+        @inbounds xvals[i + 1] = i * inv_ - 1.5
+        @inbounds yvals[i + 1] = i * inv_ - 1.0
+    end
+    rows = zeros(UInt8, n*N÷8)
+    Threads.@threads for y in 1:N
+        @inbounds ci = yvals[y]
+        mandel_inner(rows, ci, y, N, xvals)
+    end
+    write(io, "P4\n$n $n\n")
+    write(io, rows)
+end
+
+Base.@ccallable function julia_main(ARGS::Vector{String})::Cint
+    n = parse(Int, ARGS[1])
+    mandelbrot(n, stdout)
+    return 0
+end
+
+julia_main(["8"])
diff --git a/nbody/cmain.jl b/nbody/cmain.jl
new file mode 100644
index 0000000..d58b226
--- /dev/null
+++ b/nbody/cmain.jl
@@ -0,0 +1,137 @@
+# The Computer Language Benchmarks Game
+# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+# based on the Java version
+
+module NBody
+
+using Printf
+using LinearAlgebra
+
+# Constants
+const solar_mass = 4π^2
+const days_per_year = 365.24
+
+# Use a 4-tuple here to get better SIMD instructions.
+# This is a space-time tradeoff, but this benchmark is well within L1 cache limits.
+struct Vec3
+    x::NTuple{4, Float64}
+end
+Vec3(x, y, z) = Vec3((x,y,z,0.0))
+Base.:/(v::Vec3, n::Number) = Vec3(1/n .* v.x)
+Base.:*(v::Vec3, n::Number) = Vec3(n .* v.x)
+Base.:-(v1::Vec3, v2::Vec3) = Vec3(v1.x .- v2.x)
+Base.:+(v1::Vec3, v2::Vec3) = Vec3(v1.x .+ v2.x)
+# Todo, prettify
+squarednorm(v1::Vec3) = v1.x[1]^2 + v1.x[2]^2 + v1.x[3]^2
+Base.muladd(x::Vec3, y::Number, z::Vec3) = Vec3(muladd.(x.x, y, z.x))
+
+# A heavenly body in the system
+mutable struct Body
+    pos::Vec3
+    v::Vec3
+    mass::Float64
+end
+
+function offset_momentum!(b::Body, p::Vec3)
+    b.v -= p / solar_mass
+end
+
+function init_sun!(bodies::Vector{Body})
+    p = Vec3(0.0, 0.0, 0.0)
+    for b in bodies
+        p += b.v * b.mass
+    end
+    offset_momentum!(bodies[1], p)
+end
+
+function advance(bodies::Vector{Body}, dt::Number)
+    @inbounds for i = 1:length(bodies)
+        bi = bodies[i]
+        for j = i+1:length(bodies)
+            bj = bodies[j]
+            delta = bi.pos - bj.pos
+            dsq = squarednorm(delta)
+            distance = sqrt(dsq)
+            mag = dt / (dsq * distance)
+            bi.v = muladd(delta, -(bj.mass * mag), bi.v)
+            bj.v = muladd(delta, (bi.mass * mag), bj.v)
+        end
+    end
+
+    for b in bodies
+        b.pos = muladd(b.v, dt, b.pos)
+    end
+end
+
+function energy(bodies::Vector{Body})
+    e = 0.0
+    @inbounds for i = 1:length(bodies)
+        bi = bodies[i]
+        e += 0.5 * bi.mass * squarednorm(bi.v)
+        for j = i+1:length(bodies)
+            bj = bodies[j]
+            delta = bi.pos - bj.pos
+            distance = sqrt(squarednorm(delta))
+            dinv = 1.0 / distance
+            e = muladd((bi.mass * bj.mass), -dinv, e)
+        end
+    end
+    return e
+end
+
+
+function perf_nbody(N::Int=1000)
+    jupiter = Body( Vec3(4.84143144246472090e+00,                  # pos[1] = x
+                         -1.16032004402742839e+00,                 # pos[2] = y
+                         -1.03622044471123109e-01),                # pos[3] = z
+                   Vec3(1.66007664274403694e-03 * days_per_year,   # v[1] = vx
+                        7.69901118419740425e-03 * days_per_year,   # v[2] = vy
+                        -6.90460016972063023e-05 * days_per_year), # v[3] = vz
+                   9.54791938424326609e-04 * solar_mass)       # mass
+
+    saturn = Body(Vec3(8.34336671824457987e+00,
+                       4.12479856412430479e+00,
+                      -4.03523417114321381e-01),
+                  Vec3(-2.76742510726862411e-03 * days_per_year,
+                       4.99852801234917238e-03 * days_per_year,
+                       2.30417297573763929e-05 * days_per_year),
+                  2.85885980666130812e-04 * solar_mass)
+
+    uranus = Body(Vec3(1.28943695621391310e+01,
+                    -1.51111514016986312e+01,
+                    -2.23307578892655734e-01),
+                  Vec3(2.96460137564761618e-03 * days_per_year,
+                       2.37847173959480950e-03 * days_per_year,
+                       -2.96589568540237556e-05 * days_per_year),
+                  4.36624404335156298e-05 * solar_mass)
+
+    neptune = Body(Vec3(1.53796971148509165e+01,
+                        -2.59193146099879641e+01,
+                        1.79258772950371181e-01),
+                   Vec3(2.68067772490389322e-03 * days_per_year,
+                        1.62824170038242295e-03 * days_per_year,
+                       -9.51592254519715870e-05 * days_per_year),
+                   5.15138902046611451e-05 * solar_mass)
+
+    sun = Body(Vec3(0.0, 0.0, 0.0), Vec3(0.0, 0.0, 0.0), solar_mass)
+
+    bodies = [sun, jupiter, saturn, uranus, neptune]
+
+    init_sun!(bodies)
+    @printf("%.9f\n", energy(bodies))
+    for i = 1:N
+        advance(bodies, 0.01)
+    end
+    @printf("%.9f\n", energy(bodies))
+end
+
+end
+
+Base.@ccallable function julia_main(ARGS::Vector{String})::Cint
+    n = parse(Int,ARGS[1])
+    NBody.perf_nbody(n)
+    return 0
+end
+
+julia_main(["50000"])
diff --git a/pidigits/cmain.jl b/pidigits/cmain.jl
new file mode 100644
index 0000000..49bbe1a
--- /dev/null
+++ b/pidigits/cmain.jl
@@ -0,0 +1,76 @@
+#=
+The Computer Language Benchmarks Game
+ https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+ based on the C++ program of Alessandro Power
+ https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/pidigits-gpp-4.html
+ with ideas borrowed from the C program of Mr Ledrug
+ https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/pidigits-gcc-1.html
+ modified for Julia 1.0 by Luiz M. Faria
+=#
+
+# import a few functions from GMP wrapper to do in-place operations
+using Base.GMP.MPZ: add_ui!, mul_ui!, add!, tdiv_q!
+# and add a few more wrappers for GMP functions not currently in Base
+const mpz_t = Ref{BigInt}
+addmul_ui!(x::BigInt,a::BigInt,b) = ccall((:__gmpz_addmul_ui,"libgmp"), Cvoid, (mpz_t,mpz_t,Culong), x, a, b)
+submul_ui!(x::BigInt,a::BigInt,b) = ccall((:__gmpz_submul_ui,"libgmp"), Cvoid, (mpz_t,mpz_t,Culong), x, a, b)
+get_ui(x::BigInt) = ccall((:__gmpz_get_ui,"libgmp"), Culong, (mpz_t,), x)
+
+function next!(q,r,t,k)
+    mult  = 2*k + 1
+    addmul_ui!(r,q,2)
+    mul_ui!(r,mult)
+    mul_ui!(t,mult)
+    mul_ui!(q,k)
+    return nothing
+end
+
+function extract!(q,r,t,tmp1,tmp2,n)
+    mul_ui!(tmp1,q,n)
+    add!(tmp1,r)
+    tdiv_q!(tmp2,tmp1,t)
+    return get_ui(tmp2)
+end
+
+function produce!(q,r,t,n)
+    mul_ui!(q,10)
+    submul_ui!(r,t,n)
+    mul_ui!(r,10)
+    return nothing
+end
+
+function pidigits(n::Int)
+    q,r,t,tmp1,tmp2 = map(BigInt,(1,0,1,0,0))
+    k = UInt(0)
+    n_digits=0
+
+    while n_digits<n
+        i=0
+        while (i<10 && n_digits<n)
+            k +=1
+            next!(q,r,t,k)
+            q>r && continue
+
+            digit = extract!(q,r,t,tmp1,tmp2,3)
+            if digit == extract!(q,r,t,tmp1,tmp2,4)
+                print(digit)
+                produce!(q,r,t,digit)
+                i +=1
+                n_digits+=1
+            end
+        end
+        while i < 10
+            i+=1
+            print(' ')
+        end
+        print("\t:$n_digits\n")
+    end
+end
+Base.@ccallable function julia_main(ARGS::Vector{String})::Cint
+
+    n = parse(Int,ARGS[1])
+    pidigits(n)
+    return 0
+end
+
+julia_main(["10000"])
diff --git a/regexredux/cmain.jl b/regexredux/cmain.jl
new file mode 100644
index 0000000..0521439
--- /dev/null
+++ b/regexredux/cmain.jl
@@ -0,0 +1,61 @@
+# The Computer Language Benchmarks Game
+# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+#
+# contributed by Daniel Jones
+# fixed by David Campbell
+# modified by Jarrett Revels, Alex Arslan, Yichao Yu
+
+using Printf
+
+const variants = [
+      "agggtaaa|tttaccct",
+      "[cgt]gggtaaa|tttaccc[acg]",
+      "a[act]ggtaaa|tttacc[agt]t",
+      "ag[act]gtaaa|tttac[agt]ct",
+      "agg[act]taaa|ttta[agt]cct",
+      "aggg[acg]aaa|ttt[cgt]ccct",
+      "agggt[cgt]aa|tt[acg]accct",
+      "agggta[cgt]a|t[acg]taccct",
+      "agggtaa[cgt]|[acg]ttaccct"
+]
+
+const subs = [
+    (r"tHa[Nt]", "<4>"),
+    (r"aND|caN|Ha[DS]|WaS", "<3>"),
+    (r"a[NSt]|BY", "<2>"),
+    (r"<[^>]*>", "|"),
+    (r"\|[^|][^|]*\|", "-")
+]
+
+function perf_regex_dna(io = stdin)
+    seq = read(io, String)
+    l1 = length(seq)
+
+    seq = replace(seq, r">.*\n|\n" => "")
+    l2 = length(seq)
+
+    for v in variants
+        k = 0
+        for m in eachmatch(Regex(v), seq)
+            k += 1
+        end
+        @printf("%s %d\n", v, k)
+    end
+
+    for (u, v) in subs
+        seq = replace(seq, u => v)
+    end
+
+    println()
+    println(l1 + 1)  # why + 1??
+    println(l2)
+    println(length(seq))
+end
+Base.@ccallable function julia_main(ARGS::Vector{String})::Cint
+    perf_regex_dna()
+    return 0
+end
+
+open(joinpath(@__DIR__, "regexredux-input.txt")) do io
+    perf_regex_dna(io)
+end
diff --git a/regexredux/regexredux.jl b/regexredux/regexredux.jl
index efe8497..3179ad0 100644
--- a/regexredux/regexredux.jl
+++ b/regexredux/regexredux.jl
@@ -7,7 +7,7 @@
 
 using Printf
 
-const variants = [
+const variants = Regex.([
       "agggtaaa|tttaccct",
       "[cgt]gggtaaa|tttaccc[acg]",
       "a[act]ggtaaa|tttacc[agt]t",
@@ -17,39 +17,81 @@ const variants = [
       "agggt[cgt]aa|tt[acg]accct",
       "agggta[cgt]a|t[acg]taccct",
       "agggtaa[cgt]|[acg]ttaccct"
-]
+])
 
 const subs = [
-    (r"tHa[Nt]", "<4>"),
-    (r"aND|caN|Ha[DS]|WaS", "<3>"),
-    (r"a[NSt]|BY", "<2>"),
-    (r"<[^>]*>", "|"),
-    (r"\|[^|][^|]*\|", "-")
+    (r"tHa[Nt]" => "<4>"),
+    (r"aND|caN|Ha[DS]|WaS" => "<3>"),
+    (r"a[NSt]|BY" => "<2>"),
+    (r"<[^>]*>" => "|"),
+    (r"\|[^|][^|]*\|" => "-")
 ]
 
-function perf_regex_dna()
-    seq = read(stdin, String)
-    l1 = length(seq)
+function replace_fast!(out::IO, str::AbstractString, pat_repl::Pair)
+    pattern, repl = pat_repl
+    n = 1
+    e = lastindex(str)
+    i = a = firstindex(str)
+    r = something(findnext(pattern,str,i), 0)
+    j, k = first(r), last(r)
+    while j != 0
+        if i == a || i <= k
+            Base.unsafe_write(out, pointer(str, i), UInt(j - i))
+            Base._replace(out, repl, str, r, pattern)
+        end
+        if k < j
+            i = j
+            j > e && break
+            k = nextind(str, j)
+        else
+            i = k = nextind(str, k)
+        end
+        r = something(findnext(pattern,str,k), 0)
+        r == 0:-1 && break
+        j, k = first(r), last(r)
+        n += 1
+    end
+    write(out, SubString(str, i))
+end
 
+function inner(subs, res)
+    for i in 2:length(subs)
+        res = replace(res, subs[i])
+    end
+    res
+end
+function perf_regex_dna(io = stdin, output = stdout)
+    N = Threads.nthreads()
+    seq = read(io, String)
+    l1 = length(seq)
     seq = replace(seq, r">.*\n|\n" => "")
     l2 = length(seq)
-
-    for v in variants
-        k = 0
-        for m in eachmatch(Regex(v), seq)
+    res = Vector{Tuple{Int, Int}}(undef, N)
+    Threads.@threads for i in 1:length(variants)
+        k = 0; v = variants[i]
+        for m in eachmatch(v, seq)
             k += 1
         end
-        @printf("%s %d\n", v, k)
+        @inbounds res[Threads.threadid()] = (i, k)
     end
-
-    for (u, v) in subs
-        seq = replace(seq, u => v)
+    nchunk = length(seq) ÷ N
+    results = zeros(N)
+    for id in 1:N
+        chunk = SubString(seq, (id - 1) * nchunk + 1, min(id * nchunk, length(seq)))
+        resio = IOBuffer(sizehint = nchunk)
+        for sub in subs
+            replace_fast!(resio, String(take!(resio)), sub)
+        end
+        results[id] += resio.size
     end
 
-    println()
-    println(l1 + 1)  # why + 1??
-    println(l2)
-    println(length(seq))
+    println(output)
+    println(output, l1 + 1)  # why + 1??
+    println(output, l2)
+    println(output, length(seq))
 end
 
-perf_regex_dna()
+# perf_regex_dna()
+cd(@__DIR__)
+@time perf_regex_dna("../fasta.txt", IOBuffer());
+x = IOBuffer()
diff --git a/revcomp/cmain.jl b/revcomp/cmain.jl
new file mode 100644
index 0000000..dc8a3e8
--- /dev/null
+++ b/revcomp/cmain.jl
@@ -0,0 +1,104 @@
+# The Computer Language Benchmarks Game
+# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+#
+# contributed by David Campbell
+# modified by Jarrett Revels, Kristoffer Carlsson, Alex Arslan
+
+mutable struct PushVector{T, A<:AbstractVector{T}} <: AbstractVector{T}
+    v::A
+    l::Int
+end
+
+PushVector{T}() where {T} = PushVector(Vector{T}(undef, 60), 0)
+
+Base.IndexStyle(::Type{<:PushVector}) = IndexLinear()
+Base.length(v::PushVector) = v.l
+Base.size(v::PushVector) = (v.l,)
+@inline function Base.getindex(v::PushVector, i)
+    @boundscheck checkbounds(v, i)
+    @inbounds v.v[i]
+end
+
+function Base.push!(v::PushVector, i)
+    v.l += 1
+    if v.l > length(v.v)
+        resize!(v.v, v.l * 2)
+    end
+    v.v[v.l] = i
+    return v
+end
+
+function Base.resize!(v::PushVector, l::Integer)
+    # Only support shrinking for now, since that is all we need
+    @assert l <= v.l
+    v.l = l
+end
+
+const _revcompdata = Dict(
+   'A'=> 'T', 'a'=> 'T',
+   'C'=> 'G', 'c'=> 'G',
+   'G'=> 'C', 'g'=> 'C',
+   'T'=> 'A', 't'=> 'A',
+   'U'=> 'A', 'u'=> 'A',
+   'M'=> 'K', 'm'=> 'K',
+   'R'=> 'Y', 'r'=> 'Y',
+   'W'=> 'W', 'w'=> 'W',
+   'S'=> 'S', 's'=> 'S',
+   'Y'=> 'R', 'y'=> 'R',
+   'K'=> 'M', 'k'=> 'M',
+   'V'=> 'B', 'v'=> 'B',
+   'H'=> 'D', 'h'=> 'D',
+   'D'=> 'H', 'd'=> 'H',
+   'B'=> 'V', 'b'=> 'V',
+   'N'=> 'N', 'n'=> 'N',
+)
+
+const revcompdata = zeros(UInt8, 256)
+for (k, v) in _revcompdata
+    revcompdata[k%UInt8] = v%UInt8
+end
+
+function print_buff(outio, bb)
+    b = resize!(bb.v, length(bb))
+    l = length(b)
+    length(b) == 0 && return
+
+    br = reverse!(b)
+    for i = 1:60:l
+        if i+59 > l
+            write(outio, @view br[i:end])
+        else
+            write(outio, @view br[i:i+59])
+        end
+        write(outio, '\n')
+    end
+end
+
+function perf_revcomp(io=stdin, outio = stdout)
+    buff = PushVector{UInt8}()
+    while true
+        line = codeunits(readline(io))
+        if isempty(line)
+            print_buff(outio, buff)
+            return
+        elseif first(line) == UInt8('>')
+            print_buff(outio, buff)
+            resize!(buff, 0)
+            write(outio, line)
+            write(outio, '\n')
+        else
+            l = length(line)
+            @inbounds for c in line
+                push!(buff, revcompdata[c%Int])
+            end
+        end
+    end
+end
+Base.@ccallable function julia_main(ARGS::Vector{String})::Cint
+    perf_revcomp()
+    return 0
+end
+
+open(joinpath(@__DIR__, "revcomp-input.txt")) do io
+    perf_revcomp(io, IOBuffer())
+end
diff --git a/spectralnorm/cmain.jl b/spectralnorm/cmain.jl
new file mode 100644
index 0000000..e6cc687
--- /dev/null
+++ b/spectralnorm/cmain.jl
@@ -0,0 +1,61 @@
+# The Computer Language Benchmarks Game
+# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
+
+# based on the Javascript program
+# optimizations by Kristoffer Carlsson
+
+using Printf
+
+A(i,j) = 1.0 / ( (((i+j)*(i+j+1)) >> 1) + i+1)
+
+function Au!(w, u)
+    n = length(u)
+    Threads.@threads for i = 1:n
+        w[i] = 0
+        z = 0.0
+        @simd for j = 1:n
+           @inbounds z += A(i-1, j-1) * u[j]
+        end
+        w[i] = z
+    end
+end
+
+function Atu!(v, w)
+    n = length(w)
+    Threads.@threads for i = 1:n
+        z = 0.0
+        @simd for j = 1:n
+           @inbounds z += A(j-1,i-1) * w[j]
+        end
+        v[i] = z
+    end
+end
+
+function AtAu!(w, v, u)
+    Au!(w, u)
+    Atu!(v, w)
+end
+
+function perf_spectralnorm(n::Int=100)
+    u = ones(Float64, n)
+    v = zeros(Float64 ,n)
+    w = zeros(Float64, n)
+    vv = vBv = 0
+    for i = 1:10
+        AtAu!(w, v, u)
+        AtAu!(w, u, v)
+    end
+    for i = 1:n
+        vBv += u[i]*v[i]
+        vv += v[i]*v[i]
+    end
+    return sqrt(vBv/vv)
+end
+Base.@ccallable function julia_main(ARGS::Vector{String})::Cint
+
+    n = parse(Int,ARGS[1])
+    @printf("%.9f\n", perf_spectralnorm(n))
+    return 0
+end
+
+julia_main(["5500"])