diff --git a/binarytrees/binarytree-fast.jl b/binarytrees/binarytrees-fast.jl similarity index 100% rename from binarytrees/binarytree-fast.jl rename to binarytrees/binarytrees-fast.jl diff --git a/binarytrees/cmain.jl b/binarytrees/cmain.jl new file mode 100644 index 0000000..2334a11 --- /dev/null +++ b/binarytrees/cmain.jl @@ -0,0 +1,65 @@ +# The Computer Language Benchmarks Game +# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + +# contributed by Simon Danisch +# based on the [C++ implementation](https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/binarytrees-gpp-9.html) + +using Printf +struct Node + left::UInt32 + right::UInt32 +end +function alloc!(pool, left, right) + push!(pool, Node(left, right)) + return length(pool) +end +function make(pool, d) + d == 0 && return 0 + alloc!(pool, make(pool, d - 1), make(pool, d - 1)) +end +check(pool, t::Node) = 1 + check(pool, t.left) + check(pool, t.right) +function check(pool, node::Integer) + node == 0 && return 1 + @inbounds return check(pool, pool[node]) +end +function threads_inner(pool, d, min_depth, max_depth) + niter = 1 << (max_depth - d + min_depth) + c = 0 + for j = 1:niter + c += check(pool, make(pool, d)) + empty!(pool) + end + return (niter, d, c) +end + +function loop_depths(io, d, min_depth, max_depth, ::Val{N}) where N + threadstore = ntuple(x-> NTuple{3, Int}[], N) + Threads.@threads for d in min_depth:2:max_depth + pool = Node[] + push!(threadstore[Threads.threadid()], threads_inner(pool, d, min_depth, max_depth)) + end + for results in threadstore, result in results + @printf(io, "%i\t trees of depth %i\t check: %i\n", result...) + end +end +function perf_binary_trees(io, N::Int=10) + min_depth = 4 + max_depth = N + stretch_depth = max_depth + 1 + pool = Node[] + # create and check stretch tree + c = check(pool, make(pool, stretch_depth)) + @printf(io, "stretch tree of depth %i\t check: %i\n", stretch_depth, c) + + long_lived_tree = make(pool, max_depth) + + loop_depths(io, min_depth, min_depth, max_depth, Val(Threads.nthreads())) + @printf(io, "long lived tree of depth %i\t check: %i\n", max_depth, check(pool, long_lived_tree)) +end +Base.@ccallable function julia_main(ARGS::Vector{String})::Cint + + perf_binary_trees(stdout, parse(Int, ARGS[1])) + return 0 +end + +julia_main(["21"]) diff --git a/c_vs_julia.jl b/c_vs_julia.jl new file mode 100644 index 0000000..4549e5f --- /dev/null +++ b/c_vs_julia.jl @@ -0,0 +1,36 @@ +cd(@__DIR__) +fasta_input = "fasta.txt" +fasta_gen = joinpath("fasta", "fasta.jl") +run(pipeline(`$(Base.julia_cmd()) $fasta_gen 25000000` ;stdout = fasta_input)) + +benchmarks = [ + ("binarytrees", 21), + ("fannkuchredux", 12), + ("fasta", 25000000), + ("knucleotide", fasta_input), + ("mandelbrot", 16000), + ("nbody", 50000000), + ("pidigits", 10000), + ("regexredux", fasta_input), + ("revcomp", fasta_input), + ("spectralnorm", 5500), +] + +timings = map(benchmarks) do (bench, arg) + println(bench) + isfile("result.bin") && rm("result.bin") + args = [:stdout => "result.bin"] + argcmd = `` + cmd = if arg isa String + @assert isfile(arg) + push!(args, :stdin => arg) + else + argcmd = `$arg` + end + jltime = withenv("JULIA_NUM_THREADS" => 16) do + exe = joinpath("jl_build", bench, "cmain") + @elapsed run(pipeline(`$() $argcmd`; args...)) + end + ctime = @elapsed run(pipeline(`./$bench $argcmd`; args...)) + (julia = jltime, cpp = ctime) +end diff --git a/cpp_src/.gitignore b/cpp_src/.gitignore new file mode 100644 index 0000000..4537490 --- /dev/null +++ b/cpp_src/.gitignore @@ -0,0 +1,11 @@ +binarytrees +fannkuchredux +fasta +knucleotide +mandelbrot +nbody +pidigits +regexredux +revcomp +spectralnorm +*.o diff --git a/cpp_src/binarytrees.c++ b/cpp_src/binarytrees.c++ new file mode 100644 index 0000000..5e5ed28 --- /dev/null +++ b/cpp_src/binarytrees.c++ @@ -0,0 +1,139 @@ +/* The Computer Language Benchmarks Game + * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + * + * contributed by Jon Harrop + * modified by Alex Mizrahi + * modified by Andreas Schäfer + * very minor omp tweak by The Anh Tran + * modified to use apr_pools by Dave Compton + * *reset* + */ + +#include +#include +#include +#include + + +const size_t LINE_SIZE = 64; + +class Apr +{ +public: + Apr() + { + apr_initialize(); + } + + ~Apr() + { + apr_terminate(); + } +}; + +struct Node +{ + Node *l, *r; + + int check() const + { + if (l) + return l->check() + 1 + r->check(); + else return 1; + } +}; + +class NodePool +{ +public: + NodePool() + { + apr_pool_create_unmanaged(&pool); + } + + ~NodePool() + { + apr_pool_destroy(pool); + } + + Node* alloc() + { + return (Node *)apr_palloc(pool, sizeof(Node)); + } + + void clear() + { + apr_pool_clear(pool); + } + +private: + apr_pool_t* pool; +}; + +Node *make(int d, NodePool &store) +{ + Node* root = store.alloc(); + + if(d>0){ + root->l=make(d-1, store); + root->r=make(d-1, store); + }else{ + root->l=root->r=0; + } + + return root; +} + +int main(int argc, char *argv[]) +{ + Apr apr; + int min_depth = 4; + int max_depth = std::max(min_depth+2, + (argc == 2 ? atoi(argv[1]) : 10)); + int stretch_depth = max_depth+1; + + // Alloc then dealloc stretchdepth tree + { + NodePool store; + Node *c = make(stretch_depth, store); + std::cout << "stretch tree of depth " << stretch_depth << "\t " + << "check: " << c->check() << std::endl; + } + + NodePool long_lived_store; + Node *long_lived_tree = make(max_depth, long_lived_store); + + // buffer to store output of each thread + char *outputstr = (char*)malloc(LINE_SIZE * (max_depth +1) * sizeof(char)); + + #pragma omp parallel for + for (int d = min_depth; d <= max_depth; d += 2) + { + int iterations = 1 << (max_depth - d + min_depth); + int c = 0; + + // Create a memory pool for this thread to use. + NodePool store; + + for (int i = 1; i <= iterations; ++i) + { + Node *a = make(d, store); + c += a->check(); + store.clear(); + } + + // each thread write to separate location + sprintf(outputstr + LINE_SIZE * d, "%d\t trees of depth %d\t check: %d\n", + iterations, d, c); + } + + // print all results + for (int d = min_depth; d <= max_depth; d += 2) + printf("%s", outputstr + (d * LINE_SIZE) ); + free(outputstr); + + std::cout << "long lived tree of depth " << max_depth << "\t " + << "check: " << (long_lived_tree->check()) << "\n"; + + return 0; +} diff --git a/cpp_src/fannkuchredux.c++ b/cpp_src/fannkuchredux.c++ new file mode 100644 index 0000000..301e6e6 --- /dev/null +++ b/cpp_src/fannkuchredux.c++ @@ -0,0 +1,198 @@ +// The Computer Language Benchmarks Game +// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ +// +// Contributed by Dave Compton +// Based on "fannkuch-redux C gcc #5", contributed by Jeremy Zerfas +// which in turn was based on the Ada program by Jonathan Parker and +// Georg Bauhaus which in turn was based on code by Dave Fladebo, +// Eckehard Berns, Heiner Marxen, Hongwei Xi, and The Anh Tran and +// also the Java program by Oleg Mazurov. + +#include +#include +#include + +using namespace std; + +static int64_t fact[32]; + +void initializeFact(int n) +{ + fact[0] = 1; + for (auto i = 1; i <= n; ++i) + fact[i] = i * fact[i - 1]; +} + +class Permutation +{ + public: + Permutation(int n, int64_t start); + void advance(); + int64_t countFlips() const; + + private: + vector count; + vector current; + +}; + +// +// Initialize the current value of a permutation +// and the cycle count values used to advance . +// +Permutation::Permutation(int n, int64_t start) +{ + count.resize(n); + current.resize(n); + + // Initialize count + for (auto i = n - 1; i >= 0; --i) + { + auto d = start / fact[i]; + start = start % fact[i]; + count[i] = d; + } + + // Initialize current. + for (auto i = 0; i < n; ++i) + current[i] = i; + + for (auto i = n - 1; i >= 0; --i) + { + auto d = count[i]; + auto b = current.begin(); + rotate(b, b + d, b + i + 1); + } +} + +// +// Advance the current permutation to the next in sequence. +// +void Permutation::advance() +{ + for (auto i = 1; ;++i) + { + // Tried using std::rotate here but that was slower. + auto first = current[0]; + for (auto j = 0; j < i; ++j) + current[j] = current[j + 1]; + current[i] = first; + + ++(count[i]); + if (count[i] <= i) + break; + count[i] = 0; + } +} + +// +// Count the flips required to flip 0 to the front of the vector. +// +// Other than minor cosmetic changes, the following routine is +// basically lifted from "fannkuch-redux C gcc #5" +// +inline int64_t Permutation::countFlips() const +{ + const auto n = current.size(); + auto flips = 0; + auto first = current[0]; + if (first > 0) + { + flips = 1; + + int8_t temp[n]; + // Make a copy of current to work on. + for (size_t i = 0; i < n; ++i) + temp[i] = current[i]; + + + // Flip temp until the element at the first index is 0 + for (; temp[first] > 0; ++flips) + { + // Record the newFirst and restore the old + // first at its new flipped position. + const int8_t newFirst = temp[first]; + temp[first] = first; + + if (first > 2) + { + int64_t low = 1, high = first - 1; + do + { + swap(temp[low], temp[high]); + if (!(low + 3 <= high && low < 16)) + break; + ++low; + --high; + } while (1); + } + // Update first to newFirst that we recorded earlier. + first = newFirst; + } + } + return flips; +} + +int main(int argc, char **argv) +{ + const auto n = atoi(argv[1]); + + // Compute some factorials for later use. + initializeFact(n); + + // blockCount works best if it is set to a multiple of the number + // of CPUs so that the same number of blocks gets distributed to + // each cpu. The computer used for development (Intel i7-4700MQ) + // had 8 "CPU"s (4 cores with hyperthreading) so 8, 16 and 24 + // all worked well. + + auto blockCount = 24; + if (blockCount > fact[n]) + blockCount = 1; + const int64_t blockLength = fact[n] / blockCount; + + int64_t maxFlips = 0, checksum = 0; + + // Iterate over each block. + #pragma omp parallel for \ + reduction(max:maxFlips) \ + reduction(+:checksum) + + for (int64_t blockStart = 0; + blockStart < fact[n]; + blockStart += blockLength) + { + // first permutation for this block. + Permutation permutation(n, blockStart); + + // Iterate over each permutation in the block. + auto index = blockStart; + while (1) + { + const auto flips = permutation.countFlips(); + + if (flips) + { + if (index % 2 == 0) + checksum += flips; + else + checksum -= flips; + + if (flips > maxFlips) + maxFlips = flips; + } + + if (++index == blockStart + blockLength) + break; + + // next permutation for this block. + permutation.advance(); + } + } + + // Output the results to stdout. + cout << checksum << endl; + cout << "Pfannkuchen(" << n << ") = " << maxFlips << endl; + + return 0; +} diff --git a/cpp_src/fasta.c++ b/cpp_src/fasta.c++ new file mode 100644 index 0000000..3587583 --- /dev/null +++ b/cpp_src/fasta.c++ @@ -0,0 +1,291 @@ +/* The Computer Language Benchmarks Game +https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + +contributed by Sylvester Saguban +inspired from C++ G++ #7 from Rafal Rusin and contributors +- uses more features of C++17 and TS concept's 'auto' function parameters +- more efficient multi-threading, which sadly is the reason why this + this has more code than other implementations. +- data structure from array of struct to struct of arrays for better + cache locality and higher chance of vectorization + +compile with g++ 7.2 +with these flags -std=c++17 -march=native -O3 -msse -msse2 -msse3 +*/ + +#include +#include +#include +#include +#include +#include +#include + +struct Config +{ + static constexpr unsigned max_threads = 8; + static constexpr unsigned min_threads = 1; + static constexpr unsigned chars_per_line = 60; + static constexpr unsigned max_proc = 4; + static constexpr unsigned min_thread_work = (chars_per_line + 1) * 84; + static constexpr unsigned max_thread_buff = (chars_per_line + 1) * 1024; + + static constexpr unsigned im = 139968, ia = 3877, ic = 29573; + static constexpr float reciprocal = 1.0f / im; + static inline FILE* output = stdout; +}; + +class ThreadMgr +{ + struct Proc { + std::atomic once; + std::mutex mtx; + unsigned index, index_reset; + }; + std::atomic _out_sequence = 0; + std::mutex _gen_mutex, _out_mutex; + std::condition_variable _cv; + Proc _proc[ Config::max_proc ]; + unsigned _id_ctr = 0, _threads = 1, _proc_ctr = 0; +public: + struct Context { + unsigned index; + union { + alignas(16) std::array< int, Config::max_thread_buff > ibuff; + alignas(16) std::array< char, Config::max_thread_buff > cbuff; + }; + }; + + void SetThreadCount( unsigned count ) { + _threads = std::clamp( count, Config::min_threads, + Config::max_threads ); + } + + inline void SyncOutput( unsigned index, const auto& function ) { + std::unique_lock lock(_out_mutex); + _cv.wait( lock, [=]{ return index == _out_sequence; }); + function(); + ++_out_sequence; + _cv.notify_all(); + } + + auto Make( const std::string_view& text, unsigned size, auto output ) { + unsigned id = _id_ctr++; + auto& proc = _proc[ _proc_ctr++ ]; + return [=, &proc]( ThreadMgr::Context& tdata ) { + if( proc.once.exchange(false) ) + this->SyncOutput( id, [&]{ + std::fwrite( text.data(), 1, text.size(), Config::output ); + output( size, tdata ); + }); + }; + } + + auto Make( const std::string_view& text, unsigned size, + auto generate, auto convert, auto output ) { + unsigned work_count = std::clamp( size / _threads, + Config::min_thread_work, + Config::max_thread_buff ); + unsigned id = _id_ctr++; + unsigned limit = id + (size / work_count) + (size % work_count != 0); + _id_ctr = limit + 1; + auto& proc = _proc[ _proc_ctr++ ]; + proc.index_reset = id + 1; + + return [=, &proc]( ThreadMgr::Context& tdata ) { + if( proc.once.exchange(false) ) + this->SyncOutput( id, [&] { + std::fwrite( text.data(), 1, text.size(), Config::output ); + }); + + unsigned start_index = id + 1; + while(true) { + unsigned cvsize; + { + std::scoped_lock lock(_gen_mutex); + { + std::scoped_lock lock(proc.mtx); + if( proc.index > limit ) + break; + tdata.index = proc.index++; + } + cvsize = std::min( work_count, size - + (((tdata.index - start_index) + * work_count)) ); + generate( cvsize, tdata ); + } + + unsigned char_count = convert( cvsize, start_index, + work_count, tdata ); + + this->SyncOutput( tdata.index, [&] { + if( tdata.index == limit && + tdata.cbuff[char_count-1] != '\n' ) + tdata.cbuff[char_count++] = '\n'; + output( char_count, tdata ); + }); + } + }; + } + + void RunSequence( auto... f ){ + static_assert( sizeof...(f) <= Config::max_proc, + "function chain is too large, " + "increase Config::max_proc." ); + std::thread threads[ Config::max_threads ]; + + _out_sequence = 0; + _proc_ctr = 0; + for( auto& i : _proc ) { + i.once = true; + i.index = i.index_reset; + } + + for( unsigned i = 0; i < _threads; ++i ) + threads[i] = std::thread( [=]{ + Context tdata = {0}; + (f( tdata ), ...); + }); + + for( unsigned i = 0; i < _threads; ++i ) + threads[i].join(); + } +} gdata; + +int main(int argc, char *argv[]) +{ + static constexpr char alu[] = + "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAG" + "GTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGC" + "CGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCC" + "GGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTC" + "CGTCTCAAAAA"; + + static constexpr std::array< std::pair< float, char >, 15> iub = {{ + { 0.27f, 'a' }, { 0.12f, 'c' }, { 0.12f, 'g' }, { 0.27f, 't' }, + { 0.02f, 'B' }, { 0.02f, 'D' }, { 0.02f, 'H' }, { 0.02f, 'K' }, + { 0.02f, 'M' }, { 0.02f, 'N' }, { 0.02f, 'R' }, { 0.02f, 'S' }, + { 0.02f, 'V' }, { 0.02f, 'W' }, { 0.02f, 'Y' } + }}; + + static constexpr std::array< std::pair< float, char >, 4> homosapiens = {{ + { 0.3029549426680f, 'a' }, + { 0.1979883004921f, 'c' }, + { 0.1975473066391f, 'g' }, + { 0.3015094502008f, 't' } + }}; + + static auto make_commulative = []( const auto& p ) { + struct Ret { + alignas(16) std::array< float, p.size() > real; + alignas(16) std::array< char, p.size() > ch; + } ret{{p[0].first}, {p[0].second}}; + + for(unsigned i = 1; i < p.size(); ++i ) { + ret.real[i] = ret.real[i-1] + p[i].first; + ret.ch[i] = p[i].second; + } + return ret; + }; + + static auto get_random = [] { + static unsigned last = 42; + return (last = (last * Config::ia + Config::ic) % Config::im); + }; + + static auto convert = []( const auto& data, unsigned size, + unsigned start_index, unsigned max_work, + ThreadMgr::Context& tdata ) { + auto iitr = tdata.ibuff.begin(), iend = iitr + size; + auto citr = tdata.cbuff.begin(); + auto inner_loop = [&]( auto end ) { + while( iitr != end ) { + auto f = *iitr++ * Config::reciprocal; + auto i = std::find_if( data.real.begin(), + data.real.end(), + [f]( auto t ){ return f <= t; }); + *citr++ = data.ch[ i - data.real.begin() ]; + } + *citr++ = '\n'; + }; + + unsigned aligner_count = Config::chars_per_line - + ((max_work * (tdata.index - start_index)) + % Config::chars_per_line); + + if( aligner_count > 0 ) + inner_loop( iitr + aligner_count ); + + unsigned odd_count = (iend - iitr) % Config::chars_per_line; + auto iend_blocks = iend - odd_count; + while( iitr < iend_blocks ) + inner_loop( iitr + Config::chars_per_line ); + + if( odd_count ) { + inner_loop( iitr + odd_count ); + citr -= 1; + } + return citr - tdata.cbuff.begin(); + }; + + auto generate = []( unsigned size, ThreadMgr::Context& tdata ) { + for( auto i = tdata.ibuff.begin(), end = i + size; i != end; ++i ) + *i = get_random(); + }; + + auto output = []( unsigned size, ThreadMgr::Context& tdata ) { + std::fwrite( tdata.cbuff.data(), 1, size, Config::output ); + }; + + auto alu_out = [&]( unsigned size, ThreadMgr::Context& ) { + static constexpr auto alu_data = [&] { + constexpr unsigned char_count = (Config::chars_per_line + 1) + * (sizeof(alu) - 1); + + std::array< char, char_count > ret{}; + for(unsigned i = 0, j = 0; i < char_count; ++i, ++j) { + unsigned ul = std::min(i + Config::chars_per_line, char_count); + for(; i < ul; ++i ) + ret[i] = alu[ (i - j) % (sizeof(alu) - 1)]; + ret[i] = '\n'; + } + return ret; + }(); + + constexpr unsigned char_count = Config::chars_per_line * + (sizeof(alu) - 1); + for(; size >= char_count; size -= char_count ) + std::fwrite( alu_data.data(), 1, alu_data.size(), Config::output ); + std::fwrite( alu_data.data(), 1, size + + (size / Config::chars_per_line), Config::output ); + std::putc('\n', Config::output ); + }; + + auto iub_convert = []( unsigned size, unsigned start_index, + unsigned max_work, ThreadMgr::Context& tdata ) { + static constexpr auto iub_data = make_commulative( iub ); + return convert( iub_data, size, start_index, max_work, tdata ); + }; + + auto homo_convert = []( unsigned size, unsigned start_index, + unsigned max_work, ThreadMgr::Context& tdata ) { + static constexpr auto hom_data = make_commulative( homosapiens ); + return convert( hom_data, size, start_index, max_work, tdata ); + }; + + gdata.SetThreadCount( std::thread::hardware_concurrency() ); + + unsigned n = std::max( 1000, std::atoi( argv[1] ) ); + + const auto alu_proc = gdata.Make( ">ONE Homo sapiens alu\n", + n * 2, alu_out ); + + const auto iub_proc = gdata.Make( ">TWO IUB ambiguity codes\n", + n * 3, generate, iub_convert, output ); + + const auto homo_proc = gdata.Make( ">THREE Homo sapiens frequency\n", + n * 5, generate, homo_convert, output ); + + gdata.RunSequence( alu_proc, iub_proc, homo_proc ); + return 0; +} diff --git a/cpp_src/knucleotide.c++ b/cpp_src/knucleotide.c++ new file mode 100644 index 0000000..8d9633d --- /dev/null +++ b/cpp_src/knucleotide.c++ @@ -0,0 +1,220 @@ +// The Computer Language Benchmarks Game +// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ +// +// Contributed by Sylvester Saguban +// taken some inspirations from C++ G++ #3 from Branimir Maksimovic +// +// Improvements to algorithm of C++ G++ #3: +// - Only doing incremental update to key instead of +// recomputing it for every insert to hash table, +// this aligns it to the other fast implementations. +// Notably C GCC, Rust #6, Rust #4. +// - Returning the hash object by moving instead of +// returning by copy. +// - Using std::thread instead of std::async so routines +// are guaranteed to run on their own threads. + +// Improvements aimed at better compiler optimizations: +// - Passing the count/string length as a function +// template argument so it is known in advance by +// the compiler. For programming languages without +// value generics, the same optimization can done +// by making those values as constants. +// - 'Key' class uses this template value so 'size' +// member variable is not needed inside the class, +// this also reduced the memory usage against the +// original implementation. + +// compile with these flags: +// -std=c++17 -march=native -msse -msse2 -msse3 -O3 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct Cfg { + static constexpr size_t thread_count = 4; + static constexpr unsigned to_char[4] = {'A', 'C', 'T', 'G'}; + static inline unsigned char to_num[128]; + using Data = std::vector; + + Cfg() { + to_num['A'] = to_num['a'] = 0; + to_num['C'] = to_num['c'] = 1; + to_num['T'] = to_num['t'] = 2; + to_num['G'] = to_num['g'] = 3; + } +} const cfg; + +template +struct Key +{ + // select type to use for 'data', if hash key can fit on 32-bit integer + // then use uint32_t else use uint64_t. + using Data = typename std::conditional::type; + + struct Hash { + Data operator()(const Key& t)const{ return t._data; } + }; + + Key(Data data) : _data(data) { + } + + // uses std::string_view instead of std::string because std::string always + // allocates a copy from the heap. while std::string_view is only a wrapper + // of a pointer and a size + Key(const std::string_view& str) { + _data = 0; + for(unsigned i = 0; i < size; ++i){ + _data <<= 2; + _data |= cfg.to_num[unsigned(str[i])]; + } + } + + // initialize hash from input data + void InitKey(auto data){ + for(unsigned i = 0; i < size; ++i){ + _data <<= 2; + _data |= data[i]; + } + } + + // updates the key with 1 byte + void UpdateKey(auto data){ + _data <<= 2; + _data |= data; + } + + // masks out excess information + void MaskKey(){ + _data &= _mask; + } + + // implicit casting operator to string + operator std::string() const { + std::string tmp; + Data data = _data; + for(size_t i = 0; i != size; ++i, data >>= 2) + tmp += cfg.to_char[data & 3ull]; + std::reverse(tmp.begin(), tmp.end()); + return std::move(tmp); + } + + bool operator== (const Key& in) const { + return _data == in._data; + } +private: + static constexpr Data _mask = ~(Data(-1) << (2 * size)); + Data _data; +}; + +template > +using HashTable = __gnu_pbds::cc_hash_table; + +template +void Calculate(const Cfg::Data& input, size_t begin, HashTable& table) +{ + // original implementation fully recomputes the hash key for each + // insert to the hash table. This implementation only partially + // updates the hash, this is the same with C GCC, Rust #6 and Rust #4 + Key key(0); + // initialize key + key.InitKey(input.data() + begin); + // use key to increment value + ++table[key]; + + auto itr_begin = input.data() + begin + cfg.thread_count; + auto itr_end = (input.data() + input.size() + 1) - size; + for(;itr_begin < itr_end; itr_begin += cfg.thread_count) { + // update the key 1 byte at a time + constexpr size_t nsize = std::min(size, cfg.thread_count); + for(unsigned i = 0; i < nsize; ++i) + key.UpdateKey( itr_begin[i] ); + // then finally mask out excess information + key.MaskKey(); + // then use key to increment value + ++table[key]; + } +} + +template +auto CalculateInThreads(const Cfg::Data& input) +{ + HashTable hash_tables[cfg.thread_count]; + std::thread threads[cfg.thread_count]; + + auto invoke = [&](unsigned begin) { + Calculate(input, begin, hash_tables[begin]); + }; + + for(unsigned i = 0; i < cfg.thread_count; ++i) + threads[i] = std::thread(invoke, i); + + for(auto& i : threads) + i.join(); + + auto& frequencies = hash_tables[0]; + for(unsigned i = 1 ; i < cfg.thread_count; ++i) + for(auto& j : hash_tables[i]) + frequencies[j.first] += j.second; + // return the 'frequency' by move instead of copy. + return std::move(frequencies); +} + +template +void WriteFrequencies(const Cfg::Data& input) +{ + // we "receive" the returned object by move instead of copy. + auto&& frequencies = CalculateInThreads(input); + std::map> freq; + for(const auto& i: frequencies) + freq.insert({i.second, i.first}); + + const unsigned sum = input.size() + 1 - size; + for(const auto& i : freq) + std::cout << i.second << ' ' << (sum ? double(100 * i.first) / sum : 0.0) << '\n'; + std::cout << '\n'; +} + +template +void WriteCount( const Cfg::Data& input, const std::string& text ) { + // we "receive" the returned object by move instead of copy. + auto&& frequencies = CalculateInThreads(input); + std::cout << frequencies[Key(text)] << '\t' << text << '\n'; +} + +int main() +{ + Cfg::Data data; + std::array buf; + + while(fgets(buf.data(), buf.size(), stdin) && memcmp(">THREE", buf.data(), 6)); + while(fgets(buf.data(), buf.size(), stdin) && buf.front() != '>') { + if(buf.front() != ';'){ + auto i = std::find(buf.begin(), buf.end(), '\n'); + data.insert(data.end(), buf.begin(), i); + } + } + std::transform(data.begin(), data.end(), data.begin(), [](auto c){ + return cfg.to_num[c]; + }); + std::cout << std::setprecision(3) << std::setiosflags(std::ios::fixed); + + WriteFrequencies<1>(data); + WriteFrequencies<2>(data); + // value at left is the length of the passed string. + WriteCount<3>(data, "GGT"); + WriteCount<4>(data, "GGTA"); + WriteCount<6>(data, "GGTATT"); + WriteCount<12>(data, "GGTATTTTAATT"); + WriteCount<18>(data, "GGTATTTTAATTTATAGT"); +} diff --git a/cpp_src/make.jl b/cpp_src/make.jl new file mode 100644 index 0000000..5ddd2b1 --- /dev/null +++ b/cpp_src/make.jl @@ -0,0 +1,10 @@ +cd(@__DIR__) +include("make_flags.jl") +for cmd in make_cmds + try + run(`g++ $cmd`) + @info "success" + catch e + @warn "failed command" exception = e + end +end diff --git a/cpp_src/make_flags.jl b/cpp_src/make_flags.jl new file mode 100644 index 0000000..df1e723 --- /dev/null +++ b/cpp_src/make_flags.jl @@ -0,0 +1,22 @@ +make_cmds = [ + ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-fopenmp", "-I/usr/include/apr-1.0", "binarytrees.c++", "-o", "binarytrees.c++.o"], + ["binarytrees.c++.o", "-o", "binarytrees", "-fopenmp", "-lapr-1"], + ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-std=c++11", "-fopenmp", "fannkuchredux.c++", "-o", "fannkuchredux.c++.o"], + ["fannkuchredux.c++.o", "-o", "fannkuchredux", "-fopenmp"], + ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-mfpmath=sse", "-msse3", "-std=c++17", "fasta.c++", "-o", "fasta.c++.o"], + ["fasta.c++.o", "-o", "fasta", "-lpthread"], + ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-std=c++17", "knucleotide.c++", "-o", "knucleotide.c++.o"], + ["knucleotide.c++.o", "-o", "knucleotide", "-lpthread"], + ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-mfpmath=sse", "-msse3", "-fopenmp", "mandelbrot.c++", "-o", "mandelbrot.c++.o"], + ["mandelbrot.c++.o", "-o", "mandelbrot", "-fopenmp"], + ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-mfpmath=sse", "-msse3", "nbody.c++", "-o", "nbody.c++.o"], + ["nbody.c++.o", "-o", "nbody"], + ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-std=c++14", "-g", "pidigits.c++", "-o", "pidigits.c++.o"], + ["pidigits.c++.o", "-o", "pidigits", "-lgmp", "-lgmpxx"], + ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-std=c++17", "-fopenmp", "-flto", "regexredux.c++", "-o", "regexredux.c++.o"], + ["regexredux.c++.o", "-o", "regexredux", "-fopenmp", "-lpcre"], + ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-std=c++11", "-mtune=native", "-mfpmath=sse", "-msse2", "revcomp.c++", "-o", "revcomp.c++.o"], + ["revcomp.c++.o", "-o", "revcomp", "-pthread"], + ["-c", "-pipe", "-O3", "-fomit-frame-pointer", "-march=native", "-mfpmath=sse", "-msse3", "-fopenmp", "spectralnorm.c++", "-o", "spectralnorm.c++.o"], + ["spectralnorm.c++.o", "-o", "spectralnorm", "-fopenmp"], +] diff --git a/cpp_src/mandelbrot.c++ b/cpp_src/mandelbrot.c++ new file mode 100644 index 0000000..17a13b8 --- /dev/null +++ b/cpp_src/mandelbrot.c++ @@ -0,0 +1,207 @@ +// The Computer Language Benchmarks Game +// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ +// +// Contributed by Kevin Miller ( as C code ) +// +// Ported to C++ with minor changes by Dave Compton +// Optimized to x86 by Kenta Yoshimura +// +// Compile with following g++ flags +// Use '-O3 -ffp-contract=off -fno-expensive-optimizations' instead of '-Ofast', +// because FMA is fast, but different precision to original version +// -Wall -O3 -ffp-contract=off -fno-expensive-optimizations -march=native -fopenmp --std=c++14 mandelbrot.cpp + +#include +#include +#include + +using namespace std; + +namespace { + +#if defined(__AVX512BW__) + typedef __m512d Vec; + Vec vec_init(double value) { return _mm512_set1_pd(value); } + bool vec_is_any_le(Vec v, Vec f) { return bool(_mm512_cmp_pd_mask(v, f, _CMP_LE_OS)); } + int vec_is_le(Vec v1, Vec v2) { return _mm512_cmp_pd_mask(v1, v2, _CMP_LE_OS); } + const uint8_t k_bit_rev[] = + { + 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, + 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, + 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, + 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, + 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, + 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, + 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, + 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, + 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, + 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, + 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, + 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, + 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, + 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, + 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, + 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF + }; +#elif defined(__AVX__) + typedef __m256d Vec; + Vec vec_init(double value) { return _mm256_set1_pd(value); } + bool vec_is_any_le(Vec v, Vec f) { Vec m = v<=f; return ! _mm256_testz_pd(m, m); } + int vec_is_le(Vec v1, Vec v2) { return _mm256_movemask_pd(v1 <= v2); } + const uint8_t k_bit_rev[] = + { + 0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110, + 0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111 + }; +#elif defined(__SSE4_1__) + typedef __m128d Vec; + Vec vec_init(double value) { return _mm_set1_pd(value); } + bool vec_is_any_le(Vec v, Vec f) { __m128i m = __m128i(v<=f); return ! _mm_testz_si128(m, m); } + int vec_is_le(Vec v1, Vec v2) { return _mm_movemask_pd(v1 <= v2); } + const uint8_t k_bit_rev[] = { 0b00, 0b10, 0b01, 0b11 }; +#elif defined(__SSSE3__) + typedef __m128d Vec; + Vec vec_init(double value) { return _mm_set1_pd(value); } + bool vec_is_any_le(Vec v, Vec f) { return bool(_mm_movemask_pd(v<=f)); } + int vec_is_le(Vec v1, Vec v2) { return _mm_movemask_pd(v1 <= v2); } + const uint8_t k_bit_rev[] = { 0b00, 0b10, 0b01, 0b11 }; +#endif + + constexpr int k_vec_size = sizeof(Vec) / sizeof(double); + + // Return true iff all of 8 members of vector v1 is + // NOT less than or equal to v2. + bool vec_all_nle(const Vec* v1, Vec v2) + { + for ( auto i = 0; i < 8/k_vec_size; i++ ) { + if ( vec_is_any_le(v1[i], v2) ) { + return false; + } + } + return true; + } + + // Return 8 bit value with bits set iff cooresponding + // member of vector value is less than or equal to limit. + unsigned pixels(const Vec* value, Vec limit) + { + unsigned res = 0; + for ( auto i = 0; i < 8/k_vec_size; i++ ) { + res <<= k_vec_size; + res |= k_bit_rev[vec_is_le(value[i], limit)]; + } + return res; + } + + // + // Do one iteration of mandelbrot calculation for a vector of eight + // complex values. Using Vec to work with groups of doubles speeds + // up computations. + // + void calcSum(Vec* real, Vec* imag, Vec* sum, const Vec* init_real, Vec init_imag) + { + for ( auto vec = 0; vec < 8/k_vec_size; vec++ ) { + auto r2 = real[vec] * real[vec]; + auto i2 = imag[vec] * imag[vec]; + auto ri = real[vec] * imag[vec]; + + sum[vec] = r2 + i2; + + real[vec]=r2 - i2 + init_real[vec]; + imag[vec]=ri + ri + init_imag; + } + } + + // + // Do 50 iterations of mandelbrot calculation for a vector of eight + // complex values. Check occasionally to see if the iterated results + // have wandered beyond the point of no return (> 4.0). + // + unsigned mand8(bool to_prune, const Vec* init_real, Vec init_imag) + { + Vec k4_0 = vec_init(4.0); + Vec real[8 / k_vec_size]; + Vec imag[8 / k_vec_size]; + Vec sum[8 / k_vec_size]; + for ( auto k = 0; k < 8/k_vec_size; k++ ) { + real[k] = init_real[k]; + imag[k] = init_imag; + } + + if ( to_prune ) { + // 4*12 + 2 = 50 + for ( auto j = 0; j < 12; j++ ) { + for ( auto k = 0; k < 4; k++ ) { + calcSum(real, imag, sum, init_real, init_imag); + } + if ( vec_all_nle(sum, k4_0) ) { + return 0; // prune + } + } + calcSum(real, imag, sum, init_real, init_imag); + calcSum(real, imag, sum, init_real, init_imag); + } else { + // 6*8 + 2 = 50 + for ( auto j = 0; j < 8; j++ ) { + for ( auto k = 0; k < 6; k++ ) { + calcSum(real, imag, sum, init_real, init_imag); + } + } + calcSum(real, imag, sum, init_real, init_imag); + calcSum(real, imag, sum, init_real, init_imag); + } + + return pixels(sum, k4_0); + } + +} // namespace + +int main(int argc, char ** argv) +{ + // get width/height from arguments + + auto wid_ht = 16000; + if ( argc >= 2 ) { + wid_ht = atoi(argv[1]); + } + + // round up to multiple of 8 + wid_ht = -(-wid_ht & -8); + auto width = wid_ht; + auto height = wid_ht; + + // allocate memory for pixels. + auto dataLength = height*(width>>3); + auto pixels = new uint8_t[dataLength]; + + // calculate initial x values, store in r0 + Vec r0[width / k_vec_size]; + double* r0_ = reinterpret_cast(r0); + for ( auto x = 0; x < width; x++ ) { + r0_[x] = 2.0 / width * x - 1.5; + } + + // generate the bitmap + + // process 8 pixels (one byte) at a time + #pragma omp parallel for schedule(guided) + for ( auto y = 0; y < height; y++ ) { + // all 8 pixels have same y value (iy). + auto iy = 2.0 / height * y - 1.0; + Vec init_imag = vec_init(iy); + auto rowstart = y*width/8; + bool to_prune = false; + for ( auto x = 0; x < width; x += 8 ) { + auto res = mand8(to_prune, &r0[x/k_vec_size], init_imag); + pixels[rowstart + x/8] = res; + to_prune = ! res; + } + } + + // write the data + printf("P4\n%d %d\n", width, height); + fwrite(pixels, 1, dataLength, stdout); + delete[] pixels; + + return 0; +} diff --git a/cpp_src/nbody.c++ b/cpp_src/nbody.c++ new file mode 100644 index 0000000..a364ea7 --- /dev/null +++ b/cpp_src/nbody.c++ @@ -0,0 +1,182 @@ +/* The Computer Language Benchmarks Game + https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + + contributed by Mark C. Lewis + modified slightly by Chad Whipkey + converted from java to c++,added sse support, by Branimir Maksimovic + converted from c++ to c, by Alexey Medvedchikov + converted from c to c++11, by Walter Landry +*/ + +#include +#include +#include +#include +#include + +constexpr double PI(3.141592653589793); +constexpr double SOLAR_MASS ( 4 * PI * PI ); +constexpr double DAYS_PER_YEAR(365.24); + +struct body { + double x[3], fill, v[3], mass; + constexpr body(double x0, double x1, double x2, double v0, double v1, double v2, double Mass): + x{x0,x1,x2}, fill(0), v{v0,v1,v2}, mass(Mass) {} +}; + +class N_Body_System +{ + static std::array bodies; + + void offset_momentum() + { + unsigned int k; + for(auto &body: bodies) + for(k = 0; k < 3; ++k) + bodies[0].v[k] -= body.v[k] * body.mass / SOLAR_MASS; + } + +public: + N_Body_System() + { + offset_momentum(); + } + void advance(double dt) + { + constexpr unsigned int N = ((bodies.size() - 1) * bodies.size()) / 2; + + static double r[N][4]; + static double mag[N]; + + unsigned int i, m; + __m128d dx[3], dsquared, distance, dmag; + + i=0; + for(auto bi(bodies.begin()); bi!=bodies.end(); ++bi) + { + auto bj(bi); + for(++bj; bj!=bodies.end(); ++bj, ++i) + for (m=0; m<3; ++m) + r[i][m] = bi->x[m] - bj->x[m]; + } + + for (i=0; iv[m] -= r[i][m] * bj->mass * mag[i]; + bj->v[m] += r[i][m] * bi->mass * mag[i]; + } + } + + for(auto &body: bodies) + for(m=0; m<3; ++m) + body.x[m] += dt * body.v[m]; + } + + double energy() + { + double e(0.0); + for(auto bi(bodies.begin()); bi!=bodies.end(); ++bi) + { + e += bi->mass * ( bi->v[0] * bi->v[0] + + bi->v[1] * bi->v[1] + + bi->v[2] * bi->v[2] ) / 2.; + + auto bj(bi); + for(++bj; bj!=bodies.end(); ++bj) + { + std::array dx; + for(auto k=0; k<3; ++k) + dx[k] = bi->x[k] - bj->x[k]; + + double distance(0); + for(auto &d: dx) + distance+=d*d; + distance=std::sqrt(distance); + e -= (bi->mass * bj->mass) / distance; + } + } + return e; + } +}; + + +std::array N_Body_System::bodies{{ + /* sun */ + body(0., 0., 0. , + 0., 0., 0. , + SOLAR_MASS), + /* jupiter */ + body(4.84143144246472090e+00, + -1.16032004402742839e+00, + -1.03622044471123109e-01 , + 1.66007664274403694e-03 * DAYS_PER_YEAR, + 7.69901118419740425e-03 * DAYS_PER_YEAR, + -6.90460016972063023e-05 * DAYS_PER_YEAR , + 9.54791938424326609e-04 * SOLAR_MASS + ), + /* saturn */ + body(8.34336671824457987e+00, + 4.12479856412430479e+00, + -4.03523417114321381e-01 , + -2.76742510726862411e-03 * DAYS_PER_YEAR, + 4.99852801234917238e-03 * DAYS_PER_YEAR, + 2.30417297573763929e-05 * DAYS_PER_YEAR , + 2.85885980666130812e-04 * SOLAR_MASS + ), + /* uranus */ + body(1.28943695621391310e+01, + -1.51111514016986312e+01, + -2.23307578892655734e-01 , + 2.96460137564761618e-03 * DAYS_PER_YEAR, + 2.37847173959480950e-03 * DAYS_PER_YEAR, + -2.96589568540237556e-05 * DAYS_PER_YEAR , + 4.36624404335156298e-05 * SOLAR_MASS + ), + /* neptune */ + body(1.53796971148509165e+01, + -2.59193146099879641e+01, + 1.79258772950371181e-01 , + 2.68067772490389322e-03 * DAYS_PER_YEAR, + 1.62824170038242295e-03 * DAYS_PER_YEAR, + -9.51592254519715870e-05 * DAYS_PER_YEAR , + 5.15138902046611451e-05 * SOLAR_MASS + ) + }}; + +int main(int , char** argv) +{ + int i, n = atoi(argv[1]); + N_Body_System system; + + printf("%.9f\n", system.energy()); + for (i = 0; i < n; ++i) + system.advance(0.01); + printf("%.9f\n", system.energy()); + + return 0; +} diff --git a/cpp_src/pidigits.c++ b/cpp_src/pidigits.c++ new file mode 100644 index 0000000..43798de --- /dev/null +++ b/cpp_src/pidigits.c++ @@ -0,0 +1,68 @@ +/* The Computer Language Benchmarks Game + * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + * + * contributed by Alessandro Power + */ + +#include +#include +#include + +class LFT { +public: + mpz_class q; + mpz_class r; + mpz_class t; + unsigned k; + +public: + LFT() : q(1), r(0), t(1), k(0){}; + + void next() { + ++k; + r = (2 * k + 1) * (2 * q + r); + t = (2 * k + 1) * t; + q = q * k; + } + + unsigned extract(unsigned x) const { + static mpz_class tmp0, tmp1; + tmp0 = q * x + r; + tmp1 = tmp0 / t; + return tmp1.get_ui(); + } + + void produce(unsigned n) { + q = 10 * q; + r = 10 * (r - n * t); + } +}; + +int main(int, char** argv) { + std::ios_base::sync_with_stdio(false); + + const std::size_t TOTAL_DIGITS = std::atol(argv[1]); + + LFT lft; + std::size_t n_digits = 0; + while (n_digits < TOTAL_DIGITS) { + std::size_t i = 0; + while (i < 10 and n_digits < TOTAL_DIGITS) { + lft.next(); + if (lft.q > lft.r) continue; + + auto digit = lft.extract(3); + if (digit == lft.extract(4)) { + std::cout << digit; + lft.produce(digit); + ++i; + ++n_digits; + } + } + + // Pad digits with extra spaces if TOTAL_DIGITS was not a + // multiple of 10. + for (; i < 10; ++i) std::cout << ' '; + std::cout << "\t:" << n_digits << '\n'; + } +} diff --git a/cpp_src/regexredux.c++ b/cpp_src/regexredux.c++ new file mode 100644 index 0000000..5bf82f3 --- /dev/null +++ b/cpp_src/regexredux.c++ @@ -0,0 +1,141 @@ +/* The Computer Language Benchmarks Game + https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + + contributed by Filip Sajdak +*/ + +#include +#include +#include +#include +#include + +template +auto reserve(size_t size) { + T out; + out.reserve(size); + return out; +} + +template +auto load(std::istream& in) { + auto str = reserve(initial_size); + auto buffer = std::array(); + + while (in) { + in.read(buffer.data(), buffer.size()); + str.append(buffer.cbegin(), buffer.cbegin()+in.gcount()); + } + + return str; +} + +template +auto make_unique_with_deleter(T* ptr, Deleter&& deleter) +{ + return std::unique_ptr(ptr, std::forward(deleter)); +} + + auto make_regex(const std::string_view pattern) { + char const* error; + int offset; + return make_unique_with_deleter( + pcre_compile(pattern.data(), 0, &error, &offset, NULL), pcre_free); + } + + auto make_aid(const pcre* regex) { + char const* error; + return make_unique_with_deleter( + pcre_study(regex, PCRE_STUDY_JIT_COMPILE, &error), pcre_free_study); + } + +static void replace(const std::string_view pattern, const std::string_view replacement, + const std::string_view source, std::string& output, pcre_jit_stack* const stack){ + + const auto regex = make_regex(pattern); + const auto aid = make_aid(regex.get()); + + int pos = 0; + + for(int match[3]; pcre_jit_exec(regex.get(), aid.get(), source.data(), + source.size(), pos, 0, match, 3, stack) >= 0; pos = match[1]){ + output.append(std::cbegin(source) + pos, std::cbegin(source) + match[0]); + output.append(std::cbegin(replacement), std::cend(replacement)); + } + + output.append(std::cbegin(source) + pos, std::cend(source)); +} + + +int main(void){ + std::ios::sync_with_stdio(false); + + char const * const count_Info[]={ + "agggtaaa|tttaccct", + "[cgt]gggtaaa|tttaccc[acg]", + "a[act]ggtaaa|tttacc[agt]t", + "ag[act]gtaaa|tttac[agt]ct", + "agg[act]taaa|ttta[agt]cct", + "aggg[acg]aaa|ttt[cgt]ccct", + "agggt[cgt]aa|tt[acg]accct", + "agggta[cgt]a|t[acg]taccct", + "agggtaa[cgt]|[acg]ttaccct" + }, * const replace_Info[][2]={ + {"tHa[Nt]", "<4>"}, + {"aND|caN|Ha[DS]|WaS", "<3>"}, + {"a[NSt]|BY", "<2>"}, + {"<[^>]*>", "|"}, + {"\\|[^|][^|]*\\|", "-"} + }; + + auto sequences = reserve(16384); + size_t postreplace_Size = 0; + + std::string input = load(std::cin); + + #pragma omp parallel + { + auto stack = make_unique_with_deleter( + pcre_jit_stack_alloc(16384, 16384), pcre_jit_stack_free); + + #pragma omp single + { + replace(">.*\\n|\\n", "", input, sequences, stack.get()); + } + + #pragma omp single nowait + { + auto prereplace_String = sequences; + auto postreplace_String = reserve(sequences.capacity()); + + for ( const auto& [pattern, replacement] : replace_Info ) { + postreplace_String.clear(); + replace(pattern, replacement, + prereplace_String, postreplace_String, stack.get()); + std::swap(prereplace_String, postreplace_String); + } + + postreplace_Size = prereplace_String.size(); + } + + #pragma omp for schedule(dynamic) ordered + for(auto i=0u; i < std::size(count_Info); i++){ + + auto regex = make_regex(count_Info[i]); + auto aid = make_aid(regex.get()); + + int count = 0; + for(int pos = 0, match[3]; pcre_jit_exec(regex.get(), aid.get(), + sequences.data(), sequences.size(), pos, 0, match, 3, + stack.get()) >= 0; pos=match[1]) { + ++count; + } + + #pragma omp ordered + printf("%s %d\n", count_Info[i], count); + } + } + + printf("\n%zu\n%zu\n%zu\n", input.size(), sequences.size(), postreplace_Size); + return 0; +} diff --git a/cpp_src/revcomp.c++ b/cpp_src/revcomp.c++ new file mode 100644 index 0000000..275d96b --- /dev/null +++ b/cpp_src/revcomp.c++ @@ -0,0 +1,282 @@ +/* The Computer Language Benchmarks Game +https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + +Contributed by Andrew Moon +*/ + +#include +#include +#include +#include +#include +#include + +struct CPUs { + CPUs() { + cpu_set_t cs; + CPU_ZERO( &cs ); + sched_getaffinity( 0, sizeof(cs), &cs ); + count = 0; + for ( size_t i = 0; i < CPU_SETSIZE; i++ ) + count += CPU_ISSET( i, &cs ) ? 1 : 0; + count = std::max( count, size_t(1) ); + } + + size_t count; +} cpus; + +struct ReverseLookup { + ReverseLookup( const char *from, const char *to ) { + for ( int i = 0; i < 256; i++ ) + byteLookup[i] = i; + for ( ; *from && *to; from++, to++ ) { + byteLookup[toupper(*from)] = *to; + byteLookup[tolower(*from)] = *to; + } + + for ( int i = 0; i < 256; i++ ) + for ( int j = 0; j < 256; j++ ) + wordLookup[(i << 8) | j] = ( byteLookup[j] << 8 ) | byteLookup[i]; + } + + char operator[]( const char &c ) { return (char )byteLookup[(unsigned char )c]; } + short operator[]( const short &s ) { return (short )wordLookup[(unsigned short )s]; } + +protected: + unsigned char byteLookup[256]; + unsigned short wordLookup[256*256]; +} lookup( "acbdghkmnsrutwvy", "TGVHCDMKNSYAAWBR" ); + +template< class type > +struct vector2 : public std::vector { + type &last() { return this->operator[]( std::vector::size() -1 ); } +}; + +struct Chunker { + enum { lineLength = 60, chunkSize = 65536, }; + + Chunker( int seq ) : id(seq) {} + + struct Chunk { + Chunk() {} + Chunk( char *in, size_t amt ) : data(in), size(amt) {} + char *data; + size_t size; + }; + + void NewChunk() { + size_t cur = mark - chunkBase; + chunks.push_back( Chunk( chunkBase, cur ) ); + chunkBase += ( cur + ( cur & 1 ) ); // keep it word aligned + mark = chunkBase; + } + + template< int N > + struct LinePrinter { + LinePrinter() : lineFill(0) {} + void endofblock() { if ( lineFill ) newline(); } + void emit( const char *str, size_t amt ) { + fwrite_unlocked( str, 1, amt, stdout ); + } + void emit( char c ) { fputc_unlocked( c, stdout ); } + void emitnewline() { emit( '\n' ); } + void emitlines( char *data, size_t size ) { + if ( lineFill ) { + size_t toprint = std::min( size, lineLength - lineFill ); + emit( data, toprint ); + size -= toprint; + data += toprint; + lineFill += toprint; + if ( lineFill == lineLength ) + newline(); + } + + while ( size >= lineLength ) { + emit( data, lineLength ); + emitnewline(); + size -= lineLength; + data += lineLength; + } + + if ( size ) { + lineFill = size; + emit( data, size ); + } + } + void newline() { lineFill = 0; emitnewline(); } + void reset() { lineFill = 0; } + protected: + size_t lineFill; + }; + + void Print() { + int prevId = -( id - 1 ); + while ( __sync_val_compare_and_swap( &printQueue, prevId, id ) != prevId ) + sched_yield(); + + fwrite_unlocked( name, 1, strlen( name ), stdout ); + static LinePrinter<65536*2> line; + line.reset(); + for ( int i = int(chunks.size()) - 1; i >= 0; i-- ) + line.emitlines( chunks[i].data, chunks[i].size ); + line.endofblock(); + + __sync_val_compare_and_swap( &printQueue, id, -id ); + } + + // fseek on stdin seems flaky so this hack. not called often + void Backup() { + while ( true ) { + if ( fgetc_unlocked( stdin ) == '>' ) { + fseek( stdin, -1, SEEK_CUR ); + return; + } + fseek( stdin, -2, SEEK_CUR ); + } + } + + // input buffer can hold all of stdin, so no size checking + size_t Read( char *data ) { + if ( feof( stdin ) ) + return 0; + + name = data; + fgets_unlocked( name, 128, stdin ); + mark = chunkBase = name + strlen( name ) + 1; + mark[lineLength] = -1; + + while ( fgets_unlocked( mark, 128, stdin ) ) { + if ( *mark == '>' ) { + Backup(); + break; + } + + // mark trick should keep us from calling strlen + mark += ( mark[lineLength] != 0xa ) ? strlen( mark ) - 1 : lineLength; + if ( mark - chunkBase > chunkSize ) + NewChunk(); + + mark[lineLength] = -1; + } + + if ( mark - chunkBase ) + NewChunk(); + return ( chunkBase - data ); + } + + struct WorkerState { + Chunker *chunker; + size_t offset, count; + pthread_t handle; + }; + + static void *ReverseWorker( void *arg ) { + WorkerState *state = (WorkerState *)arg; + Chunker &chunker = *state->chunker; + for ( size_t i = 0; i < state->count; i++ ) { + Chunk &chunk = chunker[state->offset + i]; + short *w = (short *)chunk.data, *bot = w, *top = w + ( chunk.size / 2 ) - 1; + for ( ; bot < top; bot++, top-- ) { + short tmp = lookup[*bot]; + *bot = lookup[*top]; + *top = tmp; + } + // if size is odd, final byte would reverse to the start (skip it) + if ( chunk.size & 1 ) + chunk.data++; + } + return 0; + } + + void Reverse() { + if ( !chunks.size() ) + return; + + // this takes so little time it's almost not worth parallelizing + vector2 threads; + threads.reserve( cpus.count ); + size_t divs = chunks.size() / cpus.count; + for ( size_t i = 0, offset = 0; i < cpus.count; i++, offset += divs ) { + threads.push_back( WorkerState() ); + WorkerState &ws = threads.last(); + ws.chunker = this; + ws.count = ( i < cpus.count - 1 ) ? divs : chunks.size() - offset; + ws.offset = offset; + pthread_create( &ws.handle, 0, ReverseWorker, &ws ); + } + + for ( size_t i = 0; i < cpus.count; i++ ) + pthread_join( threads[i].handle, 0 ); + } + + Chunk &operator[] ( size_t i ) { return chunks[i]; } + +protected: + vector2 chunks; + char *name, *chunkBase, *mark; + int id; + static volatile int printQueue; +}; + +// used to order chunk printing +volatile int Chunker::printQueue = 0; + +struct ReverseComplement { + ReverseComplement() { + // get stdin file size + long start = ftell( stdin ); + fseek( stdin, 0, SEEK_END ); + size = ftell( stdin ) - start; + fseek( stdin, start, SEEK_SET ); + + data = new char[size + 3]; + } + + ~ReverseComplement() { + delete[] data; + } + + static void *ChunkerThread( void *arg ) { + Chunker *chunker = (Chunker *)arg; + chunker->Reverse(); + chunker->Print(); + return 0; + } + + void Run() { + vector2 chunkers; + vector2 threads; + + size_t cur = 0; + for ( int id = 1; true; id++ ) { + chunkers.push_back( new Chunker( id ) ); + + size_t read = chunkers.last()->Read( data + cur ); + cur += read; + if ( !read ) + break; + + // spawn off a thread to finish this guy up while we read another chunk in + threads.push_back( 0 ); + pthread_create( &threads.last(), 0, ChunkerThread, chunkers.last() ); + } + + for ( size_t i = 0; i < threads.size(); i++ ) + pthread_join( threads[i], 0 ); + + for ( size_t i = 0; i < chunkers.size(); i++ ) + delete chunkers[i]; + } + + +protected: + size_t size; + char *data; +}; + + +int main( int argc, const char *argv[] ) { + ReverseComplement revcom; + revcom.Run(); + return 0; +} diff --git a/cpp_src/spectralnorm.c++ b/cpp_src/spectralnorm.c++ new file mode 100644 index 0000000..36f9bfa --- /dev/null +++ b/cpp_src/spectralnorm.c++ @@ -0,0 +1,136 @@ +// The Computer Language Benchmarks Game +// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ +// +// Original C contributed by Sebastien Loisel +// Conversion to C++ by Jon Harrop +// OpenMP parallelize by The Anh Tran +// Add SSE by The Anh Tran +// Additional SSE optimization by Krzysztof Jakubowski + +// g++ -pipe -O3 -march=native -fopenmp -mfpmath=sse -msse2 \ +// ./spec.c++ -o ./spec.run + +#include +#include +#include +#include +#include +#include + +template int Index(int i, int j) { + return (((i + j) * (i + j + 1)) >> 1) + (modei? i : j) + 1; +} + +template +void EvalPart(double *__restrict__ src, double *__restrict__ dst, + int begin, int end, int length) { + int i = begin; + + for(; i + 1 < end; i += 2) { + __m128d sum = _mm_set_pd( + src[0] / double(Index(i + 1, 0)), + src[0] / double(Index(i + 0, 0))); + + __m128d ti = modei? + _mm_set_pd(i + 1, i + 0) : + _mm_set_pd(i + 2, i + 1); + __m128d last = _mm_set_pd( + Index(i + 1, 0), + Index(i + 0, 0)); + + for(int j = 1; j < length; j++) { + __m128d idx = last + ti + _mm_set1_pd(j); + last = idx; + sum = sum + _mm_set1_pd(src[j]) / idx; + } + + _mm_storeu_pd(dst + i + 0, sum); + } + for(; i < end; i++) { + double sum = 0; + for (int j = 0; j < length; j++) + sum += src[j] / double(Index(i, j)); + dst[i] = sum; + } + +} + +void EvalATimesU(double *src, double *dst, int begin, int end, int N) { + EvalPart<1>(src, dst, begin, end, N); +} + +void EvalAtTimesU(double *src, double *dst, int begin, int end, int N) { + EvalPart<0>(src, dst, begin, end, N); +} + +void EvalAtATimesU(double *src, double *dst, double *tmp, + int begin, int end, int N) { + EvalATimesU (src, tmp, begin, end, N); + #pragma omp barrier + EvalAtTimesU(tmp, dst, begin, end, N); + #pragma omp barrier +} + +int GetThreadCount() { + cpu_set_t cs; + CPU_ZERO(&cs); + sched_getaffinity(0, sizeof(cs), &cs); + + int count = 0; + for (int i = 0; i < CPU_SETSIZE; ++i) + if (CPU_ISSET(i, &cs)) + ++count; + + return count; +} + +double spectral_game(int N) { + __attribute__((aligned(16))) double u[N]; + __attribute__((aligned(16))) double v[N], tmp[N]; + + double vBv = 0.0; + double vv = 0.0; + + #pragma omp parallel default(shared) num_threads(GetThreadCount()) + { + // this block will be executed by NUM_THREADS + // variable declared in this block is private for each thread + int threadid = omp_get_thread_num(); + int threadcount = omp_get_num_threads(); + int chunk = N / threadcount; + + // calculate each thread's working range [r1 .. r2) => static schedule + int begin = threadid * chunk; + int end = (threadid < (threadcount -1)) ? (begin + chunk) : N; + + for(int i = begin; i < end; i++) + u[i] = 1.0; + #pragma omp barrier + + for (int ite = 0; ite < 10; ++ite) { + EvalAtATimesU(u, v, tmp, begin, end, N); + EvalAtATimesU(v, u, tmp, begin, end, N); + } + + double sumvb = 0.0, sumvv = 0.0; + for (int i = begin; i < end; i++) { + sumvv += v[i] * v[i]; + sumvb += u[i] * v[i]; + } + + #pragma omp critical + { + vBv += sumvb; + vv += sumvv; + } + } + + return sqrt(vBv / vv); +} + +int main(int argc, char *argv[]) { + int N = ((argc >= 2) ? atoi(argv[1]) : 2000); + printf("%.9f\n", spectral_game(N)); + return 0; +} + diff --git a/fannkuchredux/cmain.jl b/fannkuchredux/cmain.jl new file mode 100644 index 0000000..40b8553 --- /dev/null +++ b/fannkuchredux/cmain.jl @@ -0,0 +1,180 @@ +# The Computer Language Benchmarks Game +# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + +# based on Oleg Mazurov's Java Implementation and Jeremy Zerfas' C implementation +# transliterated by Hamza Yusuf Çakır + +global const preferred_num_blocks = 24 + +struct Fannkuch + n::Int64 + blocksz::Int64 + maxflips::Vector{Int32} + chksums::Vector{Int32} + + function Fannkuch(n, nthreads) + nfact = factorial(n) + + blocksz = nfact ÷ (nfact < preferred_num_blocks ? 1 : preferred_num_blocks) + maxflips = zeros(Int32, nthreads) + chksums = zeros(Int32, nthreads) + + new(n, blocksz, maxflips, chksums) + end +end + +struct Perm + p::Vector{Int8} + pp::Vector{Int8} + count::Vector{Int32} + + function Perm(n) + p = zeros(Int8, n) + pp = zeros(Int8, n) + count = zeros(Int32, n) + + new(p, pp, count) + end +end + +Base.@propagate_inbounds @inline function first_permutation(perm::Perm, idx) + p = perm.p + pp = perm.pp + + for i = 2:length(p) + p[i] = i - 1 + end + + for i = length(p):-1:2 + ifact = factorial(i-1) + d = idx ÷ ifact + perm.count[i] = d + idx = idx % ifact + + for j = 1:i + pp[j] = p[j] + end + + for j = 1:i + p[j] = j+d <= i ? pp[j+d] : pp[j+d-i] + end + end +end + +Base.@propagate_inbounds @inline function next_permutation(perm::Perm) + p = perm.p + count = perm.count + + first = p[2] + p[2] = p[1] + p[1] = first + + i = 2 + while count[i] >= i - 1 + count[i] = 0 + + next = p[1] = p[2] + + for j = 1:i + p[j] = p[j+1] + end + + i += 1 + p[i] = first + first = next + end + count[i] += 1 + nothing +end + +Base.@propagate_inbounds @inline function count_flips(perm::Perm) + p = perm.p + pp = perm.pp + + flips = Int32(1) + + first = p[1] + 1 + + if p[first] != 0 + + unsafe_copyto!(pp, 2, p, 2, length(p) - 1) + + while true + flips += one(flips) + new_first = pp[first] + pp[first] = first - 1 + + if first > 3 + lo = 2; hi = first - 1 + # see the note in Jeremy Zerfas' C implementation for + # this loop + for k = 0:13 + t = pp[lo] + pp[lo] = pp[hi] + pp[hi] = t + (hi < lo + 3) && break + lo += 1 + hi -= 1 + end + end + + first = new_first + 1 + pp[first] == 0 && break + end + end + + return flips +end + +Base.@propagate_inbounds function run_task(f::Fannkuch, perm::Perm, idxmin, idxmax) + maxflips = Int32(0) + chksum = Int32(0) + + i = idxmin + while true + if perm.p[1] != 0 + flips = count_flips(perm) + maxflips = max(maxflips, flips) + chksum += iseven(i) ? flips : -flips + end + i != idxmax || break + i += 1 + next_permutation(perm) + end + + id = Threads.threadid() + f.maxflips[id] = max(f.maxflips[id], maxflips) + f.chksums[id] += chksum + nothing +end + +function runf(f::Fannkuch) + factn = factorial(f.n) + + Threads.@threads for idxmin = 0:f.blocksz:factn-1 + perm = Perm(f.n) + @inbounds first_permutation(perm, idxmin) + idxmax = idxmin + f.blocksz - 1 + @inbounds run_task(f, perm, idxmin, idxmax) + end +end + +function fannkuchredux(n) + f = Fannkuch(n, Threads.nthreads()) + + runf(f) + + # reduce results + chk = sum(f.chksums) + res = maximum(f.maxflips) + + println(chk, "\nPfannkuchen(", n, ") = ", res) +end +Base.@ccallable function julia_main(ARGS::Vector{String})::Cint + + n = parse(Int, ARGS[1]) + fannkuchredux(n) + return 0 +end + +julia_main(["12"]) diff --git a/fasta/cmain.jl b/fasta/cmain.jl new file mode 100644 index 0000000..a33dc7e --- /dev/null +++ b/fasta/cmain.jl @@ -0,0 +1,95 @@ +# The Computer Language Benchmarks Game +# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + +const line_width = 60 + +const alu = string( + "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG", + "GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA", + "CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT", + "ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA", + "GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG", + "AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC", + "AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA") + +const iub1 = b"acgtBDHKMNRSVWY" +const iub2 = [0.27, 0.12, 0.12, 0.27, 0.02,0.02, 0.02, 0.02, 0.02, 0.02,0.02, 0.02, 0.02, 0.02, 0.02] + +const homosapiens1 = b"acgt" +const homosapiens2 = [0.3029549426680, 0.1979883004921,0.1975473066391, 0.3015094502008] + +const IM = Int32(139968) +const IA = Int32(3877) +const IC = Int32(29573) +const state = Ref(Int32(42)) +gen_random() = (state[] = ((state[] * IA) + IC) % IM) + +function repeat_fasta(io, src, n) + k = length(src) + s = string(src, src, src[1:(n % k)]) + I = Iterators.cycle(src) + col = 1 + count = 1 + c, state = iterate(I) + write(io, c % UInt8) + while count < n + col += 1 + c, state = iterate(I, state) + write(io, c % UInt8) + if col == line_width + write(io, '\n') + col = 0 + end + count += 1 + end + write(io, '\n') + return +end + +function choose_char(cs) + k = length(cs) + r = gen_random() / IM + r < cs[1] && return 1 + a = 1 + b = k + while b > a + 1 + c = fld(a + b, 2) + if r < cs[c] + b = c + else + a = c + end + end + return b +end + +function random_fasta(io, symb, pr, n) + cs = cumsum(pr) + k = n + while k > 0 + m = min(k, line_width) + @inbounds for i = 1:m + write(io, symb[choose_char(cs)]) + end + write(io, '\n'%UInt8) + k -= line_width + end + return +end + +function perf_fasta(n=25000000, io = stdout) + write(io, ">ONE Homo sapiens alu\n") + repeat_fasta(io, alu, 2n) + + write(io, ">TWO IUB ambiguity codes\n") + random_fasta(io, iub1, iub2, 3n) + write(io, ">THREE Homo sapiens frequency\n") + random_fasta(io, homosapiens1, homosapiens2, 5n) +end +Base.@ccallable function julia_main(ARGS::Vector{String})::Cint + n = parse(Int, ARGS[1]) + perf_fasta(n) + return 0 +end + +perf_fasta(25000, IOBuffer()) diff --git a/fasta/fasta-fast.jl b/fasta/fasta-fast.jl index c7c1822..def6b92 100644 --- a/fasta/fasta-fast.jl +++ b/fasta/fasta-fast.jl @@ -3,14 +3,7 @@ const line_width = 60 -const alu = string( - "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG", - "GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA", - "CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT", - "ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA", - "GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG", - "AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC", - "AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA") +const alu = b"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA" const iub1 = b"acgtBDHKMNRSVWY" const iub2 = [0.27, 0.12, 0.12, 0.27, 0.02,0.02, 0.02, 0.02, 0.02, 0.02,0.02, 0.02, 0.02, 0.02, 0.02] @@ -31,18 +24,18 @@ function repeat_fasta(io, src, n) col = 1 count = 1 c, state = iterate(I) - write(io, c % UInt8) + write(io, c) while count < n col += 1 c, state = iterate(I, state) - write(io, c % UInt8) + write(io, c) if col == line_width - write(io, '\n') + write(io, '\n' % UInt8) col = 0 end count += 1 end - write(io, '\n') + write(io, '\n' % UInt8) return end @@ -77,7 +70,7 @@ function random_fasta(io, symb, pr, n) return end -function perf_fasta(_n=25000000, io = stdout) +function perf_fasta(n=25000000, io = stdout) write(io, ">ONE Homo sapiens alu\n") repeat_fasta(io, alu, 2n) @@ -87,5 +80,5 @@ function perf_fasta(_n=25000000, io = stdout) random_fasta(io, homosapiens1, homosapiens2, 5n) end -n = parse(Int,ARGS[1]) -perf_fasta(n) +# n = parse(Int,ARGS[1]) +@time perf_fasta(25000000, IOBuffer()) diff --git a/jl_build/.gitignore b/jl_build/.gitignore new file mode 100644 index 0000000..2a928ab --- /dev/null +++ b/jl_build/.gitignore @@ -0,0 +1,13 @@ +image +image.jl +*-input.txt +cdriver/binarytrees.c +cdriver/fannkuchredux.c +cdriver/fasta.c +cdriver/knucleotide.c +cdriver/mandelbrot.c +cdriver/nbody.c +cdriver/pidigits.c +cdriver/regexredux.c +cdriver/revcomp.c +cdriver/spectralnorm.c diff --git a/jl_build/cdriver/program.c b/jl_build/cdriver/program.c new file mode 100644 index 0000000..67a6d0a --- /dev/null +++ b/jl_build/cdriver/program.c @@ -0,0 +1,54 @@ +// This file is a part of Julia. License is MIT: http://julialang.org/license + +// Standard headers +#include +#include + +// Julia headers (for initialization and gc commands) +#include "uv.h" +#include "julia.h" + +#ifdef JULIA_DEFINE_FAST_TLS // only available in Julia v0.7 and above +JULIA_DEFINE_FAST_TLS() +#endif + +// Declare C prototype of a function defined in Julia +extern int julia_main(jl_array_t*); + +// main function (windows UTF16 -> UTF8 argument conversion code copied from julia's ui/repl.c) +int main(int argc, char *argv[]) +{ + int retcode; + int i; + uv_setup_args(argc, argv); // no-op on Windows + + // initialization + libsupport_init(); + + // jl_options.compile_enabled = JL_OPTIONS_COMPILE_OFF; + // JULIAC_PROGRAM_LIBNAME defined on command-line for compilation + jl_options.image_file = JULIAC_PROGRAM_LIBNAME; + julia_init(JL_IMAGE_JULIA_HOME); + + // Initialize Core.ARGS with the full argv. + jl_set_ARGS(argc, argv); + + // Set PROGRAM_FILE to argv[0]. + jl_set_global(jl_base_module, + jl_symbol("PROGRAM_FILE"), (jl_value_t*)jl_cstr_to_string(argv[0])); + + // Set Base.ARGS to `String[ unsafe_string(argv[i]) for i = 1:argc ]` + jl_array_t *ARGS = (jl_array_t*)jl_get_global(jl_base_module, jl_symbol("ARGS")); + jl_array_grow_end(ARGS, argc - 1); + for (i = 1; i < argc; i++) { + jl_value_t *s = (jl_value_t*)jl_cstr_to_string(argv[i]); + jl_arrayset(ARGS, s, i - 1); + } + + // call the work function, and get back a value + retcode = julia_main(ARGS); + + // Cleanup and gracefully exit + jl_atexit_hook(retcode); + return retcode; +} diff --git a/jl_build/make.jl b/jl_build/make.jl new file mode 100644 index 0000000..676097c --- /dev/null +++ b/jl_build/make.jl @@ -0,0 +1,60 @@ +using PackageCompiler +using Libdl + +benchmarks = [ + ("binarytrees"), + ("fannkuchredux"), + ("fasta"), + ("knucleotide"), + ("mandelbrot"), + ("nbody"), + ("pidigits"), + ("regexredux"), + ("revcomp"), + ("spectralnorm"), +] + +cc_flags = String[] +if Sys.iswindows() + using WinRPM + push!(cc_flags, "-I" * joinpath(WinRPM.installdir, "usr", "$(Sys.ARCH)-w64-mingw32", "sys-root", "mingw", "include")) +end + +# Concat all cmains into one shared image (speeds up compilation a lot)! +@info "creating one big image file" +open(joinpath(@__DIR__, "image.jl"), "w") do io + for bench in benchmarks + input = joinpath(@__DIR__, "..", bench, string(bench, "-input.txt")) + isfile(input) && cp(input, joinpath(@__DIR__, string(bench, "-input.txt")), force = true) + jlfile = joinpath(@__DIR__, "..", bench, "cmain.jl") + println(io, "module ", titlecase(bench)) + print(io, replace(read(jlfile, String), "julia_main" => string(bench, "_main"))) + println(io, "export ", string(bench, "_main")) + println(io, "end\nusing .", titlecase(bench)) + println(io) + end +end +# build the image/shared library +@info "building shared image file" + +build_shared_lib( + joinpath(@__DIR__, "image.jl"), "bench_image", # Julia script containing a `julia_main` function, e.g. like `examples/hello.jl` + builddir = joinpath(@__DIR__, "image"), # that's where the compiled artifacts will end up [optional] +) + +# Create an executable for each benchmark, linking into the image +@info "building executables" +mkpath(joinpath(@__DIR__, "cdriver")) +for bench in benchmarks + cprog = joinpath(@__DIR__, "cdriver", "program.c") + cfile = joinpath(@__DIR__, "cdriver", string(bench, ".c")) + open(cfile, "w") do io + print(io, replace(read(cprog, String), "julia_main" => string(bench, "_main"))) + end + PackageCompiler.build_exec( + bench, cfile, + escape_string(joinpath(@__DIR__, "image", "bench_image.$(Libdl.dlext)")), + joinpath(@__DIR__, "image"), + true, "3", false, PackageCompiler.system_compiler, cc_flags + ) +end diff --git a/knucleotide/cmain.jl b/knucleotide/cmain.jl new file mode 100644 index 0000000..e38558d --- /dev/null +++ b/knucleotide/cmain.jl @@ -0,0 +1,161 @@ +# Th# The Computer Language Benchmarks Game +# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ +# +# contributed by David Campbell +# based on the Go version +# modified by Jarrett Revels, Alex Arslan, Yichao Yu +# +# Bit-twiddle optimizations added by Kristoffer Carlsson + +using Printf + +const NucleotideLUT = zeros(UInt8, 256) +NucleotideLUT['A'%UInt8] = 0 +NucleotideLUT['C'%UInt8] = 1 +NucleotideLUT['G'%UInt8] = 2 +NucleotideLUT['T'%UInt8] = 3 +NucleotideLUT['a'%UInt8] = 0 +NucleotideLUT['c'%UInt8] = 1 +NucleotideLUT['g'%UInt8] = 2 +NucleotideLUT['t'%UInt8] = 3 + +struct KNucleotides{L, T} + i::T +end +Base.hash(kn::KNucleotides, h::UInt64) = hash(kn.i, h) +Base.isequal(kn1::KNucleotides, kn2::KNucleotides) = kn1.i == kn2.i +Base.show(io::IO, kn::KNucleotides) = print(io, '[', string(kn), ']') +function determine_inttype(l) + l <= 4 && return UInt8 + l <= 8 && return UInt16 + l <= 16 && return UInt32 + l <= 32 && return UInt64 + error("invalid length") +end + +function KNucleotides{L, T}(str::String) where {L, T} + i = T(0) + @inbounds for j in 1:L + b = codeunit(str, j) + i = (i << 2) | NucleotideLUT[b] + end + return KNucleotides{L, T}(i) +end + +@inline function shift(kn::KNucleotides{L, T}, c::UInt8) where {L, T} + i = kn.i + i &= (~(3 << 2(L-1)) % T) + KNucleotides{L, T}((i << T(2)) | @inbounds NucleotideLUT[c]) +end + +function Base.string(kn::KNucleotides{L}) where {L} + sprint() do io + for j in 1:L + mask = 3 << (2(L-j)) + z = (kn.i & mask) >> 2(L-j) + write(io, + z == 0 ? 'A' : + z == 1 ? 'C' : + z == 2 ? 'G' : + z == 3 ? 'T' : + error()) + end + end +end + +function count_data(data::String, ::Type{KNucleotides{L, T}}) where {L, T} + counts = Dict{KNucleotides{L, T}, Int}() + kn = KNucleotides{L, T}(data) + counts[kn] = 1 + @inbounds for offset = (L+1):sizeof(data) + c = codeunit(data, offset) + kn = shift(kn, c) + token = Base.ht_keyindex2!(counts, kn) + if token > 0 + counts.vals[token] += 1 + else + Base._setindex!(counts, 1, kn, -token) + end + end + return counts +end + +function count_one(data::String, s::String) + L = length(s) + K = KNucleotides{L, determine_inttype(L)} + d = count_data(data, K) + return get(d, K(s), 0) +end + +struct KNuc + name::String + count::Int +end + +# sort down +function Base.isless(x::KNuc, y::KNuc) + if x.count == y.count + return x.name > y.name + end + x.count > y.count +end + +function sorted_array(m) + kn = Vector{KNuc}(undef, length(m)) + i = 1 + for (k, v) in m + kn[i] = KNuc(string(k), v) + i += 1 + end + sort!(kn) +end + +do_work(str::String, i::Int) = sorted_array(count_data(str, KNucleotides{i, determine_inttype(i)})) +do_work(str::String, i::String) = count_one(str, i) + +function print_knucs(io, a::Array{KNuc, 1}) + sum = 0 + for kn in a + sum += kn.count + end + for kn in a + @printf(io, "%s %.3f\n", kn.name, 100.0kn.count/sum) + end + println(io) +end + +function perf_k_nucleotide(io = stdin, output = stdout) + three = ">THREE " + while true + line = readline(io) + if startswith(line, three) + break + end + end + str = sprint() do sio + foreach(line-> write(sio, line), eachline(io)) + end + vs = (1, 2, "GGT", "GGTA", "GGTATT", "GGTATTTTAATT", "GGTATTTTAATTTATAGT") + results = Vector{Any}(undef, length(vs)) + Threads.@threads for i in length(vs):-1:1 + results[i] = do_work(str, vs[i]) + end + + for (v, result) in zip(vs, results) + if result isa Array + print_knucs(output, result) + end + if result isa Int + @printf(output, "%d\t%s\n", result, v) + end + end +end + +Base.@ccallable function julia_main(ARGS::Vector{String})::Cint + perf_k_nucleotide() + return 0 +end + +open(joinpath(@__DIR__, "knucleotide-input.txt")) do io + perf_k_nucleotide(io, IOBuffer()) +end diff --git a/knucleotide/knucleotide-fast.jl b/knucleotide/knucleotide-fast.jl index 12df400..fa93d84 100644 --- a/knucleotide/knucleotide-fast.jl +++ b/knucleotide/knucleotide-fast.jl @@ -10,9 +10,6 @@ using Distributed using Printf -addprocs(4) - -@everywhere begin const NucleotideLUT = zeros(UInt8, 256) NucleotideLUT['A'%UInt8] = 0 NucleotideLUT['C'%UInt8] = 1 @@ -117,8 +114,6 @@ end do_work(str::String, i::Int) = sorted_array(count_data(str, KNucleotides{i,determine_inttype(i)})) do_work(str::String, i::String) = count_one(str, i) -end # @everywhere - function print_knucs(a::Array{KNuc, 1}) sum = 0 for kn in a @@ -141,16 +136,10 @@ function perf_k_nucleotide(io = stdin) data = read(io, String) str = filter(!isequal('\n'), data) - vs = [1, 2, "GGT", "GGTA", "GGTATT", "GGTATTTTAATT", "GGTATTTTAATTTATAGT"] + vs = (1, 2, "GGT", "GGTA", "GGTATT", "GGTATTTTAATT", "GGTATTTTAATTTATAGT") results = Vector{Any}(undef, length(vs)) - order = collect(enumerate(vs)) - - @sync for w in workers() - @async while !isempty(order) - v = pop!(order) - r = remotecall_fetch(do_work, w, str, v[2]) - results[v[1]] = r - end + for (i, v) in enumerate(vs) + results[i] = do_work(w, str, v[2]) end for (v, result) in zip(vs, results) diff --git a/mandelbrot/cmain.jl b/mandelbrot/cmain.jl new file mode 100644 index 0000000..e814b6a --- /dev/null +++ b/mandelbrot/cmain.jl @@ -0,0 +1,80 @@ +#= +The Computer Language Benchmarks Game + https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + direct transliteration of the swift#3 program by Ralph Ganszky and Daniel Muellenborn: + https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-swift-3.html + modified for Julia 1.0 by Simon Danisch +=# +const zerov8 = ntuple(x-> 0f0, 8) + +@inline function step_mandel(Zr,Zi,Tr,Ti,cr,ci) + Zi = 2f0 .* Zr .* Zi .+ ci + Zr = Tr .- Ti .+ cr + Tr = Zr .* Zr + Ti = Zi .* Zi + return Zr,Zi,Tr,Ti +end + +# Calculate mandelbrot set for one Vec8 into one byte +Base.@propagate_inbounds function mand8(cr, ci) + Zr = zerov8 + Zi = zerov8 + Tr = zerov8 + Ti = zerov8 + t = zerov8 + i = 0 + + while i<50 + for _ in 1:5 + Zr,Zi,Tr,Ti = step_mandel(Zr,Zi,Tr,Ti,cr,ci) + i += 1 + end + t = Tr .+ Ti + all(x-> x > 4f0, t) && (return 0x00) + end + byte = 0xff + t[1] <= 4.0 || (byte &= 0b01111111) + t[2] <= 4.0 || (byte &= 0b10111111) + t[3] <= 4.0 || (byte &= 0b11011111) + t[4] <= 4.0 || (byte &= 0b11101111) + t[5] <= 4.0 || (byte &= 0b11110111) + t[6] <= 4.0 || (byte &= 0b11111011) + t[7] <= 4.0 || (byte &= 0b11111101) + t[8] <= 4.0 || (byte &= 0b11111110) + return byte +end + +function mandel_inner(rows, ci, y, N, xvals) + @simd for x in 1:8:N + @inbounds begin + cr = ntuple(i-> xvals[x + (i - 1)], 8) + rows[((y-1)*N÷8+(x-1)÷8) + 1] = mand8(cr, ci) + end + end +end + +function mandelbrot(n = 200, io = stdout) + inv_ = 2.0 / n + N = n + xvals = zeros(Float32, n) + yvals = zeros(Float32, n) + Threads.@threads for i in 0:(N-1) + @inbounds xvals[i + 1] = i * inv_ - 1.5 + @inbounds yvals[i + 1] = i * inv_ - 1.0 + end + rows = zeros(UInt8, n*N÷8) + Threads.@threads for y in 1:N + @inbounds ci = yvals[y] + mandel_inner(rows, ci, y, N, xvals) + end + write(io, "P4\n$n $n\n") + write(io, rows) +end + +Base.@ccallable function julia_main(ARGS::Vector{String})::Cint + n = parse(Int, ARGS[1]) + mandelbrot(n, stdout) + return 0 +end + +julia_main(["8"]) diff --git a/nbody/cmain.jl b/nbody/cmain.jl new file mode 100644 index 0000000..d58b226 --- /dev/null +++ b/nbody/cmain.jl @@ -0,0 +1,137 @@ +# The Computer Language Benchmarks Game +# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + +# based on the Java version + +module NBody + +using Printf +using LinearAlgebra + +# Constants +const solar_mass = 4π^2 +const days_per_year = 365.24 + +# Use a 4-tuple here to get better SIMD instructions. +# This is a space-time tradeoff, but this benchmark is well within L1 cache limits. +struct Vec3 + x::NTuple{4, Float64} +end +Vec3(x, y, z) = Vec3((x,y,z,0.0)) +Base.:/(v::Vec3, n::Number) = Vec3(1/n .* v.x) +Base.:*(v::Vec3, n::Number) = Vec3(n .* v.x) +Base.:-(v1::Vec3, v2::Vec3) = Vec3(v1.x .- v2.x) +Base.:+(v1::Vec3, v2::Vec3) = Vec3(v1.x .+ v2.x) +# Todo, prettify +squarednorm(v1::Vec3) = v1.x[1]^2 + v1.x[2]^2 + v1.x[3]^2 +Base.muladd(x::Vec3, y::Number, z::Vec3) = Vec3(muladd.(x.x, y, z.x)) + +# A heavenly body in the system +mutable struct Body + pos::Vec3 + v::Vec3 + mass::Float64 +end + +function offset_momentum!(b::Body, p::Vec3) + b.v -= p / solar_mass +end + +function init_sun!(bodies::Vector{Body}) + p = Vec3(0.0, 0.0, 0.0) + for b in bodies + p += b.v * b.mass + end + offset_momentum!(bodies[1], p) +end + +function advance(bodies::Vector{Body}, dt::Number) + @inbounds for i = 1:length(bodies) + bi = bodies[i] + for j = i+1:length(bodies) + bj = bodies[j] + delta = bi.pos - bj.pos + dsq = squarednorm(delta) + distance = sqrt(dsq) + mag = dt / (dsq * distance) + bi.v = muladd(delta, -(bj.mass * mag), bi.v) + bj.v = muladd(delta, (bi.mass * mag), bj.v) + end + end + + for b in bodies + b.pos = muladd(b.v, dt, b.pos) + end +end + +function energy(bodies::Vector{Body}) + e = 0.0 + @inbounds for i = 1:length(bodies) + bi = bodies[i] + e += 0.5 * bi.mass * squarednorm(bi.v) + for j = i+1:length(bodies) + bj = bodies[j] + delta = bi.pos - bj.pos + distance = sqrt(squarednorm(delta)) + dinv = 1.0 / distance + e = muladd((bi.mass * bj.mass), -dinv, e) + end + end + return e +end + + +function perf_nbody(N::Int=1000) + jupiter = Body( Vec3(4.84143144246472090e+00, # pos[1] = x + -1.16032004402742839e+00, # pos[2] = y + -1.03622044471123109e-01), # pos[3] = z + Vec3(1.66007664274403694e-03 * days_per_year, # v[1] = vx + 7.69901118419740425e-03 * days_per_year, # v[2] = vy + -6.90460016972063023e-05 * days_per_year), # v[3] = vz + 9.54791938424326609e-04 * solar_mass) # mass + + saturn = Body(Vec3(8.34336671824457987e+00, + 4.12479856412430479e+00, + -4.03523417114321381e-01), + Vec3(-2.76742510726862411e-03 * days_per_year, + 4.99852801234917238e-03 * days_per_year, + 2.30417297573763929e-05 * days_per_year), + 2.85885980666130812e-04 * solar_mass) + + uranus = Body(Vec3(1.28943695621391310e+01, + -1.51111514016986312e+01, + -2.23307578892655734e-01), + Vec3(2.96460137564761618e-03 * days_per_year, + 2.37847173959480950e-03 * days_per_year, + -2.96589568540237556e-05 * days_per_year), + 4.36624404335156298e-05 * solar_mass) + + neptune = Body(Vec3(1.53796971148509165e+01, + -2.59193146099879641e+01, + 1.79258772950371181e-01), + Vec3(2.68067772490389322e-03 * days_per_year, + 1.62824170038242295e-03 * days_per_year, + -9.51592254519715870e-05 * days_per_year), + 5.15138902046611451e-05 * solar_mass) + + sun = Body(Vec3(0.0, 0.0, 0.0), Vec3(0.0, 0.0, 0.0), solar_mass) + + bodies = [sun, jupiter, saturn, uranus, neptune] + + init_sun!(bodies) + @printf("%.9f\n", energy(bodies)) + for i = 1:N + advance(bodies, 0.01) + end + @printf("%.9f\n", energy(bodies)) +end + +end + +Base.@ccallable function julia_main(ARGS::Vector{String})::Cint + n = parse(Int,ARGS[1]) + NBody.perf_nbody(n) + return 0 +end + +julia_main(["50000"]) diff --git a/pidigits/cmain.jl b/pidigits/cmain.jl new file mode 100644 index 0000000..49bbe1a --- /dev/null +++ b/pidigits/cmain.jl @@ -0,0 +1,76 @@ +#= +The Computer Language Benchmarks Game + https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + based on the C++ program of Alessandro Power + https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/pidigits-gpp-4.html + with ideas borrowed from the C program of Mr Ledrug + https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/pidigits-gcc-1.html + modified for Julia 1.0 by Luiz M. Faria +=# + +# import a few functions from GMP wrapper to do in-place operations +using Base.GMP.MPZ: add_ui!, mul_ui!, add!, tdiv_q! +# and add a few more wrappers for GMP functions not currently in Base +const mpz_t = Ref{BigInt} +addmul_ui!(x::BigInt,a::BigInt,b) = ccall((:__gmpz_addmul_ui,"libgmp"), Cvoid, (mpz_t,mpz_t,Culong), x, a, b) +submul_ui!(x::BigInt,a::BigInt,b) = ccall((:__gmpz_submul_ui,"libgmp"), Cvoid, (mpz_t,mpz_t,Culong), x, a, b) +get_ui(x::BigInt) = ccall((:__gmpz_get_ui,"libgmp"), Culong, (mpz_t,), x) + +function next!(q,r,t,k) + mult = 2*k + 1 + addmul_ui!(r,q,2) + mul_ui!(r,mult) + mul_ui!(t,mult) + mul_ui!(q,k) + return nothing +end + +function extract!(q,r,t,tmp1,tmp2,n) + mul_ui!(tmp1,q,n) + add!(tmp1,r) + tdiv_q!(tmp2,tmp1,t) + return get_ui(tmp2) +end + +function produce!(q,r,t,n) + mul_ui!(q,10) + submul_ui!(r,t,n) + mul_ui!(r,10) + return nothing +end + +function pidigits(n::Int) + q,r,t,tmp1,tmp2 = map(BigInt,(1,0,1,0,0)) + k = UInt(0) + n_digits=0 + + while n_digitsr && continue + + digit = extract!(q,r,t,tmp1,tmp2,3) + if digit == extract!(q,r,t,tmp1,tmp2,4) + print(digit) + produce!(q,r,t,digit) + i +=1 + n_digits+=1 + end + end + while i < 10 + i+=1 + print(' ') + end + print("\t:$n_digits\n") + end +end +Base.@ccallable function julia_main(ARGS::Vector{String})::Cint + + n = parse(Int,ARGS[1]) + pidigits(n) + return 0 +end + +julia_main(["10000"]) diff --git a/regexredux/cmain.jl b/regexredux/cmain.jl new file mode 100644 index 0000000..0521439 --- /dev/null +++ b/regexredux/cmain.jl @@ -0,0 +1,61 @@ +# The Computer Language Benchmarks Game +# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ +# +# contributed by Daniel Jones +# fixed by David Campbell +# modified by Jarrett Revels, Alex Arslan, Yichao Yu + +using Printf + +const variants = [ + "agggtaaa|tttaccct", + "[cgt]gggtaaa|tttaccc[acg]", + "a[act]ggtaaa|tttacc[agt]t", + "ag[act]gtaaa|tttac[agt]ct", + "agg[act]taaa|ttta[agt]cct", + "aggg[acg]aaa|ttt[cgt]ccct", + "agggt[cgt]aa|tt[acg]accct", + "agggta[cgt]a|t[acg]taccct", + "agggtaa[cgt]|[acg]ttaccct" +] + +const subs = [ + (r"tHa[Nt]", "<4>"), + (r"aND|caN|Ha[DS]|WaS", "<3>"), + (r"a[NSt]|BY", "<2>"), + (r"<[^>]*>", "|"), + (r"\|[^|][^|]*\|", "-") +] + +function perf_regex_dna(io = stdin) + seq = read(io, String) + l1 = length(seq) + + seq = replace(seq, r">.*\n|\n" => "") + l2 = length(seq) + + for v in variants + k = 0 + for m in eachmatch(Regex(v), seq) + k += 1 + end + @printf("%s %d\n", v, k) + end + + for (u, v) in subs + seq = replace(seq, u => v) + end + + println() + println(l1 + 1) # why + 1?? + println(l2) + println(length(seq)) +end +Base.@ccallable function julia_main(ARGS::Vector{String})::Cint + perf_regex_dna() + return 0 +end + +open(joinpath(@__DIR__, "regexredux-input.txt")) do io + perf_regex_dna(io) +end diff --git a/regexredux/regexredux.jl b/regexredux/regexredux.jl index efe8497..3179ad0 100644 --- a/regexredux/regexredux.jl +++ b/regexredux/regexredux.jl @@ -7,7 +7,7 @@ using Printf -const variants = [ +const variants = Regex.([ "agggtaaa|tttaccct", "[cgt]gggtaaa|tttaccc[acg]", "a[act]ggtaaa|tttacc[agt]t", @@ -17,39 +17,81 @@ const variants = [ "agggt[cgt]aa|tt[acg]accct", "agggta[cgt]a|t[acg]taccct", "agggtaa[cgt]|[acg]ttaccct" -] +]) const subs = [ - (r"tHa[Nt]", "<4>"), - (r"aND|caN|Ha[DS]|WaS", "<3>"), - (r"a[NSt]|BY", "<2>"), - (r"<[^>]*>", "|"), - (r"\|[^|][^|]*\|", "-") + (r"tHa[Nt]" => "<4>"), + (r"aND|caN|Ha[DS]|WaS" => "<3>"), + (r"a[NSt]|BY" => "<2>"), + (r"<[^>]*>" => "|"), + (r"\|[^|][^|]*\|" => "-") ] -function perf_regex_dna() - seq = read(stdin, String) - l1 = length(seq) +function replace_fast!(out::IO, str::AbstractString, pat_repl::Pair) + pattern, repl = pat_repl + n = 1 + e = lastindex(str) + i = a = firstindex(str) + r = something(findnext(pattern,str,i), 0) + j, k = first(r), last(r) + while j != 0 + if i == a || i <= k + Base.unsafe_write(out, pointer(str, i), UInt(j - i)) + Base._replace(out, repl, str, r, pattern) + end + if k < j + i = j + j > e && break + k = nextind(str, j) + else + i = k = nextind(str, k) + end + r = something(findnext(pattern,str,k), 0) + r == 0:-1 && break + j, k = first(r), last(r) + n += 1 + end + write(out, SubString(str, i)) +end +function inner(subs, res) + for i in 2:length(subs) + res = replace(res, subs[i]) + end + res +end +function perf_regex_dna(io = stdin, output = stdout) + N = Threads.nthreads() + seq = read(io, String) + l1 = length(seq) seq = replace(seq, r">.*\n|\n" => "") l2 = length(seq) - - for v in variants - k = 0 - for m in eachmatch(Regex(v), seq) + res = Vector{Tuple{Int, Int}}(undef, N) + Threads.@threads for i in 1:length(variants) + k = 0; v = variants[i] + for m in eachmatch(v, seq) k += 1 end - @printf("%s %d\n", v, k) + @inbounds res[Threads.threadid()] = (i, k) end - - for (u, v) in subs - seq = replace(seq, u => v) + nchunk = length(seq) ÷ N + results = zeros(N) + for id in 1:N + chunk = SubString(seq, (id - 1) * nchunk + 1, min(id * nchunk, length(seq))) + resio = IOBuffer(sizehint = nchunk) + for sub in subs + replace_fast!(resio, String(take!(resio)), sub) + end + results[id] += resio.size end - println() - println(l1 + 1) # why + 1?? - println(l2) - println(length(seq)) + println(output) + println(output, l1 + 1) # why + 1?? + println(output, l2) + println(output, length(seq)) end -perf_regex_dna() +# perf_regex_dna() +cd(@__DIR__) +@time perf_regex_dna("../fasta.txt", IOBuffer()); +x = IOBuffer() diff --git a/revcomp/cmain.jl b/revcomp/cmain.jl new file mode 100644 index 0000000..dc8a3e8 --- /dev/null +++ b/revcomp/cmain.jl @@ -0,0 +1,104 @@ +# The Computer Language Benchmarks Game +# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ +# +# contributed by David Campbell +# modified by Jarrett Revels, Kristoffer Carlsson, Alex Arslan + +mutable struct PushVector{T, A<:AbstractVector{T}} <: AbstractVector{T} + v::A + l::Int +end + +PushVector{T}() where {T} = PushVector(Vector{T}(undef, 60), 0) + +Base.IndexStyle(::Type{<:PushVector}) = IndexLinear() +Base.length(v::PushVector) = v.l +Base.size(v::PushVector) = (v.l,) +@inline function Base.getindex(v::PushVector, i) + @boundscheck checkbounds(v, i) + @inbounds v.v[i] +end + +function Base.push!(v::PushVector, i) + v.l += 1 + if v.l > length(v.v) + resize!(v.v, v.l * 2) + end + v.v[v.l] = i + return v +end + +function Base.resize!(v::PushVector, l::Integer) + # Only support shrinking for now, since that is all we need + @assert l <= v.l + v.l = l +end + +const _revcompdata = Dict( + 'A'=> 'T', 'a'=> 'T', + 'C'=> 'G', 'c'=> 'G', + 'G'=> 'C', 'g'=> 'C', + 'T'=> 'A', 't'=> 'A', + 'U'=> 'A', 'u'=> 'A', + 'M'=> 'K', 'm'=> 'K', + 'R'=> 'Y', 'r'=> 'Y', + 'W'=> 'W', 'w'=> 'W', + 'S'=> 'S', 's'=> 'S', + 'Y'=> 'R', 'y'=> 'R', + 'K'=> 'M', 'k'=> 'M', + 'V'=> 'B', 'v'=> 'B', + 'H'=> 'D', 'h'=> 'D', + 'D'=> 'H', 'd'=> 'H', + 'B'=> 'V', 'b'=> 'V', + 'N'=> 'N', 'n'=> 'N', +) + +const revcompdata = zeros(UInt8, 256) +for (k, v) in _revcompdata + revcompdata[k%UInt8] = v%UInt8 +end + +function print_buff(outio, bb) + b = resize!(bb.v, length(bb)) + l = length(b) + length(b) == 0 && return + + br = reverse!(b) + for i = 1:60:l + if i+59 > l + write(outio, @view br[i:end]) + else + write(outio, @view br[i:i+59]) + end + write(outio, '\n') + end +end + +function perf_revcomp(io=stdin, outio = stdout) + buff = PushVector{UInt8}() + while true + line = codeunits(readline(io)) + if isempty(line) + print_buff(outio, buff) + return + elseif first(line) == UInt8('>') + print_buff(outio, buff) + resize!(buff, 0) + write(outio, line) + write(outio, '\n') + else + l = length(line) + @inbounds for c in line + push!(buff, revcompdata[c%Int]) + end + end + end +end +Base.@ccallable function julia_main(ARGS::Vector{String})::Cint + perf_revcomp() + return 0 +end + +open(joinpath(@__DIR__, "revcomp-input.txt")) do io + perf_revcomp(io, IOBuffer()) +end diff --git a/spectralnorm/cmain.jl b/spectralnorm/cmain.jl new file mode 100644 index 0000000..e6cc687 --- /dev/null +++ b/spectralnorm/cmain.jl @@ -0,0 +1,61 @@ +# The Computer Language Benchmarks Game +# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ + +# based on the Javascript program +# optimizations by Kristoffer Carlsson + +using Printf + +A(i,j) = 1.0 / ( (((i+j)*(i+j+1)) >> 1) + i+1) + +function Au!(w, u) + n = length(u) + Threads.@threads for i = 1:n + w[i] = 0 + z = 0.0 + @simd for j = 1:n + @inbounds z += A(i-1, j-1) * u[j] + end + w[i] = z + end +end + +function Atu!(v, w) + n = length(w) + Threads.@threads for i = 1:n + z = 0.0 + @simd for j = 1:n + @inbounds z += A(j-1,i-1) * w[j] + end + v[i] = z + end +end + +function AtAu!(w, v, u) + Au!(w, u) + Atu!(v, w) +end + +function perf_spectralnorm(n::Int=100) + u = ones(Float64, n) + v = zeros(Float64 ,n) + w = zeros(Float64, n) + vv = vBv = 0 + for i = 1:10 + AtAu!(w, v, u) + AtAu!(w, u, v) + end + for i = 1:n + vBv += u[i]*v[i] + vv += v[i]*v[i] + end + return sqrt(vBv/vv) +end +Base.@ccallable function julia_main(ARGS::Vector{String})::Cint + + n = parse(Int,ARGS[1]) + @printf("%.9f\n", perf_spectralnorm(n)) + return 0 +end + +julia_main(["5500"])