Skip to content

Example: diploid.cc

Kevin R. Thornton edited this page Jun 9, 2014 · 1 revision

Taken from fwdpp 0.2.3:

/*! \include diploid.cc
  Simulate a single, finite Wright-Fisher diploid population with mutation, recombination, and no selection.

  This program illustrates many features of fwdpp:
  1.  Custom mutation classes
  2.  Implementing a mutation model (infinitely-many sites)
  3.  Iterating a population through its life cycle
  4.  Outputting a sample in "ms" format
 */

#include <fwdpp/diploid.hh>
#include <Sequence/SimData.hpp>
#include <boost/unordered_set.hpp>

#include <boost/container/list.hpp>
#include <boost/container/vector.hpp>
#include <boost/pool/pool_alloc.hpp>

/*
  We define a new mutation type, derived from the base class KTfwd::mutation_base.
  This type adds a selection coefficient (s), dominance coefficient (h),
  and the generation when the mutation first appeared in the population (g)
 */
struct mutation_with_age : public KTfwd::mutation_base
{
  unsigned g;
  double s,h;
  /*
    The constructor initializes all class data, including that of the base class via a constructor
    call to the base class.
   */
  mutation_with_age(const unsigned & __o,const double & position, const unsigned & count, const bool & isneutral = true)
    : KTfwd::mutation_base(position,count,isneutral),g(__o),s(0.),h(0.)
  {	
  }
};

//Simplify our lives a bit with typedefs

/*
  NOTE: we are using boost containers with boost memory allocators.  
  My experience on my systems is that each of those objects is worth a 5-10% 
  run time speedup compare to the same objects from namespace std.

  fwdpp is compatible with ANY container system as long as the
  one containing gametes conforms to the behavior of std::vector and 
  the one containing mutations conforms to the behavior of std::list 
  (especially w/regards to no pointer invalidation upon insertion/delete!!)
*/

//compiling the code with -DUSE_STANDARD_CONTAINERS will use std::vector and std::list instead of the boost alternatives
typedef mutation_with_age mtype;
#ifndef USE_STANDARD_CONTAINERS
typedef boost::pool_allocator<mtype> mut_allocator;
typedef boost::container::list<mtype,mut_allocator > mlist;
typedef KTfwd::gamete_base<mtype,mlist> gtype;
typedef boost::pool_allocator<gtype> gam_allocator;
typedef boost::container::vector<gtype,gam_allocator > gvector;
#else
typedef std::list<mtype> mlist;
typedef KTfwd::gamete_base<mtype,mlist> gtype;
typedef std::vector<gtype > gvector;
#endif


/*
  We wish to do mutations under the infinitely-many sites assumption.  That means that
  a new mutation cannot appear at any previously-mutated site.  Here, we cheat a little
  and do not allow mutations at any sites that are currently polymorphic.

  We accomplish this via a lookup table of the current mutation positions.  The function object
  KTfwd::equal_eps is used as a replacement for std::operator==(double,double) in order to ensure
  that values differing by <= DBL_EPSILON (~10^-17 on most systems) are not allowed, as they cause problems.
 */
typedef boost::unordered_set<double,boost::hash<double>,KTfwd::equal_eps > lookup_table_type;

/*
  This function generates neutral mutations under the infinitely-many sites assumption
 */
mutation_with_age neutral_mutations_inf_sites(gsl_rng * r,const unsigned & generation,mlist * mutations,
					      lookup_table_type * lookup)
{
  //Generate new mutation position on the interval [0,1)
  double pos = gsl_rng_uniform(r);
  /*
    An alternative implementation of the while loop below would be:
    while( std::find_if( mutations->begin(),mutations->end(),boost::bind(KTfwd::mutation_at_pos(),_1,pos) != mutations->end()) )
    {
      pos = gsl_rng_uniform(r);
    }

    However, that operation is typically much slower, esp. as the mutation rate gets higher
   */
  while( lookup->find(pos) != lookup->end() ) //make sure it doesn't exist in the population
    { 
      pos = gsl_rng_uniform(r);  //if it does, generate a new one
    }
  //update the lookup table
  lookup->insert(pos);

  /*
    The program will abort if the following conditions are true:
    1.  The mutation position that we generated does indeed exist in the population (implying that our hashing scheme has failed).
    2.  AND, the program was NOT compiled with -DNDEBUG

    fwdpp makes extensive internal use of the C-language assert macro (#include <cassert>).  The macro is enabled if a program is 
    compiled like this:
    c++ -o program program.cc

    It is disabled if a program is compiled like this:
    c++ -DNDEBUG program program.cc

    Thus, a programmer may make use of the preprocessor macro NDEBUG to write detailed debugging routines that are easily disabled.
    
    For example:
    #ifndef NDEBUG
    bool condition = false;
    //do some complex checking of your data here.  If all is okay, set condition = true;
    #endif
    assert( condition == true );

    That check may be expensive at run-time, but is easily disabled by recompiling using -DNDEBUG

    An example of such a debugging routine is KTfwd::check_sum
   */
  assert(std::find_if(mutations->begin(),mutations->end(),boost::bind(KTfwd::mutation_at_pos(),_1,pos)) == mutations->end());

  //return constructor call to mutation type
  return mutation_with_age(generation,pos,1,true);
}
 
int main(int argc, char ** argv)
{
  if (argc != 8)
    {
      std::cerr << "Too few arguments\n"
		<< "Usage: diploid N theta rho ngens samplesize nreps seed\n";
      exit(10);
    } 
  int argument=1;
  const unsigned N = atoi(argv[argument++]);           //Number of diploids
  const double theta = atof(argv[argument++]);         //4*n*mutation rate.  Note: mutation rate is per REGION, not SITE!!
  const double rho = atof(argv[argument++]);           //4*n*recombination rate.  Note: recombination rate is per REGION, not SITE!!
  const unsigned ngens = atoi(argv[argument++]);       //Number of generations to simulate
  const unsigned samplesize1 = atoi(argv[argument++]); //Sample size to draw from the population
  int nreps = atoi(argv[argument++]);                  //Number of replicates to simulate
  const unsigned seed = atoi(argv[argument++]);        //Random number seed

  const double mu = theta/double(4*N);                 //per-gamete mutation rate
  
  /*
    littler r is the recombination rate per region per generation.

    Typically, given the above definition of rho, this should be rho/(4*N).

    However, the implementation of KTfwd::recombine treats littler as
    per gamete rather than per-diploid.  

    This is a bit weird, but is transparent to any users of this program,
    and is just an internal detail.
   */
  const double littler = rho/double(8*N);
  
  //Write the command line to stderr
  std::copy(argv,argv+argc,std::ostream_iterator<char*>(std::cerr," "));
  std::cerr << '\n';
  
  //Initiate random number generation system
  gsl_rng * r =  gsl_rng_alloc(gsl_rng_ranlxs2);
  gsl_rng_set(r,seed);

  unsigned twoN = 2*N;

  while(nreps--)
    {
      //the population begins with 1 gamete with no mutations amd initial count 2N
      gvector gametes(1,gtype(twoN));
      gametes.reserve(twoN);
      mlist mutations;  //the population is devoid of mutations
      std::vector<mtype> fixations;  //store mutation that fix.  Passed to KTfwd::remove_fixed_lost
      std::vector<unsigned> fixation_times; //store when they fix.  Passed to KTfwd::remove_fixed_lost
      unsigned generation;
      double wbar;
      lookup_table_type lookup;  //this is our lookup table for the mutation model
      for( generation = 0; generation < ngens; ++generation )
	{
	  //Sample gamete frequencies proportional to fitness/(population mean fitness, aka wbar).  wbar is the return value
	  wbar = KTfwd::sample_diploid(r,&gametes,twoN,
				       /*
					 Fitness is multiplicative over sites.

					 The fitness model takes two gametes as arguments.  
					 The gametes are passed to this function by 
					 KTfwd::sample_diploid, and the _1 and _2 are placeholders for
					 those gametes (see documentation for boost/bind.hpp for details).
					 The 2. means that fitnesses are 1, 1+sh, and 1+2s for genotypes
					 AA, Aa, and aa, respectively, where a is a mutation with
					 selection coefficient s and dominance h, and the fitness of 
					 the diploid is the product of fitness over sites

					 There is no selection in this simulation, but this
					 function is called anyways to illustrate it as multiplicative
					 models are very common in population genetics
				       */
				       boost::bind(KTfwd::multiplicative_diploid(),_1,_2,2.), 
				       /*
					 For each gamete still extant afte sampling,
					 remove the pointers to any mutations that have 
					 been fixed or lost from the population.
					 
					 For more complex models such as quantitative
					 traits evolving towards an optimum, one may not
					 with to remove fixations.  In that case,
					 replace the line below with
					 boost::bind(KTfwd::mutation_remover(),_1,twoN));

					 Under multiplicative fitness and Wright-Fisher sampling
					 proportional to relative fitness, fixed mutations
					 are just a constant applied to everyone's fitness, so we 
					 can remove them, making the simulation faster, etc.
				       */
				       boost::bind(KTfwd::mutation_remover(),_1,0,twoN));       

	  /*
	    Clean up the mutation list to remove any mutations fixed or lost.

	    If you do not want to remove fixed mutations (in the case of models
	    where fixed mutations still matter, such as evolution towards an optimum), 
	    replace the line below with:
	    KTfwd::remove_lost(&mutations);
	  */
	  KTfwd::remove_fixed_lost(&mutations,&fixations,&fixation_times,&lookup,generation,twoN);
	  assert( lookup.size() == mutations.size() );

	  //if all is going well, the (sum of the counts of each gamete) == twoN, otherwise we have a problem and must abort.
	  assert(KTfwd::check_sum(gametes,twoN));



	  //Mutate the gamete pool
	  unsigned nmuts = KTfwd::mutate(r,&gametes,&mutations,mu,
					 /*
					   The mutation model (defined above) will pass each gamete
					   to be mutated to the mutation model function.  Again, _1
					   is used as a placeholder for that gamete.
					 */
					 boost::bind(neutral_mutations_inf_sites,r,generation,_1,&lookup),
					 /*
					   Policy telling KTfwd::mutate how to add mutated gametes into the gamete pool.
					   If mutation results in a new gamete, add that gamete to the 
					   end of gametes. This is always the case under infinitely-many sites,
					   but for other mutation models, mutation may result in a new
					   copy identical to an existing gamete.  If so,
					   that gamete's frequency increases by 1.
					  */
					 boost::bind(KTfwd::push_at_end<gtype,gvector>,_1,_2),
					 /*
					   Policy to insert new mutations at the end of the mutations list
					 */
					 boost::bind(KTfwd::insert_at_end<mtype,mlist>,_1,_2));
	  assert( lookup.size() == mutations.size() );
	  assert(KTfwd::check_sum(gametes,twoN));

	  //Recombination
	  //unsigned nrec = KTfwd::recombine_test_not_inserting_as_much(r, 
	  unsigned nrec = KTfwd::recombine(r, 
					   &gametes,
					   twoN, 
					   littler, 
					   //This is the genetic map: uniform along [0,1)
					   boost::bind( gsl_rng_uniform,r ) );
	  assert(KTfwd::check_sum(gametes,twoN));
	}
      Sequence::SimData sdata;

      std::vector<std::pair<double,std::string> > mslike = KTfwd::ms_sample(r,gametes,samplesize1,twoN,true);

      if(!mslike.empty())
	{
	  sdata.assign(mslike.begin(),mslike.end());
	  std::cout << sdata << '\n';
	}
      else
	{
	  std::cout << "//\nsegsites: 0\n";
	}      
    }
}
Clone this wiki locally