@@ -1807,6 +1807,8 @@ def __init__(
18071807 self .num_samples = variant_data .num_samples
18081808 self .num_threads = num_threads
18091809 self .mmap_temp_file = None
1810+ self .sites_position = None
1811+ self .terminal_position = None
18101812 mmap_fd = - 1
18111813
18121814 genotype_matrix_size = self .max_sites * self .num_samples
@@ -1865,6 +1867,8 @@ def add_sites(self, exclude_positions=None):
18651867 logger .info (f"Starting addition of { self .max_sites } sites" )
18661868 progress = self .progress_monitor .get ("ga_add_sites" , self .max_sites )
18671869 inference_site_id = []
1870+ last_position = 0
1871+
18681872 for variant in self .variant_data .variants (recode_ancestral = True ):
18691873 # If there's missing data the last allele is None
18701874 num_alleles = len (variant .alleles ) - int (variant .alleles [- 1 ] is None )
@@ -1879,6 +1883,7 @@ def add_sites(self, exclude_positions=None):
18791883 and site .ancestral_state is not None
18801884 ):
18811885 use_site = True
1886+ last_position = site .position
18821887 time = site .time
18831888 if tskit .is_unknown_time (time ):
18841889 # Non-variable sites have no obvious freq-as-time values
@@ -1888,12 +1893,18 @@ def add_sites(self, exclude_positions=None):
18881893 if np .isnan (time ):
18891894 use_site = False # Site with meaningless time value: skip inference
18901895 if use_site :
1891- self .ancestor_builder .add_site (time , variant .genotypes )
1896+ self .ancestor_builder .add_site (time , variant .genotypes , terminal = False )
18921897 inference_site_id .append (site .id )
18931898 self .num_sites += 1
18941899 progress .update ()
18951900 progress .close ()
18961901 self .inference_site_ids = inference_site_id
1902+ # Add terminal site at end of sequence
1903+ zeros = np .zeros (self .num_samples , dtype = np .int8 )
1904+ self .ancestor_builder .add_site (tskit .UNKNOWN_TIME , zeros , terminal = True )
1905+ self .num_sites += 1
1906+ self .terminal_position = np .array ([last_position + 1 ], dtype = np .float64 )
1907+
18971908 logger .info ("Finished adding sites" )
18981909
18991910 def _run_synchronous (self , progress ):
@@ -2000,15 +2011,18 @@ def run(self):
20002011 if t not in self .timepoint_to_epoch :
20012012 self .timepoint_to_epoch [t ] = len (self .timepoint_to_epoch ) + 1
20022013 self .ancestor_data = formats .AncestorData (
2003- self .variant_data .sites_position [:][self .inference_site_ids ],
2004- self .variant_data .sequence_length ,
2014+ inference_position = self .variant_data .sites_position [:][
2015+ self .inference_site_ids
2016+ ],
2017+ terminal_position = self .terminal_position ,
2018+ sequence_length = self .variant_data .sequence_length ,
20052019 path = self .ancestor_data_path ,
20062020 ** self .ancestor_data_kwargs ,
20072021 )
20082022 if self .num_ancestors > 0 :
20092023 logger .info (f"Starting build for { self .num_ancestors } ancestors" )
20102024 progress = self .progress_monitor .get ("ga_generate" , self .num_ancestors )
2011- a = np .zeros (self .num_sites , dtype = np .int8 )
2025+ a = np .zeros (self .num_sites - 1 , dtype = np .int8 )
20122026 root_time = max (self .timepoint_to_epoch .keys ())
20132027 av_timestep = root_time / len (self .timepoint_to_epoch )
20142028 root_time += av_timestep # Add a root a bit older than the oldest ancestor
@@ -2017,15 +2031,15 @@ def run(self):
20172031 # line up. It's normally removed when processing the final tree sequence.
20182032 self .ancestor_data .add_ancestor (
20192033 start = 0 ,
2020- end = self .num_sites ,
2034+ end = self .num_sites - 1 ,
20212035 time = root_time + av_timestep ,
20222036 focal_sites = np .array ([], dtype = np .int32 ),
20232037 haplotype = a ,
20242038 )
20252039 # This is the "ultimate ancestor" of all zeros
20262040 self .ancestor_data .add_ancestor (
20272041 start = 0 ,
2028- end = self .num_sites ,
2042+ end = self .num_sites - 1 ,
20292043 time = root_time ,
20302044 focal_sites = np .array ([], dtype = np .int32 ),
20312045 haplotype = a ,
0 commit comments