From c585106da0b8b531c58a17c627e919c77429710d Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 7 Feb 2018 15:57:27 +0000 Subject: [PATCH 1/3] Reformat in line with the Google Java Style Guide. --- .../java/htsjdk/samtools/LinearBAMIndex.java | 21 +- .../java/htsjdk/samtools/SAMRecordHelper.java | 5 +- .../seqdoop/hadoop_bam/AnySAMInputFormat.java | 432 +++--- .../hadoop_bam/AnySAMOutputFormat.java | 59 +- .../seqdoop/hadoop_bam/BAMInputFormat.java | 1244 ++++++++--------- .../seqdoop/hadoop_bam/BAMOutputFormat.java | 23 +- .../seqdoop/hadoop_bam/BAMRecordReader.java | 372 ++--- .../seqdoop/hadoop_bam/BAMRecordWriter.java | 216 ++- .../seqdoop/hadoop_bam/BAMSplitGuesser.java | 708 +++++----- .../seqdoop/hadoop_bam/BCFRecordReader.java | 403 +++--- .../seqdoop/hadoop_bam/BCFRecordWriter.java | 256 ++-- .../seqdoop/hadoop_bam/BCFSplitGuesser.java | 819 +++++------ .../seqdoop/hadoop_bam/BaseSplitGuesser.java | 162 ++- .../seqdoop/hadoop_bam/CRAMInputFormat.java | 59 +- .../seqdoop/hadoop_bam/CRAMOutputFormat.java | 7 +- .../seqdoop/hadoop_bam/CRAMRecordReader.java | 12 +- .../seqdoop/hadoop_bam/CRAMRecordWriter.java | 180 ++- .../seqdoop/hadoop_bam/FastaInputFormat.java | 665 +++++---- .../seqdoop/hadoop_bam/FastqInputFormat.java | 725 +++++----- .../seqdoop/hadoop_bam/FastqOutputFormat.java | 274 ++-- .../seqdoop/hadoop_bam/FileVirtualSplit.java | 206 +-- .../seqdoop/hadoop_bam/FormatConstants.java | 63 +- .../seqdoop/hadoop_bam/FormatException.java | 14 +- .../KeyIgnoringAnySAMOutputFormat.java | 175 ++- .../KeyIgnoringBAMOutputFormat.java | 97 +- .../KeyIgnoringBAMRecordWriter.java | 41 +- .../KeyIgnoringBCFRecordWriter.java | 54 +- .../KeyIgnoringCRAMOutputFormat.java | 95 +- .../KeyIgnoringCRAMRecordWriter.java | 39 +- .../KeyIgnoringSAMRecordWriter.java | 53 +- .../KeyIgnoringVCFOutputFormat.java | 226 +-- .../KeyIgnoringVCFRecordWriter.java | 54 +- .../hadoop_bam/LazyBAMRecordFactory.java | 194 ++- .../hadoop_bam/LazyBCFGenotypesContext.java | 224 +-- .../LazyParsingGenotypesContext.java | 53 +- .../hadoop_bam/LazyVCFGenotypesContext.java | 178 +-- .../org/seqdoop/hadoop_bam/LineReader.java | 156 +-- .../seqdoop/hadoop_bam/QseqInputFormat.java | 795 +++++------ .../seqdoop/hadoop_bam/QseqOutputFormat.java | 305 ++-- .../seqdoop/hadoop_bam/ReferenceFragment.java | 218 +-- .../org/seqdoop/hadoop_bam/SAMFormat.java | 72 +- .../seqdoop/hadoop_bam/SAMInputFormat.java | 39 +- .../seqdoop/hadoop_bam/SAMRecordReader.java | 604 ++++---- .../seqdoop/hadoop_bam/SAMRecordWritable.java | 79 +- .../seqdoop/hadoop_bam/SAMRecordWriter.java | 113 +- .../seqdoop/hadoop_bam/SequencedFragment.java | 824 ++++++----- .../seqdoop/hadoop_bam/SplittingBAMIndex.java | 248 ++-- .../hadoop_bam/SplittingBAMIndexer.java | 648 +++++---- .../org/seqdoop/hadoop_bam/VCFFormat.java | 100 +- .../seqdoop/hadoop_bam/VCFInputFormat.java | 830 +++++------ .../seqdoop/hadoop_bam/VCFOutputFormat.java | 59 +- .../seqdoop/hadoop_bam/VCFRecordReader.java | 312 ++--- .../seqdoop/hadoop_bam/VCFRecordWriter.java | 254 ++-- .../hadoop_bam/VariantContextCodec.java | 621 ++++---- .../hadoop_bam/VariantContextWithHeader.java | 17 +- .../hadoop_bam/VariantContextWritable.java | 67 +- .../hadoop_bam/util/BGZFBlockIndex.java | 182 +-- .../hadoop_bam/util/BGZFBlockIndexer.java | 373 +++-- .../seqdoop/hadoop_bam/util/BGZFCodec.java | 31 +- .../util/BGZFCompressionOutputStream.java | 14 +- .../util/BGZFEnhancedGzipCodec.java | 59 +- .../util/BGZFSplitCompressionInputStream.java | 38 +- .../util/BGZFSplitFileInputFormat.java | 235 ++-- 
.../hadoop_bam/util/BGZFSplitGuesser.java | 274 ++-- .../seqdoop/hadoop_bam/util/ConfHelper.java | 75 +- .../hadoop_bam/util/DataInputWrapper.java | 49 +- .../hadoop_bam/util/DataOutputWrapper.java | 22 +- .../hadoop_bam/util/GetSortedBAMHeader.java | 49 +- .../seqdoop/hadoop_bam/util/IntervalUtil.java | 96 +- .../seqdoop/hadoop_bam/util/MurmurHash3.java | 362 +++-- .../seqdoop/hadoop_bam/util/NIOFileUtil.java | 67 +- .../hadoop_bam/util/SAMFileMerger.java | 74 +- .../hadoop_bam/util/SAMHeaderReader.java | 108 +- .../hadoop_bam/util/SAMOutputPreparer.java | 209 ++- .../hadoop_bam/util/VCFFileMerger.java | 56 +- .../hadoop_bam/util/VCFHeaderReader.java | 63 +- .../seqdoop/hadoop_bam/util/WrapSeekable.java | 123 +- .../org/seqdoop/hadoop_bam/BAMTestUtil.java | 50 +- .../seqdoop/hadoop_bam/IntervalUtilTest.java | 109 +- .../hadoop_bam/TestAnySAMInputFormat.java | 41 +- .../hadoop_bam/TestBAMInputFormat.java | 52 +- .../hadoop_bam/TestBAMOutputFormat.java | 550 ++++---- .../hadoop_bam/TestBAMSplitGuesser.java | 4 +- .../hadoop_bam/TestBGZFSplitGuesser.java | 17 +- .../hadoop_bam/TestCRAMInputFormat.java | 25 +- .../hadoop_bam/TestCRAMInputFormatOnHDFS.java | 91 +- .../hadoop_bam/TestCRAMOutputFormat.java | 510 ++++--- .../seqdoop/hadoop_bam/TestConfHelper.java | 101 +- .../hadoop_bam/TestFastaInputFormat.java | 68 +- .../hadoop_bam/TestFastqInputFormat.java | 1070 +++++++------- .../hadoop_bam/TestFastqOutputFormat.java | 337 +++-- .../seqdoop/hadoop_bam/TestLineReader.java | 95 +- .../hadoop_bam/TestQseqInputFormat.java | 698 +++++---- .../hadoop_bam/TestQseqOutputFormat.java | 271 ++-- .../org/seqdoop/hadoop_bam/TestSAMFormat.java | 8 +- .../hadoop_bam/TestSAMHeaderReader.java | 79 +- .../hadoop_bam/TestSAMInputFormat.java | 13 +- .../hadoop_bam/TestSequencedFragment.java | 654 +++++---- .../hadoop_bam/TestSplittingBAMIndexer.java | 18 +- .../org/seqdoop/hadoop_bam/TestVCFFormat.java | 8 +- .../hadoop_bam/TestVCFInputFormat.java | 282 ++-- .../TestVCFInputFormatStringency.java | 101 +- .../hadoop_bam/TestVCFOutputFormat.java | 291 ++-- .../seqdoop/hadoop_bam/TestVCFRoundTrip.java | 365 ++--- .../hadoop_bam/util/TestVCFHeaderReader.java | 16 +- src/test/resources/log4j.properties | 10 +- 106 files changed, 11803 insertions(+), 11459 deletions(-) diff --git a/src/main/java/htsjdk/samtools/LinearBAMIndex.java b/src/main/java/htsjdk/samtools/LinearBAMIndex.java index 7e80846..dbbedda 100644 --- a/src/main/java/htsjdk/samtools/LinearBAMIndex.java +++ b/src/main/java/htsjdk/samtools/LinearBAMIndex.java @@ -19,21 +19,16 @@ // IN THE SOFTWARE. package htsjdk.samtools; -import htsjdk.samtools.CachingBAMFileIndex; -import htsjdk.samtools.LinearIndex; -import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.seekablestream.SeekableStream; -/** - * The htsjdk APIs for accessing the linear BAM index are private... - */ +/** The htsjdk APIs for accessing the linear BAM index are private... 
*/ public class LinearBAMIndex extends CachingBAMFileIndex { - public LinearBAMIndex(SeekableStream stream, SAMSequenceDictionary dict) { - super(stream, dict); - } - - public LinearIndex getLinearIndex(int idx) { - return getQueryResults(idx).getLinearIndex(); - } + public LinearBAMIndex(SeekableStream stream, SAMSequenceDictionary dict) { + super(stream, dict); + } + + public LinearIndex getLinearIndex(int idx) { + return getQueryResults(idx).getLinearIndex(); + } } diff --git a/src/main/java/htsjdk/samtools/SAMRecordHelper.java b/src/main/java/htsjdk/samtools/SAMRecordHelper.java index 35184f0..ea02e94 100644 --- a/src/main/java/htsjdk/samtools/SAMRecordHelper.java +++ b/src/main/java/htsjdk/samtools/SAMRecordHelper.java @@ -1,10 +1,11 @@ package htsjdk.samtools; /** - * This class is required in order to access the protected - * {@link SAMRecord#eagerDecode()} method in HTSJDK. + * This class is required in order to access the protected {@link SAMRecord#eagerDecode()} method in + * HTSJDK. */ public class SAMRecordHelper { + public static void eagerDecode(SAMRecord record) { record.eagerDecode(); } diff --git a/src/main/java/org/seqdoop/hadoop_bam/AnySAMInputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/AnySAMInputFormat.java index 8d871e6..68d38f0 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/AnySAMInputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/AnySAMInputFormat.java @@ -27,7 +27,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -40,218 +39,231 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; -/** An {@link org.apache.hadoop.mapreduce.InputFormat} for SAM, BAM, and CRAM files. - * Values are the individual records; see {@link BAMRecordReader} for the - * meaning of the key. +/** + * An {@link org.apache.hadoop.mapreduce.InputFormat} for SAM, BAM, and CRAM files. Values are the + * individual records; see {@link BAMRecordReader} for the meaning of the key. * - *
<p>By default, files are recognized as SAM, BAM, or CRAM based on their file
- * extensions: see {@link #TRUST_EXTS_PROPERTY}. If that fails, or this
- * behaviour is disabled, the first byte of each file is read to determine the
- * file type.</p>
+ *

By default, files are recognized as SAM, BAM, or CRAM based on their file extensions: see + * {@link #TRUST_EXTS_PROPERTY}. If that fails, or this behaviour is disabled, the first byte of + * each file is read to determine the file type. */ -public class AnySAMInputFormat - extends FileInputFormat -{ - /** A Boolean property: are file extensions trusted? The default is - * true. - * - * @see SAMFormat#inferFromFilePath - */ - public static final String TRUST_EXTS_PROPERTY = - "hadoopbam.anysam.trust-exts"; - - private final BAMInputFormat bamIF = new BAMInputFormat(); - private final CRAMInputFormat cramIF = new CRAMInputFormat(); - private final SAMInputFormat samIF = new SAMInputFormat(); - - private final Map formatMap; - private final boolean givenMap; - - private Configuration conf; - - /** Creates a new input format, which will use the - * Configuration from the first public method called. Thus this - * will behave as though constructed with a Configuration - * directly, but only after it has received it in - * createRecordReader (via the TaskAttemptContext) - * or isSplitable or getSplits (via the - * JobContext). Until then, other methods will throw an {@link - * IllegalStateException}. - * - * This constructor exists mainly as a convenience, e.g. so that - * AnySAMInputFormat can be used directly in - * Job.setInputFormatClass. - */ - public AnySAMInputFormat() { - this(null, new HashMap<>(), false); - } - - /** Creates a new input format, reading {@link #TRUST_EXTS_PROPERTY} from - * the given Configuration. - */ - public AnySAMInputFormat(Configuration conf) { - this(conf, new HashMap<>(), false); - } - - private static boolean trustExtensions(Configuration conf) { - return conf.getBoolean(TRUST_EXTS_PROPERTY, true); - } - - /** Creates a new input format, trusting the given Map to - * define the file-to-format associations. Neither file paths nor their - * contents are looked at, only the Map is used. - * - *
<p>The Map is not copied, so it should not be modified while
- * this input format is in use!</p>
- * */ - public AnySAMInputFormat(Map formatMap) { - this(null, formatMap, true); - } - - private AnySAMInputFormat(Configuration conf, Map formatMap, boolean givenMap){ - this.formatMap = formatMap; - this.givenMap = givenMap; - this.conf = conf; - } - - /** Returns the {@link SAMFormat} corresponding to the given path. Returns - * null if it cannot be determined even based on the file - * contents (unless future SAM/BAM formats are very different, this means - * that the path does not refer to a SAM or BAM file). - * - *
<p>If this input format was constructed using a given
- * Map<Path,SAMFormat> and the path is not contained
- * within that map, throws an {@link IllegalArgumentException}.</p>
- */ - public SAMFormat getFormat(final Path path) throws PathNotFoundException { - SAMFormat fmt = formatMap.get(path); - if (fmt != null || formatMap.containsKey(path)) - return fmt; - - if (givenMap) - throw new IllegalArgumentException( - "SAM format for '"+path+"' not in given map"); - - if (this.conf == null) - throw new IllegalStateException("Don't have a Configuration yet"); - - if (trustExtensions(conf)) { - final SAMFormat f = SAMFormat.inferFromFilePath(path); - if (f != null) { - formatMap.put(path, f); - return f; - } - } - - try { - FileSystem fileSystem = path.getFileSystem(conf); - if (!fileSystem.exists(path)) { - throw new PathNotFoundException(path.toString()); - } - fmt = SAMFormat.inferFromData(fileSystem.open(path)); - } catch (IOException e) {} - - formatMap.put(path, fmt); - return fmt; - } - - /** Returns a {@link BAMRecordReader} or {@link SAMRecordReader} as - * appropriate, initialized with the given parameters. - * - *
<p>Throws {@link IllegalArgumentException} if the given input split is
- * not a {@link FileVirtualSplit} (used by {@link BAMInputFormat}) or a
- * {@link FileSplit} (used by {@link SAMInputFormat}), or if the path
- * referred to is not recognized as a SAM, BAM, or CRAM file (see {@link
- * #getFormat}).</p>
- */ - @Override public RecordReader - createRecordReader(InputSplit split, TaskAttemptContext ctx) - throws InterruptedException, IOException - { - final Path path; - if (split instanceof FileSplit) - path = ((FileSplit)split).getPath(); - else if (split instanceof FileVirtualSplit) - path = ((FileVirtualSplit)split).getPath(); - else - throw new IllegalArgumentException( - "split '"+split+"' has unknown type: cannot extract path"); - - if (this.conf == null) - this.conf = ctx.getConfiguration(); - - final SAMFormat fmt = getFormat(path); - if (fmt == null) - throw new IllegalArgumentException( - "unknown SAM format, cannot create RecordReader: "+path); - - switch (fmt) { - case SAM: return samIF.createRecordReader(split, ctx); - case BAM: return bamIF.createRecordReader(split, ctx); - case CRAM: return cramIF.createRecordReader(split, ctx); - default: assert false; return null; - } - } - - /** Defers to {@link BAMInputFormat}, {@link CRAMInputFormat}, or - * {@link SAMInputFormat} as appropriate for the given path. - */ - @Override public boolean isSplitable(JobContext job, Path path) { - if (this.conf == null) - this.conf = job.getConfiguration(); - - try { - final SAMFormat fmt = getFormat(path); - if (fmt == null) +public class AnySAMInputFormat extends FileInputFormat { + + /** + * A Boolean property: are file extensions trusted? The default is true. + * + * @see SAMFormat#inferFromFilePath + */ + public static final String TRUST_EXTS_PROPERTY = "hadoopbam.anysam.trust-exts"; + + private final BAMInputFormat bamIF = new BAMInputFormat(); + private final CRAMInputFormat cramIF = new CRAMInputFormat(); + private final SAMInputFormat samIF = new SAMInputFormat(); + + private final Map formatMap; + private final boolean givenMap; + + private Configuration conf; + + /** + * Creates a new input format, which will use the Configuration from the first public + * method called. Thus this will behave as though constructed with a Configuration + * directly, but only after it has received it in createRecordReader (via the + * TaskAttemptContext) or isSplitable or getSplits (via the + * JobContext). Until then, other methods will throw an {@link + * IllegalStateException}. + * + *

This constructor exists mainly as a convenience, e.g. so that AnySAMInputFormat + * can be used directly in Job.setInputFormatClass. + */ + public AnySAMInputFormat() { + this(null, new HashMap<>(), false); + } + + /** + * Creates a new input format, reading {@link #TRUST_EXTS_PROPERTY} from the given + * Configuration. + */ + public AnySAMInputFormat(Configuration conf) { + this(conf, new HashMap<>(), false); + } + + /** + * Creates a new input format, trusting the given Map to define the file-to-format + * associations. Neither file paths nor their contents are looked at, only the Map is + * used. + * + *

The Map is not copied, so it should not be modified while this input format is + * in use! + */ + public AnySAMInputFormat(Map formatMap) { + this(null, formatMap, true); + } + + private AnySAMInputFormat(Configuration conf, Map formatMap, boolean givenMap) { + this.formatMap = formatMap; + this.givenMap = givenMap; + this.conf = conf; + } + + private static boolean trustExtensions(Configuration conf) { + return conf.getBoolean(TRUST_EXTS_PROPERTY, true); + } + + /** + * Returns the {@link SAMFormat} corresponding to the given path. Returns null if it + * cannot be determined even based on the file contents (unless future SAM/BAM formats are very + * different, this means that the path does not refer to a SAM or BAM file). + * + *

If this input format was constructed using a given Map<Path,SAMFormat> + * and the path is not contained within that map, throws an {@link IllegalArgumentException}. + */ + public SAMFormat getFormat(final Path path) throws PathNotFoundException { + SAMFormat fmt = formatMap.get(path); + if (fmt != null || formatMap.containsKey(path)) { + return fmt; + } + + if (givenMap) { + throw new IllegalArgumentException("SAM format for '" + path + "' not in given map"); + } + + if (this.conf == null) { + throw new IllegalStateException("Don't have a Configuration yet"); + } + + if (trustExtensions(conf)) { + final SAMFormat f = SAMFormat.inferFromFilePath(path); + if (f != null) { + formatMap.put(path, f); + return f; + } + } + + try { + FileSystem fileSystem = path.getFileSystem(conf); + if (!fileSystem.exists(path)) { + throw new PathNotFoundException(path.toString()); + } + fmt = SAMFormat.inferFromData(fileSystem.open(path)); + } catch (IOException e) { + } + + formatMap.put(path, fmt); + return fmt; + } + + /** + * Returns a {@link BAMRecordReader} or {@link SAMRecordReader} as appropriate, initialized with + * the given parameters. + * + *

Throws {@link IllegalArgumentException} if the given input split is not a {@link + * FileVirtualSplit} (used by {@link BAMInputFormat}) or a {@link FileSplit} (used by {@link + * SAMInputFormat}), or if the path referred to is not recognized as a SAM, BAM, or CRAM file (see + * {@link #getFormat}). + */ + @Override + public RecordReader createRecordReader( + InputSplit split, TaskAttemptContext ctx) throws InterruptedException, IOException { + final Path path; + if (split instanceof FileSplit) { + path = ((FileSplit) split).getPath(); + } else if (split instanceof FileVirtualSplit) { + path = ((FileVirtualSplit) split).getPath(); + } else { + throw new IllegalArgumentException( + "split '" + split + "' has unknown type: cannot extract path"); + } + + if (this.conf == null) { + this.conf = ctx.getConfiguration(); + } + + final SAMFormat fmt = getFormat(path); + if (fmt == null) { + throw new IllegalArgumentException("unknown SAM format, cannot create RecordReader: " + path); + } + + switch (fmt) { + case SAM: + return samIF.createRecordReader(split, ctx); + case BAM: + return bamIF.createRecordReader(split, ctx); + case CRAM: + return cramIF.createRecordReader(split, ctx); + default: + assert false; + return null; + } + } + + /** + * Defers to {@link BAMInputFormat}, {@link CRAMInputFormat}, or {@link SAMInputFormat} as + * appropriate for the given path. + */ + @Override + public boolean isSplitable(JobContext job, Path path) { + if (this.conf == null) { + this.conf = job.getConfiguration(); + } + + try { + final SAMFormat fmt = getFormat(path); + if (fmt == null) { return super.isSplitable(job, path); + } - switch (fmt) { - case SAM: return samIF.isSplitable(job, path); - case BAM: return bamIF.isSplitable(job, path); - case CRAM: return cramIF.isSplitable(job, path); - default: assert false; return false; + switch (fmt) { + case SAM: + return samIF.isSplitable(job, path); + case BAM: + return bamIF.isSplitable(job, path); + case CRAM: + return cramIF.isSplitable(job, path); + default: + assert false; + return false; + } + } catch (PathNotFoundException e) { + return super.isSplitable(job, path); + } + } + + /** + * Defers to {@link BAMInputFormat} or {@link CRAMInputFormat} as appropriate for each individual + * path. SAM paths do not require special handling, so their splits are left unchanged. + */ + @Override + public List getSplits(JobContext job) throws IOException { + if (this.conf == null) { + this.conf = job.getConfiguration(); + } + + final List origSplits = BAMInputFormat.removeIndexFiles(super.getSplits(job)); + + // We have to partition the splits by input format and hand them over to + // the *InputFormats for any further handling. + // + // BAMInputFormat and CRAMInputFormat need to change the split boundaries, so we can + // just extract the BAM and CRAM ones and leave the rest as they are. + + final List bamOrigSplits = new ArrayList(origSplits.size()), + cramOrigSplits = new ArrayList(origSplits.size()), + newSplits = new ArrayList(origSplits.size()); + + for (final InputSplit iSplit : origSplits) { + final FileSplit split = (FileSplit) iSplit; + + if (SAMFormat.BAM.equals(getFormat(split.getPath()))) { + bamOrigSplits.add(split); + } else if (SAMFormat.CRAM.equals(getFormat(split.getPath()))) { + cramOrigSplits.add(split); + } else { + newSplits.add(split); } - } catch (PathNotFoundException e) { - return super.isSplitable(job, path); - } - } - - /** Defers to {@link BAMInputFormat} or {@link CRAMInputFormat} as appropriate for each - * individual path. 
SAM paths do not require special handling, so their splits are left - * unchanged. - */ - @Override public List getSplits(JobContext job) - throws IOException - { - if (this.conf == null) - this.conf = job.getConfiguration(); - - final List origSplits = - BAMInputFormat.removeIndexFiles(super.getSplits(job)); - - // We have to partition the splits by input format and hand them over to - // the *InputFormats for any further handling. - // - // BAMInputFormat and CRAMInputFormat need to change the split boundaries, so we can - // just extract the BAM and CRAM ones and leave the rest as they are. - - final List - bamOrigSplits = new ArrayList(origSplits.size()), - cramOrigSplits = new ArrayList(origSplits.size()), - newSplits = new ArrayList(origSplits.size()); - - for (final InputSplit iSplit : origSplits) { - final FileSplit split = (FileSplit)iSplit; - - if (SAMFormat.BAM.equals(getFormat(split.getPath()))) - bamOrigSplits.add(split); - else if (SAMFormat.CRAM.equals(getFormat(split.getPath()))) - cramOrigSplits.add(split); - else - newSplits.add(split); - } - newSplits.addAll(bamIF.getSplits(bamOrigSplits, job.getConfiguration())); - newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration())); - return newSplits; - } + } + newSplits.addAll(bamIF.getSplits(bamOrigSplits, job.getConfiguration())); + newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration())); + return newSplits; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/AnySAMOutputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/AnySAMOutputFormat.java index 538b842..403ca13 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/AnySAMOutputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/AnySAMOutputFormat.java @@ -25,34 +25,35 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -/** An abstract {@link org.apache.hadoop.mapreduce.OutputFormat} for SAM and - * BAM files. Only locks down the value type and stores the output format - * requested. +/** + * An abstract {@link org.apache.hadoop.mapreduce.OutputFormat} for SAM and BAM files. Only locks + * down the value type and stores the output format requested. */ -public abstract class AnySAMOutputFormat - extends FileOutputFormat -{ - /** A string property defining the output format to use. The value is read - * directly by {@link SAMFormat#valueOf}. - */ - public static final String OUTPUT_SAM_FORMAT_PROPERTY = - "hadoopbam.anysam.output-format"; - - protected SAMFormat format; - - /** Creates a new output format, reading {@link #OUTPUT_SAM_FORMAT_PROPERTY} - * from the given Configuration. - */ - protected AnySAMOutputFormat(Configuration conf) { - final String fmtStr = conf.get(OUTPUT_SAM_FORMAT_PROPERTY); - - format = fmtStr == null ? null : SAMFormat.valueOf(fmtStr); - } - - /** Creates a new output format for the given SAM format. */ - protected AnySAMOutputFormat(SAMFormat fmt) { - if (fmt == null) - throw new IllegalArgumentException("null SAMFormat"); - format = fmt; - } +public abstract class AnySAMOutputFormat extends FileOutputFormat { + + /** + * A string property defining the output format to use. The value is read directly by {@link + * SAMFormat#valueOf}. + */ + public static final String OUTPUT_SAM_FORMAT_PROPERTY = "hadoopbam.anysam.output-format"; + + protected SAMFormat format; + + /** + * Creates a new output format, reading {@link #OUTPUT_SAM_FORMAT_PROPERTY} from the given + * Configuration. 
+ */ + protected AnySAMOutputFormat(Configuration conf) { + final String fmtStr = conf.get(OUTPUT_SAM_FORMAT_PROPERTY); + + format = fmtStr == null ? null : SAMFormat.valueOf(fmtStr); + } + + /** Creates a new output format for the given SAM format. */ + protected AnySAMOutputFormat(SAMFormat fmt) { + if (fmt == null) { + throw new IllegalArgumentException("null SAMFormat"); + } + format = fmt; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BAMInputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/BAMInputFormat.java index 68a7640..147f7e4 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BAMInputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BAMInputFormat.java @@ -36,33 +36,24 @@ import htsjdk.samtools.SamInputResource; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.Locatable; +import java.io.IOException; +import java.nio.file.ProviderNotFoundException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedHashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; -import org.seqdoop.hadoop_bam.util.IntervalUtil; -import org.seqdoop.hadoop_bam.util.NIOFileUtil; -import org.seqdoop.hadoop_bam.util.SAMHeaderReader; -import org.seqdoop.hadoop_bam.util.WrapSeekable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.ProviderNotFoundException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; - -import htsjdk.samtools.seekablestream.SeekableStream; - -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapreduce.InputSplit; @@ -71,615 +62,624 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.seqdoop.hadoop_bam.util.IntervalUtil; +import org.seqdoop.hadoop_bam.util.NIOFileUtil; +import org.seqdoop.hadoop_bam.util.SAMHeaderReader; +import org.seqdoop.hadoop_bam.util.WrapSeekable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -/** An {@link org.apache.hadoop.mapreduce.InputFormat} for BAM files. Values - * are the individual records; see {@link BAMRecordReader} for the meaning of - * the key. +/** + * An {@link org.apache.hadoop.mapreduce.InputFormat} for BAM files. Values are the individual + * records; see {@link BAMRecordReader} for the meaning of the key. */ -public class BAMInputFormat - extends FileInputFormat -{ - private static final Logger logger = LoggerFactory.getLogger(BAMInputFormat.class); - - /** - * If set to true, only include reads that overlap the given intervals (if specified), - * and unplaced unmapped reads (if specified). For programmatic use - * {@link #setTraversalParameters(Configuration, List, boolean)} should be preferred. - */ - public static final String BOUNDED_TRAVERSAL_PROPERTY = "hadoopbam.bam.bounded-traversal"; - - /** - * If set to true, enables the use of BAM indices to calculate splits. 
- * For programmatic use - * {@link #setEnableBAISplitCalculator(Configuration, boolean)} should be preferred. - * By default, this split calculator is disabled in favor of the splitting-bai calculator. - */ - public static final String ENABLE_BAI_SPLIT_CALCULATOR = "hadoopbam.bam.enable-bai-splitter"; - - /** - * Filter by region, like -L in SAMtools. Takes a comma-separated - * list of intervals, e.g. chr1:1-20000,chr2:12000-20000. For - * programmatic use {@link #setIntervals(Configuration, List)} should be preferred. - */ - public static final String INTERVALS_PROPERTY = "hadoopbam.bam.intervals"; - - /** - * If set to true, include unplaced unmapped reads (that is, unmapped reads with no - * position). For programmatic use - * {@link #setTraversalParameters(Configuration, List, boolean)} should be preferred. - */ - public static final String TRAVERSE_UNPLACED_UNMAPPED_PROPERTY = "hadoopbam.bam.traverse-unplaced-unmapped"; - - /** - * Only include reads that overlap the given intervals. Unplaced unmapped reads are not - * included. - * @param conf the Hadoop configuration to set properties on - * @param intervals the intervals to filter by - * @param the {@link Locatable} type - */ - public static void setIntervals(Configuration conf, - List intervals) { - setTraversalParameters(conf, intervals, false); - } - - /** - * Enables or disables the split calculator that uses the BAM index to calculate splits. - */ - public static void setEnableBAISplitCalculator(Configuration conf, - boolean setEnabled) { - conf.setBoolean(ENABLE_BAI_SPLIT_CALCULATOR, setEnabled); +public class BAMInputFormat extends FileInputFormat { + + /** + * If set to true, only include reads that overlap the given intervals (if specified), and + * unplaced unmapped reads (if specified). For programmatic use {@link + * #setTraversalParameters(Configuration, List, boolean)} should be preferred. + */ + public static final String BOUNDED_TRAVERSAL_PROPERTY = "hadoopbam.bam.bounded-traversal"; + /** + * If set to true, enables the use of BAM indices to calculate splits. For programmatic use {@link + * #setEnableBAISplitCalculator(Configuration, boolean)} should be preferred. By default, this + * split calculator is disabled in favor of the splitting-bai calculator. + */ + public static final String ENABLE_BAI_SPLIT_CALCULATOR = "hadoopbam.bam.enable-bai-splitter"; + /** + * Filter by region, like -L in SAMtools. Takes a comma-separated list of intervals, + * e.g. chr1:1-20000,chr2:12000-20000. For programmatic use {@link + * #setIntervals(Configuration, List)} should be preferred. + */ + public static final String INTERVALS_PROPERTY = "hadoopbam.bam.intervals"; + /** + * If set to true, include unplaced unmapped reads (that is, unmapped reads with no position). For + * programmatic use {@link #setTraversalParameters(Configuration, List, boolean)} should be + * preferred. + */ + public static final String TRAVERSE_UNPLACED_UNMAPPED_PROPERTY = + "hadoopbam.bam.traverse-unplaced-unmapped"; + + private static final Logger logger = LoggerFactory.getLogger(BAMInputFormat.class); + + /** + * Only include reads that overlap the given intervals. Unplaced unmapped reads are not included. 
+ * + * @param conf the Hadoop configuration to set properties on + * @param intervals the intervals to filter by + * @param the {@link Locatable} type + */ + public static void setIntervals(Configuration conf, List intervals) { + setTraversalParameters(conf, intervals, false); + } + + /** Enables or disables the split calculator that uses the BAM index to calculate splits. */ + public static void setEnableBAISplitCalculator(Configuration conf, boolean setEnabled) { + conf.setBoolean(ENABLE_BAI_SPLIT_CALCULATOR, setEnabled); + } + + /** + * Only include reads that overlap the given intervals (if specified) and unplaced unmapped reads + * (if true). + * + * @param conf the Hadoop configuration to set properties on + * @param intervals the intervals to filter by, or null if all reads are to be + * included (in which case traverseUnplacedUnmapped must be true) + * @param traverseUnplacedUnmapped whether to included unplaced unampped reads + * @param the {@link Locatable} type + */ + public static void setTraversalParameters( + Configuration conf, List intervals, boolean traverseUnplacedUnmapped) { + if (intervals == null && !traverseUnplacedUnmapped) { + throw new IllegalArgumentException("Traversing mapped reads only is not supported."); + } + conf.setBoolean(BOUNDED_TRAVERSAL_PROPERTY, true); + if (intervals != null) { + StringBuilder sb = new StringBuilder(); + for (Iterator it = intervals.iterator(); it.hasNext(); ) { + Locatable l = it.next(); + sb.append(String.format("%s:%d-%d", l.getContig(), l.getStart(), l.getEnd())); + if (it.hasNext()) { + sb.append(","); + } + } + conf.set(INTERVALS_PROPERTY, sb.toString()); + } + conf.setBoolean(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY, traverseUnplacedUnmapped); + } + + /** + * Reset traversal parameters so that all reads are included. 
+ * + * @param conf the Hadoop configuration to set properties on + */ + public static void unsetTraversalParameters(Configuration conf) { + conf.unset(BOUNDED_TRAVERSAL_PROPERTY); + conf.unset(INTERVALS_PROPERTY); + conf.unset(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY); + } + + static boolean isBoundedTraversal(Configuration conf) { + return conf.getBoolean(BOUNDED_TRAVERSAL_PROPERTY, false) + || conf.get(INTERVALS_PROPERTY) != null; // backwards compatibility + } + + static boolean traverseUnplacedUnmapped(Configuration conf) { + return conf.getBoolean(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY, false); + } + + static List getIntervals(Configuration conf) { + return IntervalUtil.getIntervals(conf, INTERVALS_PROPERTY); + } + + static Path getIdxPath(Path path) { + return path.suffix(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); + } + + static List removeIndexFiles(List splits) { + // Remove any splitting bai files + return splits + .stream() + .filter( + split -> + !((FileSplit) split) + .getPath() + .getName() + .endsWith(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)) + .filter(split -> !((FileSplit) split).getPath().getName().endsWith(BAMIndex.BAMIndexSuffix)) + .collect(Collectors.toList()); + } + + static Path getBAIPath(Path path) { + return path.suffix(BAMIndex.BAMIndexSuffix); + } + + /** + * Converts a List of SimpleIntervals into the format required by the SamReader query API + * + * @param rawIntervals SimpleIntervals to be converted + * @return A sorted, merged list of QueryIntervals suitable for passing to the SamReader query API + */ + static QueryInterval[] prepareQueryIntervals( + final List rawIntervals, final SAMSequenceDictionary sequenceDictionary) { + if (rawIntervals == null || rawIntervals.isEmpty()) { + return null; + } + + // Convert each SimpleInterval to a QueryInterval + final QueryInterval[] convertedIntervals = + rawIntervals + .stream() + .map( + rawInterval -> + convertSimpleIntervalToQueryInterval(rawInterval, sequenceDictionary)) + .toArray(QueryInterval[]::new); + + // Intervals must be optimized (sorted and merged) in order to use the htsjdk query API + return QueryInterval.optimizeIntervals(convertedIntervals); + } + + /** + * Converts an interval in SimpleInterval format into an htsjdk QueryInterval. + * + *

In doing so, a header lookup is performed to convert from contig name to index + * + * @param interval interval to convert + * @param sequenceDictionary sequence dictionary used to perform the conversion + * @return an equivalent interval in QueryInterval format + */ + private static QueryInterval convertSimpleIntervalToQueryInterval( + final Interval interval, final SAMSequenceDictionary sequenceDictionary) { + if (interval == null) { + throw new IllegalArgumentException("interval may not be null"); + } + if (sequenceDictionary == null) { + throw new IllegalArgumentException("sequence dictionary may not be null"); + } + + final int contigIndex = sequenceDictionary.getSequenceIndex(interval.getContig()); + if (contigIndex == -1) { + throw new IllegalArgumentException( + "Contig " + interval.getContig() + " not present in reads sequence " + "dictionary"); + } + + return new QueryInterval(contigIndex, interval.getStart(), interval.getEnd()); + } + + /** Returns a {@link BAMRecordReader} initialized with the parameters. */ + @Override + public RecordReader createRecordReader( + InputSplit split, TaskAttemptContext ctx) throws InterruptedException, IOException { + final RecordReader rr = new BAMRecordReader(); + rr.initialize(split, ctx); + return rr; + } + + /** The splits returned are {@link FileVirtualSplit FileVirtualSplits}. */ + @Override + public List getSplits(JobContext job) throws IOException { + return getSplits(super.getSplits(job), job.getConfiguration()); + } + + public List getSplits(List splits, Configuration cfg) throws IOException { + + final List origSplits = removeIndexFiles(splits); + + // Align the splits so that they don't cross blocks. + + // addIndexedSplits() requires the given splits to be sorted by file + // path, so do so. Although FileInputFormat.getSplits() does, at the time + // of writing this, generate them in that order, we shouldn't rely on it. + Collections.sort( + origSplits, + new Comparator() { + public int compare(InputSplit a, InputSplit b) { + FileSplit fa = (FileSplit) a, fb = (FileSplit) b; + return fa.getPath().compareTo(fb.getPath()); + } + }); + + final List newSplits = new ArrayList(origSplits.size()); + + for (int i = 0; i < origSplits.size(); ) { + try { + i = addIndexedSplits(origSplits, i, newSplits, cfg); + } catch (IOException | ProviderNotFoundException e) { + if (cfg.getBoolean(ENABLE_BAI_SPLIT_CALCULATOR, false)) { + try { + i = addBAISplits(origSplits, i, newSplits, cfg); + } catch (IOException | ProviderNotFoundException e2) { + i = addProbabilisticSplits(origSplits, i, newSplits, cfg); + } + } else { + i = addProbabilisticSplits(origSplits, i, newSplits, cfg); + } + } + } + return filterByInterval(newSplits, cfg); + } + + // Handles all the splits that share the Path of the one at index i, + // returning the next index to be used. 
+ private int addIndexedSplits( + List splits, int i, List newSplits, Configuration cfg) + throws IOException { + final Path file = ((FileSplit) splits.get(i)).getPath(); + List potentialSplits = new ArrayList(); + + final SplittingBAMIndex idx = + new SplittingBAMIndex(file.getFileSystem(cfg).open(getIdxPath(file))); + + int splitsEnd = splits.size(); + for (int j = i; j < splitsEnd; ++j) { + if (!file.equals(((FileSplit) splits.get(j)).getPath())) { + splitsEnd = j; + } + } + + if (idx.size() == 1) { // no alignments, only the file size, so no splits to add + return splitsEnd; + } + + for (int j = i; j < splitsEnd; ++j) { + final FileSplit fileSplit = (FileSplit) splits.get(j); + + final long start = fileSplit.getStart(); + final long end = start + fileSplit.getLength(); + + final Long blockStart = idx.nextAlignment(start); + + // The last split needs to end where the last alignment ends, but the + // index doesn't store that data (whoops); we only know where the last + // alignment begins. Fortunately there's no need to change the index + // format for this: we can just set the end to the maximal length of + // the final BGZF block (0xffff), and then read until BAMRecordCodec + // hits EOF. + Long blockEnd; + if (j == splitsEnd - 1) { + blockEnd = idx.prevAlignment(end) | 0xffff; + } else { + blockEnd = idx.nextAlignment(end); + } + + if (blockStart == null || blockEnd == null) { + logger.warn("Index for {} was not good. Generating probabilistic splits.", file); + return addProbabilisticSplits(splits, i, newSplits, cfg); + } + + potentialSplits.add( + new FileVirtualSplit(file, blockStart, blockEnd, fileSplit.getLocations())); + } + + for (InputSplit s : potentialSplits) { + newSplits.add(s); + } + return splitsEnd; + } + + // Handles all the splits that share the Path of the one at index i, + // returning the next index to be used. 
+ private int addBAISplits( + List splits, int i, List newSplits, Configuration conf) + throws IOException { + final Path path = ((FileSplit) splits.get(i)).getPath(); + FileSystem fs = path.getFileSystem(conf); + int splitsEnd = i; + + try (FSDataInputStream in = fs.open(path)) { + SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf); + SAMSequenceDictionary dict = header.getSequenceDictionary(); + + final SeekableStream guesserSin = WrapSeekable.openPath(fs, path); + final BAMSplitGuesser guesser = new BAMSplitGuesser(guesserSin, conf); + + final SeekableStream sin; + if (fs.exists(getBAIPath(path))) { + sin = WrapSeekable.openPath(fs, getBAIPath(path)); + } else { + sin = + WrapSeekable.openPath( + fs, new Path(path.toString().replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix))); + } + final LinearBAMIndex idx = new LinearBAMIndex(sin, dict); + + // searches for the first contig that contains linear bins + // a contig will have no linear bins if there are no reads mapped to that + // contig (e.g., reads were aligned to a whole genome, and then reads from + // only a single contig were selected) + int ctgIdx = -1; + int bin = 0; + LinearIndex linIdx; + int ctgBins; + long lastStart = 0; + do { + ctgIdx++; + linIdx = idx.getLinearIndex(ctgIdx); + ctgBins = linIdx.size(); + } while (ctgBins == 0); + long nextStart = linIdx.get(bin); + + FileVirtualSplit newSplit = null; + boolean lastWasGuessed = false; + + // loop and process all of the splits that share a single .bai + while (splitsEnd < splits.size() && ((FileSplit) (splits.get(splitsEnd))).getPath() == path) { + FileSplit fSplit = (FileSplit) splits.get(splitsEnd); + splitsEnd++; + + if (splitsEnd >= splits.size()) { + break; } - /** - * Only include reads that overlap the given intervals (if specified) and unplaced - * unmapped reads (if true). - * @param conf the Hadoop configuration to set properties on - * @param intervals the intervals to filter by, or null if all reads - * are to be included (in which case traverseUnplacedUnmapped must be - * true) - * @param traverseUnplacedUnmapped whether to included unplaced unampped reads - * @param the {@link Locatable} type - */ - public static void setTraversalParameters(Configuration conf, - List intervals, boolean traverseUnplacedUnmapped) { - if (intervals == null && !traverseUnplacedUnmapped) { - throw new IllegalArgumentException("Traversing mapped reads only is not supported."); - } - conf.setBoolean(BOUNDED_TRAVERSAL_PROPERTY, true); - if (intervals != null) { - StringBuilder sb = new StringBuilder(); - for (Iterator it = intervals.iterator(); it.hasNext(); ) { - Locatable l = it.next(); - sb.append(String.format("%s:%d-%d", l.getContig(), l.getStart(), l.getEnd())); - if (it.hasNext()) { - sb.append(","); - } - } - conf.set(INTERVALS_PROPERTY, sb.toString()); - } - conf.setBoolean(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY, traverseUnplacedUnmapped); - } - - /** - * Reset traversal parameters so that all reads are included. 
- * @param conf the Hadoop configuration to set properties on - */ - public static void unsetTraversalParameters(Configuration conf) { - conf.unset(BOUNDED_TRAVERSAL_PROPERTY); - conf.unset(INTERVALS_PROPERTY); - conf.unset(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY); - } - - static boolean isBoundedTraversal(Configuration conf) { - return conf.getBoolean(BOUNDED_TRAVERSAL_PROPERTY, false) || - conf.get(INTERVALS_PROPERTY) != null; // backwards compatibility - } - - static boolean traverseUnplacedUnmapped(Configuration conf) { - return conf.getBoolean(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY, false); - } - - static List getIntervals(Configuration conf) { - return IntervalUtil.getIntervals(conf, INTERVALS_PROPERTY); - } - - static Path getIdxPath(Path path) { - return path.suffix(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); - } - - static List removeIndexFiles(List splits) { - // Remove any splitting bai files - return splits.stream() - .filter(split -> !((FileSplit) split).getPath().getName().endsWith( - SplittingBAMIndexer.OUTPUT_FILE_EXTENSION)) - .filter(split -> !((FileSplit) split).getPath().getName().endsWith( - BAMIndex.BAMIndexSuffix)) - .collect(Collectors.toList()); + long fSplitEnd = (fSplit.getStart() + fSplit.getLength()) << 16; + lastStart = nextStart; + + // we need to advance and find the first linear index bin + // that starts after the current split ends. + // this is the end of our split. + while (nextStart < fSplitEnd && ctgIdx < dict.size()) { + + // are we going off of the end of this contig? + // if so, advance to the next contig with a linear bin + if (bin + 1 >= ctgBins) { + do { + ctgIdx += 1; + bin = 0; + if (ctgIdx >= dict.size()) { + break; + } + linIdx = idx.getLinearIndex(ctgIdx); + ctgBins = linIdx.size(); + } while (ctgBins == 0); + } + if (ctgIdx < dict.size() && linIdx.size() > bin) { + nextStart = linIdx.get(bin); + bin++; + } } - - static Path getBAIPath(Path path) { - return path.suffix(BAMIndex.BAMIndexSuffix); - } - - /** Returns a {@link BAMRecordReader} initialized with the parameters. */ - @Override public RecordReader - createRecordReader(InputSplit split, TaskAttemptContext ctx) - throws InterruptedException, IOException - { - final RecordReader rr = - new BAMRecordReader(); - rr.initialize(split, ctx); - return rr; - } - - /** The splits returned are {@link FileVirtualSplit FileVirtualSplits}. */ - @Override public List getSplits(JobContext job) - throws IOException - { - return getSplits(super.getSplits(job), job.getConfiguration()); - } - - public List getSplits( - List splits, Configuration cfg) - throws IOException - { - - final List origSplits = removeIndexFiles(splits); - - // Align the splits so that they don't cross blocks. - - // addIndexedSplits() requires the given splits to be sorted by file - // path, so do so. Although FileInputFormat.getSplits() does, at the time - // of writing this, generate them in that order, we shouldn't rely on it. 
- Collections.sort(origSplits, new Comparator() { - public int compare(InputSplit a, InputSplit b) { - FileSplit fa = (FileSplit)a, fb = (FileSplit)b; - return fa.getPath().compareTo(fb.getPath()); - } - }); - - final List newSplits = - new ArrayList(origSplits.size()); - - for (int i = 0; i < origSplits.size();) { - try { - i = addIndexedSplits (origSplits, i, newSplits, cfg); - } catch (IOException | ProviderNotFoundException e) { - if (cfg.getBoolean(ENABLE_BAI_SPLIT_CALCULATOR, false)) { - try { - i = addBAISplits (origSplits, i, newSplits, cfg); - } catch (IOException | ProviderNotFoundException e2) { - i = addProbabilisticSplits (origSplits, i, newSplits, cfg); - } - } else { - i = addProbabilisticSplits (origSplits, i, newSplits, cfg); - } - } - } - return filterByInterval(newSplits, cfg); - } - - // Handles all the splits that share the Path of the one at index i, - // returning the next index to be used. - private int addIndexedSplits( - List splits, int i, List newSplits, - Configuration cfg) - throws IOException - { - final Path file = ((FileSplit)splits.get(i)).getPath(); - List potentialSplits = new ArrayList(); - - final SplittingBAMIndex idx = new SplittingBAMIndex( - file.getFileSystem(cfg).open(getIdxPath(file))); - - int splitsEnd = splits.size(); - for (int j = i; j < splitsEnd; ++j) - if (!file.equals(((FileSplit)splits.get(j)).getPath())) - splitsEnd = j; - - if (idx.size() == 1) { // no alignments, only the file size, so no splits to add - return splitsEnd; - } - - for (int j = i; j < splitsEnd; ++j) { - final FileSplit fileSplit = (FileSplit)splits.get(j); - - final long start = fileSplit.getStart(); - final long end = start + fileSplit.getLength(); - - final Long blockStart = idx.nextAlignment(start); - - // The last split needs to end where the last alignment ends, but the - // index doesn't store that data (whoops); we only know where the last - // alignment begins. Fortunately there's no need to change the index - // format for this: we can just set the end to the maximal length of - // the final BGZF block (0xffff), and then read until BAMRecordCodec - // hits EOF. - Long blockEnd; - if (j == splitsEnd - 1) { - blockEnd = idx.prevAlignment(end) | 0xffff; - } else { - blockEnd = idx.nextAlignment(end); - } - - if (blockStart == null || blockEnd == null) { - logger.warn("Index for {} was not good. Generating probabilistic splits.", file); - return addProbabilisticSplits(splits, i, newSplits, cfg); - } - - potentialSplits.add(new FileVirtualSplit( - file, blockStart, blockEnd, fileSplit.getLocations())); - } - - for (InputSplit s : potentialSplits) { - newSplits.add(s); - } - return splitsEnd; - } - - // Handles all the splits that share the Path of the one at index i, - // returning the next index to be used. 
- private int addBAISplits(List splits, - int i, - List newSplits, - Configuration conf) throws IOException { - final Path path = ((FileSplit)splits.get(i)).getPath(); - FileSystem fs = path.getFileSystem(conf); - int splitsEnd = i; - - try (FSDataInputStream in = fs.open(path)) { - SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf); - SAMSequenceDictionary dict = header.getSequenceDictionary(); - - final SeekableStream guesserSin = - WrapSeekable.openPath(fs, path); - final BAMSplitGuesser guesser = new BAMSplitGuesser(guesserSin, conf); - - final SeekableStream sin; - if (fs.exists(getBAIPath(path))) { - sin = WrapSeekable.openPath(fs, getBAIPath(path)); - } else { - sin = WrapSeekable.openPath(fs, new Path(path.toString() - .replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix))); - } - final LinearBAMIndex idx = new LinearBAMIndex(sin, dict); - - // searches for the first contig that contains linear bins - // a contig will have no linear bins if there are no reads mapped to that - // contig (e.g., reads were aligned to a whole genome, and then reads from - // only a single contig were selected) - int ctgIdx = -1; - int bin = 0; - LinearIndex linIdx; - int ctgBins; - long lastStart = 0; - do { - ctgIdx++; - linIdx = idx.getLinearIndex(ctgIdx); - ctgBins = linIdx.size(); - } while(ctgBins == 0); - long nextStart = linIdx.get(bin); - - FileVirtualSplit newSplit = null; - boolean lastWasGuessed = false; - - // loop and process all of the splits that share a single .bai - while(splitsEnd < splits.size() && - ((FileSplit)(splits.get(splitsEnd))).getPath() == path) { - FileSplit fSplit = (FileSplit)splits.get(splitsEnd); - splitsEnd++; - - if (splitsEnd >= splits.size()) { - break; - } - - long fSplitEnd = (fSplit.getStart() + fSplit.getLength()) << 16; - lastStart = nextStart; - - // we need to advance and find the first linear index bin - // that starts after the current split ends. - // this is the end of our split. - while(nextStart < fSplitEnd && ctgIdx < dict.size()) { - - // are we going off of the end of this contig? - // if so, advance to the next contig with a linear bin - if (bin + 1 >= ctgBins) { - do { - ctgIdx += 1; - bin = 0; - if (ctgIdx >= dict.size()) { - break; - } - linIdx = idx.getLinearIndex(ctgIdx); - ctgBins = linIdx.size(); - } while (ctgBins == 0); - } - if (ctgIdx < dict.size() && linIdx.size() > bin) { - nextStart = linIdx.get(bin); - bin++; - } - } - - // is this the first split? - // if so, split ranges from where the reads start until the identified end - if (fSplit.getStart() == 0) { - final SeekableStream inFile = - WrapSeekable.openPath(path.getFileSystem(conf), path); - SamReader open = SamReaderFactory.makeDefault().setUseAsyncIo(false) - .open(SamInputResource.of(inFile)); - SAMFileSpan span = open.indexing().getFilePointerSpanningReads(); - long bamStart = ((BAMFileSpan) span).getFirstOffset(); - newSplit = new FileVirtualSplit(fSplit.getPath(), - bamStart, - nextStart - 1, - fSplit.getLocations()); - newSplits.add(newSplit); - } else { - - // did we find any blocks that started in the last split? 
- // if yes, then we're fine - // if no, then we need to guess a split start (in the else clause) - if (lastStart != nextStart) { - if (lastWasGuessed) { - newSplit.setEndVirtualOffset(lastStart - 1); - lastWasGuessed = false; - } - newSplit = new FileVirtualSplit(fSplit.getPath(), - lastStart, - nextStart - 1, - fSplit.getLocations()); - newSplits.add(newSplit); - } else { - // guess the start - long alignedBeg = guesser.guessNextBAMRecordStart(fSplit.getStart(), - fSplit.getStart() + fSplit.getLength()); - newSplit.setEndVirtualOffset(alignedBeg - 1); - lastStart = alignedBeg; - nextStart = alignedBeg; - newSplit = new FileVirtualSplit(fSplit.getPath(), - alignedBeg, - alignedBeg + 1, - fSplit.getLocations()); - lastWasGuessed = true; - newSplits.add(newSplit); - } - } - lastStart = nextStart; - } - // clean up the last split - if (splitsEnd == splits.size()) { - if (lastWasGuessed) { - newSplit.setEndVirtualOffset(lastStart - 1); - lastWasGuessed = false; - } - FileSplit fSplit = (FileSplit)splits.get(splitsEnd - 1); - long fSplitEnd = (fSplit.getStart() + fSplit.getLength()) << 16; - newSplit = new FileVirtualSplit(fSplit.getPath(), - lastStart, - fSplitEnd, - fSplit.getLocations()); - newSplits.add(newSplit); - } - } - return splitsEnd + 1; + + // is this the first split? + // if so, split ranges from where the reads start until the identified end + if (fSplit.getStart() == 0) { + final SeekableStream inFile = WrapSeekable.openPath(path.getFileSystem(conf), path); + SamReader open = + SamReaderFactory.makeDefault().setUseAsyncIo(false).open(SamInputResource.of(inFile)); + SAMFileSpan span = open.indexing().getFilePointerSpanningReads(); + long bamStart = ((BAMFileSpan) span).getFirstOffset(); + newSplit = + new FileVirtualSplit( + fSplit.getPath(), bamStart, nextStart - 1, fSplit.getLocations()); + newSplits.add(newSplit); + } else { + + // did we find any blocks that started in the last split? + // if yes, then we're fine + // if no, then we need to guess a split start (in the else clause) + if (lastStart != nextStart) { + if (lastWasGuessed) { + newSplit.setEndVirtualOffset(lastStart - 1); + lastWasGuessed = false; + } + newSplit = + new FileVirtualSplit( + fSplit.getPath(), lastStart, nextStart - 1, fSplit.getLocations()); + newSplits.add(newSplit); + } else { + // guess the start + long alignedBeg = + guesser.guessNextBAMRecordStart( + fSplit.getStart(), fSplit.getStart() + fSplit.getLength()); + newSplit.setEndVirtualOffset(alignedBeg - 1); + lastStart = alignedBeg; + nextStart = alignedBeg; + newSplit = + new FileVirtualSplit( + fSplit.getPath(), alignedBeg, alignedBeg + 1, fSplit.getLocations()); + lastWasGuessed = true; + newSplits.add(newSplit); + } + } + lastStart = nextStart; + } + // clean up the last split + if (splitsEnd == splits.size()) { + if (lastWasGuessed) { + newSplit.setEndVirtualOffset(lastStart - 1); + lastWasGuessed = false; + } + FileSplit fSplit = (FileSplit) splits.get(splitsEnd - 1); + long fSplitEnd = (fSplit.getStart() + fSplit.getLength()) << 16; + newSplit = + new FileVirtualSplit(fSplit.getPath(), lastStart, fSplitEnd, fSplit.getLocations()); + newSplits.add(newSplit); + } + } + return splitsEnd + 1; + } + + // Works the same way as addIndexedSplits, to avoid having to reopen the + // file repeatedly and checking addIndexedSplits for an index repeatedly. 
+ private int addProbabilisticSplits( + List splits, int i, List newSplits, Configuration cfg) + throws IOException { + final Path path = ((FileSplit) splits.get(i)).getPath(); + final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(cfg), path); + + final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg); + + FileVirtualSplit previousSplit = null; + + for (; i < splits.size(); ++i) { + FileSplit fspl = (FileSplit) splits.get(i); + if (!fspl.getPath().equals(path)) { + break; + } + + long beg = fspl.getStart(); + long end = beg + fspl.getLength(); + + long alignedBeg = guesser.guessNextBAMRecordStart(beg, end); + + // As the guesser goes to the next BGZF block before looking for BAM + // records, the ending BGZF blocks have to always be traversed fully. + // Hence force the length to be 0xffff, the maximum possible. + long alignedEnd = end << 16 | 0xffff; + + if (alignedBeg == end) { + // No records detected in this split: merge it to the previous one. + // This could legitimately happen e.g. if we have a split that is + // so small that it only contains the middle part of a BGZF block. + // + // Of course, if it's the first split, then this is simply not a + // valid BAM file. + // + // FIXME: In theory, any number of splits could only contain parts + // of the BAM header before we start to see splits that contain BAM + // records. For now, we require that the split size is at least as + // big as the header and don't handle that case. + if (previousSplit == null) { + throw new IOException( + "'" + path + "': " + "no reads in first split: bad BAM file or tiny split size?"); + } + + previousSplit.setEndVirtualOffset(alignedEnd); + } else { + previousSplit = new FileVirtualSplit(path, alignedBeg, alignedEnd, fspl.getLocations()); + if (logger.isDebugEnabled()) { + final long byteOffset = alignedBeg >>> 16; + final long recordOffset = alignedBeg & 0xffff; + logger.debug( + "Split {}: byte offset: {} record offset: {}, virtual offset: {}", + i, + byteOffset, + recordOffset, + alignedBeg); + } + newSplits.add(previousSplit); + } + } + + sin.close(); + return i; + } + + private List filterByInterval(List splits, Configuration conf) + throws IOException { + if (!isBoundedTraversal(conf)) { + return splits; + } + + // Get the chunk lists (BAMFileSpans) in the intervals we want (chunks give start + // and end file pointers into a BAM file) by looking in all the indexes for the BAM + // files + Set bamFiles = new LinkedHashSet<>(); + for (InputSplit split : splits) { + bamFiles.add(((FileVirtualSplit) split).getPath()); + } + Map fileToSpan = new LinkedHashMap<>(); + SamReaderFactory readerFactory = + SamReaderFactory.makeDefault() + .setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, true) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setUseAsyncIo(false); + + List intervals = getIntervals(conf); + + Map fileToUnmapped = new LinkedHashMap<>(); + boolean traverseUnplacedUnmapped = traverseUnplacedUnmapped(conf); + + for (Path bamFile : bamFiles) { + FileSystem fs = bamFile.getFileSystem(conf); + + try (SamReader samReader = + readerFactory.open(NIOFileUtil.asPath(fs.makeQualified(bamFile).toUri()))) { + if (!samReader.hasIndex()) { + throw new IllegalArgumentException( + "Intervals set but no BAM index file found for " + bamFile); } - - // Works the same way as addIndexedSplits, to avoid having to reopen the - // file repeatedly and checking addIndexedSplits for an index repeatedly. 
- private int addProbabilisticSplits( - List splits, int i, List newSplits, - Configuration cfg) - throws IOException - { - final Path path = ((FileSplit)splits.get(i)).getPath(); - final SeekableStream sin = - WrapSeekable.openPath(path.getFileSystem(cfg), path); - - final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg); - - FileVirtualSplit previousSplit = null; - - for (; i < splits.size(); ++i) { - FileSplit fspl = (FileSplit)splits.get(i); - if (!fspl.getPath().equals(path)) - break; - - long beg = fspl.getStart(); - long end = beg + fspl.getLength(); - - long alignedBeg = guesser.guessNextBAMRecordStart(beg, end); - - // As the guesser goes to the next BGZF block before looking for BAM - // records, the ending BGZF blocks have to always be traversed fully. - // Hence force the length to be 0xffff, the maximum possible. - long alignedEnd = end << 16 | 0xffff; - - if (alignedBeg == end) { - // No records detected in this split: merge it to the previous one. - // This could legitimately happen e.g. if we have a split that is - // so small that it only contains the middle part of a BGZF block. - // - // Of course, if it's the first split, then this is simply not a - // valid BAM file. - // - // FIXME: In theory, any number of splits could only contain parts - // of the BAM header before we start to see splits that contain BAM - // records. For now, we require that the split size is at least as - // big as the header and don't handle that case. - if (previousSplit == null) - throw new IOException("'" + path + "': "+ - "no reads in first split: bad BAM file or tiny split size?"); - - previousSplit.setEndVirtualOffset(alignedEnd); - } else { - previousSplit = new FileVirtualSplit( - path, alignedBeg, alignedEnd, fspl.getLocations()); - if (logger.isDebugEnabled()) { - final long byteOffset = alignedBeg >>> 16; - final long recordOffset = alignedBeg & 0xffff; - logger.debug( - "Split {}: byte offset: {} record offset: {}, virtual offset: {}", - i, byteOffset, recordOffset, alignedBeg); - } - newSplits.add(previousSplit); - } - } - - sin.close(); - return i; - } - - private List filterByInterval(List splits, Configuration conf) - throws IOException { - if (!isBoundedTraversal(conf)) { - return splits; - } - - // Get the chunk lists (BAMFileSpans) in the intervals we want (chunks give start - // and end file pointers into a BAM file) by looking in all the indexes for the BAM - // files - Set bamFiles = new LinkedHashSet<>(); - for (InputSplit split : splits) { - bamFiles.add(((FileVirtualSplit) split).getPath()); - } - Map fileToSpan = new LinkedHashMap<>(); - SamReaderFactory readerFactory = SamReaderFactory.makeDefault() - .setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, true) - .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) - .setUseAsyncIo(false); - - List intervals = getIntervals(conf); - - Map fileToUnmapped = new LinkedHashMap<>(); - boolean traverseUnplacedUnmapped = traverseUnplacedUnmapped(conf); - - for (Path bamFile : bamFiles) { - FileSystem fs = bamFile.getFileSystem(conf); - - try (SamReader samReader = - readerFactory.open(NIOFileUtil.asPath(fs.makeQualified(bamFile).toUri()))) { - if (!samReader.hasIndex()) { - throw new IllegalArgumentException("Intervals set but no BAM index file found for " + bamFile); - - } - - try (FSDataInputStream in = fs.open(bamFile)) { - SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf); - SAMSequenceDictionary dict = header.getSequenceDictionary(); - BAMIndex idx = samReader.indexing().getIndex(); - - 
if (intervals != null && !intervals.isEmpty()) { - QueryInterval[] queryIntervals = prepareQueryIntervals(intervals, dict); - fileToSpan.put(bamFile, BAMFileReader.getFileSpan(queryIntervals, idx)); - } - - if (traverseUnplacedUnmapped) { - long startOfLastLinearBin = idx.getStartOfLastLinearBin(); - long noCoordinateCount = ((AbstractBAMFileIndex) idx).getNoCoordinateCount(); - if (startOfLastLinearBin != -1 && noCoordinateCount > 0) { - // add FileVirtualSplit (with no intervals) from startOfLastLinearBin to - // end of file - fileToUnmapped.put(bamFile, startOfLastLinearBin); - } - } - } - - } - } - - // Use the chunks to filter the splits - List filteredSplits = new ArrayList<>(); - for (InputSplit split : splits) { - FileVirtualSplit virtualSplit = (FileVirtualSplit) split; - long splitStart = virtualSplit.getStartVirtualOffset(); - long splitEnd = virtualSplit.getEndVirtualOffset(); - BAMFileSpan splitSpan = new BAMFileSpan(new Chunk(splitStart, splitEnd)); - BAMFileSpan span = fileToSpan.get(virtualSplit.getPath()); - if (span == null) { - continue; - } - span = (BAMFileSpan) span.removeContentsBefore(splitSpan); - span = (BAMFileSpan) span.removeContentsAfter(splitSpan); - if (!span.getChunks().isEmpty()) { - filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), splitStart, splitEnd, - virtualSplit.getLocations(), span.toCoordinateArray())); - } - } - - if (traverseUnplacedUnmapped) { - // add extra splits that contain only unmapped reads - for (Map.Entry e : fileToUnmapped.entrySet()) { - Path file = e.getKey(); - long unmappedStart = e.getValue(); - boolean foundFirstSplit = false; - for (InputSplit split : splits) { // TODO: are splits in order of start position? - FileVirtualSplit virtualSplit = (FileVirtualSplit) split; - if (virtualSplit.getPath().equals(file)) { - long splitStart = virtualSplit.getStartVirtualOffset(); - long splitEnd = virtualSplit.getEndVirtualOffset(); - if (foundFirstSplit) { - filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), splitStart, splitEnd, - virtualSplit.getLocations())); - } else if (splitStart <= unmappedStart && unmappedStart <= splitEnd) { - filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), unmappedStart, splitEnd, - virtualSplit.getLocations())); - foundFirstSplit = true; - } - } - } - } - } - - return filteredSplits; - } - - /** - * Converts a List of SimpleIntervals into the format required by the SamReader query API - * @param rawIntervals SimpleIntervals to be converted - * @return A sorted, merged list of QueryIntervals suitable for passing to the SamReader query API - */ - static QueryInterval[] prepareQueryIntervals( final List - rawIntervals, final SAMSequenceDictionary sequenceDictionary ) { - if ( rawIntervals == null || rawIntervals.isEmpty() ) { - return null; - } - - // Convert each SimpleInterval to a QueryInterval - final QueryInterval[] convertedIntervals = - rawIntervals.stream() - .map(rawInterval -> convertSimpleIntervalToQueryInterval(rawInterval, sequenceDictionary)) - .toArray(QueryInterval[]::new); - - // Intervals must be optimized (sorted and merged) in order to use the htsjdk query API - return QueryInterval.optimizeIntervals(convertedIntervals); - } - /** - * Converts an interval in SimpleInterval format into an htsjdk QueryInterval. 
- * - * In doing so, a header lookup is performed to convert from contig name to index - * - * @param interval interval to convert - * @param sequenceDictionary sequence dictionary used to perform the conversion - * @return an equivalent interval in QueryInterval format - */ - private static QueryInterval convertSimpleIntervalToQueryInterval( final Interval interval, final SAMSequenceDictionary sequenceDictionary ) { - if (interval == null) { - throw new IllegalArgumentException("interval may not be null"); - } - if (sequenceDictionary == null) { - throw new IllegalArgumentException("sequence dictionary may not be null"); - } - - final int contigIndex = sequenceDictionary.getSequenceIndex(interval.getContig()); - if ( contigIndex == -1 ) { - throw new IllegalArgumentException("Contig " + interval.getContig() + " not present in reads sequence " + - "dictionary"); - } - - return new QueryInterval(contigIndex, interval.getStart(), interval.getEnd()); - } - - @Override public boolean isSplitable(JobContext job, Path path) { - return true; - } + + try (FSDataInputStream in = fs.open(bamFile)) { + SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf); + SAMSequenceDictionary dict = header.getSequenceDictionary(); + BAMIndex idx = samReader.indexing().getIndex(); + + if (intervals != null && !intervals.isEmpty()) { + QueryInterval[] queryIntervals = prepareQueryIntervals(intervals, dict); + fileToSpan.put(bamFile, BAMFileReader.getFileSpan(queryIntervals, idx)); + } + + if (traverseUnplacedUnmapped) { + long startOfLastLinearBin = idx.getStartOfLastLinearBin(); + long noCoordinateCount = ((AbstractBAMFileIndex) idx).getNoCoordinateCount(); + if (startOfLastLinearBin != -1 && noCoordinateCount > 0) { + // add FileVirtualSplit (with no intervals) from startOfLastLinearBin to + // end of file + fileToUnmapped.put(bamFile, startOfLastLinearBin); + } + } + } + } + } + + // Use the chunks to filter the splits + List filteredSplits = new ArrayList<>(); + for (InputSplit split : splits) { + FileVirtualSplit virtualSplit = (FileVirtualSplit) split; + long splitStart = virtualSplit.getStartVirtualOffset(); + long splitEnd = virtualSplit.getEndVirtualOffset(); + BAMFileSpan splitSpan = new BAMFileSpan(new Chunk(splitStart, splitEnd)); + BAMFileSpan span = fileToSpan.get(virtualSplit.getPath()); + if (span == null) { + continue; + } + span = (BAMFileSpan) span.removeContentsBefore(splitSpan); + span = (BAMFileSpan) span.removeContentsAfter(splitSpan); + if (!span.getChunks().isEmpty()) { + filteredSplits.add( + new FileVirtualSplit( + virtualSplit.getPath(), + splitStart, + splitEnd, + virtualSplit.getLocations(), + span.toCoordinateArray())); + } + } + + if (traverseUnplacedUnmapped) { + // add extra splits that contain only unmapped reads + for (Map.Entry e : fileToUnmapped.entrySet()) { + Path file = e.getKey(); + long unmappedStart = e.getValue(); + boolean foundFirstSplit = false; + for (InputSplit split : splits) { // TODO: are splits in order of start position? 
+ FileVirtualSplit virtualSplit = (FileVirtualSplit) split; + if (virtualSplit.getPath().equals(file)) { + long splitStart = virtualSplit.getStartVirtualOffset(); + long splitEnd = virtualSplit.getEndVirtualOffset(); + if (foundFirstSplit) { + filteredSplits.add( + new FileVirtualSplit( + virtualSplit.getPath(), splitStart, splitEnd, virtualSplit.getLocations())); + } else if (splitStart <= unmappedStart && unmappedStart <= splitEnd) { + filteredSplits.add( + new FileVirtualSplit( + virtualSplit.getPath(), + unmappedStart, + splitEnd, + virtualSplit.getLocations())); + foundFirstSplit = true; + } + } + } + } + } + + return filteredSplits; + } + + @Override + public boolean isSplitable(JobContext job, Path path) { + return true; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BAMOutputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/BAMOutputFormat.java index 951f910..4002dae 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BAMOutputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BAMOutputFormat.java @@ -24,18 +24,17 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -/** Currently this only locks down the value type of the {@link +/** + * Currently this only locks down the value type of the {@link * org.apache.hadoop.mapreduce.OutputFormat}: contains no functionality. */ -public abstract class BAMOutputFormat - extends FileOutputFormat { - /** - * If set to true, write .splitting-bai files for every BAM file - * (defaults to false). - * A splitting BAI file (not to be confused with a regular BAI file) contains an - * index of offsets that the BAM file can be read from; they are used by - * {@link BAMInputFormat} to construct splits. - */ - public static final String WRITE_SPLITTING_BAI = - "hadoopbam.bam.write-splitting-bai"; +public abstract class BAMOutputFormat extends FileOutputFormat { + + /** + * If set to true, write .splitting-bai files for every BAM file (defaults to + * false). A splitting BAI file (not to be confused with a regular BAI file) contains + * an index of offsets that the BAM file can be read from; they are used by {@link BAMInputFormat} + * to construct splits. 
+ */ + public static final String WRITE_SPLITTING_BAI = "hadoopbam.bam.write-splitting-bai"; } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BAMRecordReader.java b/src/main/java/org/seqdoop/hadoop_bam/BAMRecordReader.java index 8767a4c..4c1ab7c 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BAMRecordReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BAMRecordReader.java @@ -26,17 +26,18 @@ import htsjdk.samtools.BAMFileSpan; import htsjdk.samtools.Chunk; import htsjdk.samtools.QueryInterval; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamFiles; import htsjdk.samtools.SamInputResource; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.Interval; import java.io.IOException; -import java.nio.file.Paths; import java.util.List; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -45,11 +46,6 @@ import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; - -import htsjdk.samtools.ValidationStringency; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; - import org.seqdoop.hadoop_bam.util.MurmurHash3; import org.seqdoop.hadoop_bam.util.NIOFileUtil; import org.seqdoop.hadoop_bam.util.SAMHeaderReader; @@ -57,177 +53,195 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** The key is the bitwise OR of the reference sequence ID in the upper 32 bits - * and the 0-based leftmost coordinate in the lower. +/** + * The key is the bitwise OR of the reference sequence ID in the upper 32 bits and the 0-based + * leftmost coordinate in the lower. */ -public class BAMRecordReader - extends RecordReader -{ - private static final Logger logger = LoggerFactory.getLogger(BAMRecordReader.class); - private final LongWritable key = new LongWritable(); - private final SAMRecordWritable record = new SAMRecordWritable(); - private BAMFileReader bamFileReader; - - private CloseableIterator iterator; - private boolean reachedEnd; - private WrapSeekable in; - private long fileStart; - private long virtualEnd; - private boolean isInitialized = false; - - /** Note: this is the only getKey function that handles unmapped reads - * specially! - */ - public static long getKey(final SAMRecord rec) { - final int refIdx = rec.getReferenceIndex(); - final int start = rec.getAlignmentStart(); - - if (!(rec.getReadUnmappedFlag() || refIdx < 0 || start < 0)) - return getKey(refIdx, start); - - // Put unmapped reads at the end, but don't give them all the exact same - // key so that they can be distributed to different reducers. - // - // A random number would probably be best, but to ensure that the same - // record always gets the same key we use a fast hash instead. - // - // We avoid using hashCode(), because it's not guaranteed to have the - // same value across different processes. - - int hash = 0; - byte[] var; - if ((var = rec.getVariableBinaryRepresentation()) != null) { - // Undecoded BAM record: just hash its raw data. - hash = (int)MurmurHash3.murmurhash3(var, hash); - } else { - // Decoded BAM record or any SAM record: hash a few representative - // fields together. 
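To make the comment above concrete: an unmapped read's key is derived only from the record's own fields, so every process computes the same key for the same read. A minimal sketch of the MurmurHash3 chaining used just below (the literal field values are invented for illustration):

// Illustrative only: chaining MurmurHash3 over representative fields gives a
// value that is stable across JVMs, unlike Object.hashCode().
int hash = 0;
hash = (int) MurmurHash3.murmurhash3("read0001", hash);                      // read name
hash = (int) MurmurHash3.murmurhash3(new byte[] {'A', 'C', 'G', 'T'}, hash); // read bases
// getKey0(Integer.MAX_VALUE, hash) then packs the hash into the low half of
// the key, placing unmapped reads after every mapped position.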
- hash = (int)MurmurHash3.murmurhash3(rec.getReadName(), hash); - hash = (int)MurmurHash3.murmurhash3(rec.getReadBases(), hash); - hash = (int)MurmurHash3.murmurhash3(rec.getBaseQualities(), hash); - hash = (int)MurmurHash3.murmurhash3(rec.getCigarString(), hash); - } - return getKey0(Integer.MAX_VALUE, hash); - } - - /** @param alignmentStart 1-based leftmost coordinate. */ - public static long getKey(int refIdx, int alignmentStart) { - return getKey0(refIdx, alignmentStart-1); - } - - /** @param alignmentStart0 0-based leftmost coordinate. */ - public static long getKey0(int refIdx, int alignmentStart0) { - return (long)refIdx << 32 | alignmentStart0; - } - - @Override public void initialize(InputSplit spl, TaskAttemptContext ctx) - throws IOException - { - // This method should only be called once (see Hadoop API). However, - // there seems to be disagreement between implementations that call - // initialize() and Hadoop-BAM's own code that relies on - // {@link BAMInputFormat} to call initialize() when the reader is - // created. Therefore we add this check for the time being. - if(isInitialized) - close(); - isInitialized = true; - reachedEnd = false; - - final Configuration conf = ctx.getConfiguration(); - - final FileVirtualSplit split = (FileVirtualSplit)spl; - final Path file = split.getPath(); - final FileSystem fs = file.getFileSystem(conf); - - ValidationStringency stringency = SAMHeaderReader.getValidationStringency(conf); - - java.nio.file.Path index = SamFiles.findIndex(NIOFileUtil.asPath(fs.makeQualified(file).toUri())); - Path fileIndex = index == null ? null : new Path(index.toUri()); - SeekableStream indexStream = fileIndex == null ? null : WrapSeekable.openPath(fs, fileIndex); - in = WrapSeekable.openPath(fs, file); - SamReader samReader = createSamReader(in, indexStream, stringency); - final SAMFileHeader header = samReader.getFileHeader(); - - long virtualStart = split.getStartVirtualOffset(); - - fileStart = virtualStart >>> 16; - virtualEnd = split.getEndVirtualOffset(); - - SamReader.PrimitiveSamReader primitiveSamReader = - ((SamReader.PrimitiveSamReaderToSamReaderAdapter) samReader).underlyingReader(); - bamFileReader = (BAMFileReader) primitiveSamReader; - - if (logger.isDebugEnabled()) { - final long recordStart = virtualStart & 0xffff; - logger.debug("Initialized BAMRecordReader; byte offset: {}, record offset: {}", - fileStart, recordStart); - } - - if (conf.getBoolean("hadoopbam.bam.keep-paired-reads-together", false)) { - throw new IllegalArgumentException("Property hadoopbam.bam.keep-paired-reads-together is no longer honored."); - } - - boolean boundedTraversal = BAMInputFormat.isBoundedTraversal(conf); - if (boundedTraversal && split.getIntervalFilePointers() != null) { - // return reads for intervals - List intervals = BAMInputFormat.getIntervals(conf); - QueryInterval[] queryIntervals = BAMInputFormat.prepareQueryIntervals(intervals, header.getSequenceDictionary()); - iterator = bamFileReader.createIndexIterator(queryIntervals, false, split.getIntervalFilePointers()); - } else if (boundedTraversal && split.getIntervalFilePointers() == null) { - // return unmapped reads - iterator = bamFileReader.queryUnmapped(); - } else { - // return everything - BAMFileSpan splitSpan = new BAMFileSpan(new Chunk(virtualStart, virtualEnd)); - iterator = bamFileReader.getIterator(splitSpan); - } - } - - private SamReader createSamReader(SeekableStream in, SeekableStream inIndex, - ValidationStringency stringency) { - SamReaderFactory readerFactory = 
SamReaderFactory.makeDefault() - .setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, true) - .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) - .setUseAsyncIo(false); - if (stringency != null) { - readerFactory.validationStringency(stringency); - } - SamInputResource resource = SamInputResource.of(in); - if (inIndex != null) { - resource.index(inIndex); - } - return readerFactory.open(resource); - } - - @Override public void close() throws IOException { - bamFileReader.close(); - } - - /** Unless the end has been reached, this only takes file position into - * account, not the position within the block. - */ - @Override public float getProgress() throws IOException { - if (reachedEnd) - return 1; - else { - final long filePos = in.position(); - final long fileEnd = virtualEnd >>> 16; - // Add 1 to the denominator to make sure it doesn't reach 1 here when - // filePos == fileEnd. - return (float)(filePos - fileStart) / (fileEnd - fileStart + 1); - } - } - @Override public LongWritable getCurrentKey () { return key; } - @Override public SAMRecordWritable getCurrentValue() { return record; } - - @Override public boolean nextKeyValue() { - if (!iterator.hasNext()) { - reachedEnd = true; - return false; - } - final SAMRecord r = iterator.next(); - key.set(getKey(r)); - record.set(r); - return true; - } +public class BAMRecordReader extends RecordReader { + + private static final Logger logger = LoggerFactory.getLogger(BAMRecordReader.class); + private final LongWritable key = new LongWritable(); + private final SAMRecordWritable record = new SAMRecordWritable(); + private BAMFileReader bamFileReader; + + private CloseableIterator iterator; + private boolean reachedEnd; + private WrapSeekable in; + private long fileStart; + private long virtualEnd; + private boolean isInitialized = false; + + /** Note: this is the only getKey function that handles unmapped reads specially! */ + public static long getKey(final SAMRecord rec) { + final int refIdx = rec.getReferenceIndex(); + final int start = rec.getAlignmentStart(); + + if (!(rec.getReadUnmappedFlag() || refIdx < 0 || start < 0)) { + return getKey(refIdx, start); + } + + // Put unmapped reads at the end, but don't give them all the exact same + // key so that they can be distributed to different reducers. + // + // A random number would probably be best, but to ensure that the same + // record always gets the same key we use a fast hash instead. + // + // We avoid using hashCode(), because it's not guaranteed to have the + // same value across different processes. + + int hash = 0; + byte[] var; + if ((var = rec.getVariableBinaryRepresentation()) != null) { + // Undecoded BAM record: just hash its raw data. + hash = (int) MurmurHash3.murmurhash3(var, hash); + } else { + // Decoded BAM record or any SAM record: hash a few representative + // fields together. + hash = (int) MurmurHash3.murmurhash3(rec.getReadName(), hash); + hash = (int) MurmurHash3.murmurhash3(rec.getReadBases(), hash); + hash = (int) MurmurHash3.murmurhash3(rec.getBaseQualities(), hash); + hash = (int) MurmurHash3.murmurhash3(rec.getCigarString(), hash); + } + return getKey0(Integer.MAX_VALUE, hash); + } + + /** @param alignmentStart 1-based leftmost coordinate. */ + public static long getKey(int refIdx, int alignmentStart) { + return getKey0(refIdx, alignmentStart - 1); + } + + /** @param alignmentStart0 0-based leftmost coordinate. 
*/ + public static long getKey0(int refIdx, int alignmentStart0) { + return (long) refIdx << 32 | alignmentStart0; + } + + @Override + public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { + // This method should only be called once (see Hadoop API). However, + // there seems to be disagreement between implementations that call + // initialize() and Hadoop-BAM's own code that relies on + // {@link BAMInputFormat} to call initialize() when the reader is + // created. Therefore we add this check for the time being. + if (isInitialized) { + close(); + } + isInitialized = true; + reachedEnd = false; + + final Configuration conf = ctx.getConfiguration(); + + final FileVirtualSplit split = (FileVirtualSplit) spl; + final Path file = split.getPath(); + final FileSystem fs = file.getFileSystem(conf); + + ValidationStringency stringency = SAMHeaderReader.getValidationStringency(conf); + + java.nio.file.Path index = + SamFiles.findIndex(NIOFileUtil.asPath(fs.makeQualified(file).toUri())); + Path fileIndex = index == null ? null : new Path(index.toUri()); + SeekableStream indexStream = fileIndex == null ? null : WrapSeekable.openPath(fs, fileIndex); + in = WrapSeekable.openPath(fs, file); + SamReader samReader = createSamReader(in, indexStream, stringency); + final SAMFileHeader header = samReader.getFileHeader(); + + long virtualStart = split.getStartVirtualOffset(); + + fileStart = virtualStart >>> 16; + virtualEnd = split.getEndVirtualOffset(); + + SamReader.PrimitiveSamReader primitiveSamReader = + ((SamReader.PrimitiveSamReaderToSamReaderAdapter) samReader).underlyingReader(); + bamFileReader = (BAMFileReader) primitiveSamReader; + + if (logger.isDebugEnabled()) { + final long recordStart = virtualStart & 0xffff; + logger.debug( + "Initialized BAMRecordReader; byte offset: {}, record offset: {}", + fileStart, + recordStart); + } + + if (conf.getBoolean("hadoopbam.bam.keep-paired-reads-together", false)) { + throw new IllegalArgumentException( + "Property hadoopbam.bam.keep-paired-reads-together is no longer honored."); + } + + boolean boundedTraversal = BAMInputFormat.isBoundedTraversal(conf); + if (boundedTraversal && split.getIntervalFilePointers() != null) { + // return reads for intervals + List intervals = BAMInputFormat.getIntervals(conf); + QueryInterval[] queryIntervals = + BAMInputFormat.prepareQueryIntervals(intervals, header.getSequenceDictionary()); + iterator = + bamFileReader.createIndexIterator(queryIntervals, false, split.getIntervalFilePointers()); + } else if (boundedTraversal && split.getIntervalFilePointers() == null) { + // return unmapped reads + iterator = bamFileReader.queryUnmapped(); + } else { + // return everything + BAMFileSpan splitSpan = new BAMFileSpan(new Chunk(virtualStart, virtualEnd)); + iterator = bamFileReader.getIterator(splitSpan); + } + } + + private SamReader createSamReader( + SeekableStream in, SeekableStream inIndex, ValidationStringency stringency) { + SamReaderFactory readerFactory = + SamReaderFactory.makeDefault() + .setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, true) + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setUseAsyncIo(false); + if (stringency != null) { + readerFactory.validationStringency(stringency); + } + SamInputResource resource = SamInputResource.of(in); + if (inIndex != null) { + resource.index(inIndex); + } + return readerFactory.open(resource); + } + + @Override + public void close() throws IOException { + bamFileReader.close(); + } + + /** + * Unless the end has been 
reached, this only takes file position into account, not the position + * within the block. + */ + @Override + public float getProgress() throws IOException { + if (reachedEnd) { + return 1; + } else { + final long filePos = in.position(); + final long fileEnd = virtualEnd >>> 16; + // Add 1 to the denominator to make sure it doesn't reach 1 here when + // filePos == fileEnd. + return (float) (filePos - fileStart) / (fileEnd - fileStart + 1); + } + } + + @Override + public LongWritable getCurrentKey() { + return key; + } + + @Override + public SAMRecordWritable getCurrentValue() { + return record; + } + + @Override + public boolean nextKeyValue() { + if (!iterator.hasNext()) { + reachedEnd = true; + return false; + } + final SAMRecord r = iterator.next(); + key.set(getKey(r)); + record.set(r); + return true; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BAMRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/BAMRecordWriter.java index 5a8f3da..9085d01 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BAMRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BAMRecordWriter.java @@ -22,11 +22,6 @@ package org.seqdoop.hadoop_bam; -import java.io.IOException; -import java.io.OutputStream; -import java.io.StringWriter; -import java.io.Writer; - import htsjdk.samtools.BAMRecordCodec; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; @@ -35,122 +30,113 @@ import htsjdk.samtools.SAMTextHeaderCodec; import htsjdk.samtools.util.BinaryCodec; import htsjdk.samtools.util.BlockCompressedOutputStream; - +import java.io.IOException; +import java.io.OutputStream; +import java.io.StringWriter; +import java.io.Writer; import java.nio.charset.Charset; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; - import org.seqdoop.hadoop_bam.util.SAMHeaderReader; -/** A base {@link RecordWriter} for BAM records. +/** + * A base {@link RecordWriter} for BAM records. * - *

<p>Handles the output stream, writing the header if requested, and provides - * the {@link #writeAlignment} function for subclasses.</p>
+ *
+ * <p>
Handles the output stream, writing the header if requested, and provides the {@link + * #writeAlignment} function for subclasses. */ -public abstract class BAMRecordWriter - extends RecordWriter -{ - private OutputStream origOutput; - private BinaryCodec binaryCodec; - private BAMRecordCodec recordCodec; - private BlockCompressedOutputStream compressedOut; - private SplittingBAMIndexer splittingBAMIndexer; - - /** A SAMFileHeader is read from the input Path. */ - public BAMRecordWriter( - Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) - throws IOException - { - init( - output, - SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()), - writeHeader, ctx); - if (ctx.getConfiguration().getBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, false)) { - Path splittingIndex = BAMInputFormat.getIdxPath(output); - OutputStream splittingIndexOutput = - output.getFileSystem(ctx.getConfiguration()).create(splittingIndex); - splittingBAMIndexer = new SplittingBAMIndexer(splittingIndexOutput); - } - } - public BAMRecordWriter( - Path output, SAMFileHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - init( - output.getFileSystem(ctx.getConfiguration()).create(output), - header, writeHeader); - if (ctx.getConfiguration().getBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, false)) { - Path splittingIndex = BAMInputFormat.getIdxPath(output); - OutputStream splittingIndexOutput = - output.getFileSystem(ctx.getConfiguration()).create(splittingIndex); - splittingBAMIndexer = new SplittingBAMIndexer(splittingIndexOutput); - } - } - - // Working around not being able to call a constructor other than as the - // first statement... - private void init( - Path output, SAMFileHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - init( - output.getFileSystem(ctx.getConfiguration()).create(output), - header, writeHeader); - } - private void init( - OutputStream output, SAMFileHeader header, boolean writeHeader) - throws IOException - { - origOutput = output; - - compressedOut = new BlockCompressedOutputStream(origOutput, null); - - binaryCodec = new BinaryCodec(compressedOut); - recordCodec = new BAMRecordCodec(header); - recordCodec.setOutputStream(compressedOut); - - if (writeHeader) - this.writeHeader(header); - } - - @Override public void close(TaskAttemptContext ctx) throws IOException { - // Don't close the codec, we don't want BlockCompressedOutputStream's - // file terminator to be output. But do flush the stream. - binaryCodec.getOutputStream().flush(); - - // Finish indexer with file length - if (splittingBAMIndexer != null) { - splittingBAMIndexer.finish(compressedOut.getFilePointer() >> 16); - } - - // And close the original output. 
- origOutput.close(); - } - - protected void writeAlignment(final SAMRecord rec) throws IOException { - if (splittingBAMIndexer != null) { - splittingBAMIndexer.processAlignment(compressedOut.getFilePointer()); - } - recordCodec.encode(rec); - } - - private void writeHeader(final SAMFileHeader header) { - binaryCodec.writeBytes("BAM\001".getBytes(Charset.forName("UTF8"))); - - final Writer sw = new StringWriter(); - new SAMTextHeaderCodec().encode(sw, header); - - binaryCodec.writeString(sw.toString(), true, false); - - final SAMSequenceDictionary dict = header.getSequenceDictionary(); - - binaryCodec.writeInt(dict.size()); - for (final SAMSequenceRecord rec : dict.getSequences()) { - binaryCodec.writeString(rec.getSequenceName(), true, true); - binaryCodec.writeInt (rec.getSequenceLength()); - } - } +public abstract class BAMRecordWriter extends RecordWriter { + + private OutputStream origOutput; + private BinaryCodec binaryCodec; + private BAMRecordCodec recordCodec; + private BlockCompressedOutputStream compressedOut; + private SplittingBAMIndexer splittingBAMIndexer; + + /** A SAMFileHeader is read from the input Path. */ + public BAMRecordWriter(Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + init( + output, SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()), writeHeader, ctx); + if (ctx.getConfiguration().getBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, false)) { + Path splittingIndex = BAMInputFormat.getIdxPath(output); + OutputStream splittingIndexOutput = + output.getFileSystem(ctx.getConfiguration()).create(splittingIndex); + splittingBAMIndexer = new SplittingBAMIndexer(splittingIndexOutput); + } + } + + public BAMRecordWriter( + Path output, SAMFileHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + init(output.getFileSystem(ctx.getConfiguration()).create(output), header, writeHeader); + if (ctx.getConfiguration().getBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, false)) { + Path splittingIndex = BAMInputFormat.getIdxPath(output); + OutputStream splittingIndexOutput = + output.getFileSystem(ctx.getConfiguration()).create(splittingIndex); + splittingBAMIndexer = new SplittingBAMIndexer(splittingIndexOutput); + } + } + + // Working around not being able to call a constructor other than as the + // first statement... + private void init(Path output, SAMFileHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + init(output.getFileSystem(ctx.getConfiguration()).create(output), header, writeHeader); + } + + private void init(OutputStream output, SAMFileHeader header, boolean writeHeader) + throws IOException { + origOutput = output; + + compressedOut = new BlockCompressedOutputStream(origOutput, null); + + binaryCodec = new BinaryCodec(compressedOut); + recordCodec = new BAMRecordCodec(header); + recordCodec.setOutputStream(compressedOut); + + if (writeHeader) { + this.writeHeader(header); + } + } + + @Override + public void close(TaskAttemptContext ctx) throws IOException { + // Don't close the codec, we don't want BlockCompressedOutputStream's + // file terminator to be output. But do flush the stream. + binaryCodec.getOutputStream().flush(); + + // Finish indexer with file length + if (splittingBAMIndexer != null) { + splittingBAMIndexer.finish(compressedOut.getFilePointer() >> 16); + } + + // And close the original output. 
+ origOutput.close(); + } + + protected void writeAlignment(final SAMRecord rec) throws IOException { + if (splittingBAMIndexer != null) { + splittingBAMIndexer.processAlignment(compressedOut.getFilePointer()); + } + recordCodec.encode(rec); + } + + private void writeHeader(final SAMFileHeader header) { + binaryCodec.writeBytes("BAM\001".getBytes(Charset.forName("UTF8"))); + + final Writer sw = new StringWriter(); + new SAMTextHeaderCodec().encode(sw, header); + + binaryCodec.writeString(sw.toString(), true, false); + + final SAMSequenceDictionary dict = header.getSequenceDictionary(); + + binaryCodec.writeInt(dict.size()); + for (final SAMSequenceRecord rec : dict.getSequences()) { + binaryCodec.writeString(rec.getSequenceName(), true, true); + binaryCodec.writeInt(rec.getSequenceLength()); + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BAMSplitGuesser.java b/src/main/java/org/seqdoop/hadoop_bam/BAMSplitGuesser.java index 7e1bd7f..91fecc7 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BAMSplitGuesser.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BAMSplitGuesser.java @@ -23,379 +23,373 @@ package org.seqdoop.hadoop_bam; import htsjdk.samtools.BAMFileSpan; +import htsjdk.samtools.BAMRecordCodec; +import htsjdk.samtools.FileTruncatedException; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileSpan; +import htsjdk.samtools.SAMFormatException; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordHelper; import htsjdk.samtools.SamInputResource; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.seekablestream.ByteArraySeekableStream; -import java.io.InputStream; +import htsjdk.samtools.seekablestream.SeekableStream; +import htsjdk.samtools.util.BlockCompressedInputStream; +import htsjdk.samtools.util.RuntimeEOFException; +import htsjdk.samtools.util.RuntimeIOException; import java.io.IOException; +import java.io.InputStream; import java.util.Arrays; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.util.GenericOptionsParser; - -import htsjdk.samtools.BAMRecordCodec; -import htsjdk.samtools.FileTruncatedException; -import htsjdk.samtools.SAMFormatException; -import htsjdk.samtools.seekablestream.SeekableStream; -import htsjdk.samtools.util.BlockCompressedInputStream; -import htsjdk.samtools.util.RuntimeEOFException; -import htsjdk.samtools.util.RuntimeIOException; - import org.seqdoop.hadoop_bam.util.SAMHeaderReader; import org.seqdoop.hadoop_bam.util.WrapSeekable; -/** A class for heuristically finding BAM record positions inside an area of - * a BAM file. - */ +/** A class for heuristically finding BAM record positions inside an area of a BAM file. */ public class BAMSplitGuesser extends BaseSplitGuesser { - private SeekableStream inFile; - private BlockCompressedInputStream bgzf; - private final BAMRecordCodec bamCodec; - private final int referenceSequenceCount; - private final SAMFileHeader header; - - // We want to go through this many BGZF blocks fully, checking that they - // contain valid BAM records, when guessing a BAM record position. - private final static byte BLOCKS_NEEDED_FOR_GUESS = 3; - - // Since the max size of a BGZF block is 0xffff (64K), and we might be just - // one byte off from the start of the previous one, we need 0xfffe bytes for - // the start, and then 0xffff times the number of blocks we want to go - // through. 
- private final static int MAX_BYTES_READ = - BLOCKS_NEEDED_FOR_GUESS * 0xffff + 0xfffe; - - private final static int SHORTEST_POSSIBLE_BAM_RECORD = 4*9 + 1 + 1 + 1; - - /** The stream must point to a valid BAM file, because the header is read - * from it. - */ - public BAMSplitGuesser( - SeekableStream ss, Configuration conf) - throws IOException - { - this(ss, ss, conf); - - // Secondary check that the header points to a BAM file: Picard can get - // things wrong due to its autodetection. - ss.seek(0); - if (ss.read(buf.array(), 0, 4) != 4 || buf.getInt(0) != BGZF_MAGIC) - throw new SAMFormatException("Does not seem like a BAM file"); - } - - public BAMSplitGuesser( - SeekableStream ss, InputStream headerStream, Configuration conf) - throws IOException - { - inFile = ss; - - header = SAMHeaderReader.readSAMHeaderFrom(headerStream, conf); - referenceSequenceCount = header.getSequenceDictionary().size(); - - bamCodec = new BAMRecordCodec(null, new LazyBAMRecordFactory()); - } - - /** Finds a virtual BAM record position in the physical position range - * [beg,end). Returns end if no BAM record was found. - */ - public long guessNextBAMRecordStart(long beg, long end) - throws IOException - { - // Use a reader to skip through the headers at the beginning of a BAM file, since - // the headers may exceed MAX_BYTES_READ in length. Don't close the reader - // otherwise it will close the underlying stream, which we continue to read from - // on subsequent calls to this method. - if (beg == 0) { - this.inFile.seek(beg); - SamReader open = SamReaderFactory.makeDefault().setUseAsyncIo(false) - .open(SamInputResource.of(inFile)); - SAMFileSpan span = open.indexing().getFilePointerSpanningReads(); - if (span instanceof BAMFileSpan) { - return ((BAMFileSpan) span).getFirstOffset(); - } - } - - // Buffer what we need to go through. - - byte[] arr = new byte[MAX_BYTES_READ]; - - this.inFile.seek(beg); - int totalRead = 0; - for (int left = Math.min((int)(end - beg), arr.length); left > 0;) { - final int r = inFile.read(arr, totalRead, left); - if (r < 0) - break; - totalRead += r; - left -= r; - } - arr = Arrays.copyOf(arr, totalRead); - - this.in = new ByteArraySeekableStream(arr); - - this.bgzf = new BlockCompressedInputStream(this.in); - this.bgzf.setCheckCrcs(true); - - this.bamCodec.setInputStream(bgzf); - - final int firstBGZFEnd = Math.min((int)(end - beg), 0xffff); - - // cp: Compressed Position, indexes the entire BGZF input. - for (int cp = 0;; ++cp) { - final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd); - if (psz == null) - return end; - - final int cp0 = cp = psz.pos; - final long cp0Virt = (long)cp0 << 16; - try { - bgzf.seek(cp0Virt); - - // This has to catch Throwable, because it's possible to get an - // OutOfMemoryError due to an overly large size. - } catch (Throwable e) { - // Guessed BGZF position incorrectly: try the next guess. - continue; - } - - // up: Uncompressed Position, indexes the data inside the BGZF block. - for (int up = 0;; ++up) { - final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size); - - if (up0 < 0) { - // No BAM records found in the BGZF block: try the next BGZF - // block. - break; - } - - // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth - // of records starting at (cp0,up0). 
- bgzf.seek(cp0Virt | up0); - boolean decodedAny = false; - try { - byte b = 0; - int prevCP = cp0; - while (b < BLOCKS_NEEDED_FOR_GUESS) - { - SAMRecord record = bamCodec.decode(); - if (record == null) { - break; - } - record.setHeaderStrict(header); - SAMRecordHelper.eagerDecode(record); // force decoding of fields - decodedAny = true; - - final int cp2 = (int)(bgzf.getFilePointer() >>> 16); - if (cp2 != prevCP) { - // The compressed position changed so we must be in a new - // block. - assert cp2 > prevCP; - prevCP = cp2; - ++b; - } - } - - // Running out of records to verify is fine as long as we - // verified at least something. It should only happen if we - // couldn't fill the array. - if (b < BLOCKS_NEEDED_FOR_GUESS) { - assert arr.length < MAX_BYTES_READ; - if (!decodedAny) - continue; - } - } - catch (SAMFormatException e) { continue; } - catch (OutOfMemoryError e) { continue; } - catch (IllegalArgumentException e) { continue; } - catch (IndexOutOfBoundsException e) { continue; } - catch (RuntimeIOException e) { continue; } - // EOF can happen legitimately if the [beg,end) range is too - // small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut - // off in the middle of a record. In that case, our stream - // should have hit EOF as well. If we've then verified at least - // something, go ahead with it and hope for the best. - catch (FileTruncatedException e) { - if (!decodedAny && this.in.eof()) - continue; - } - catch (RuntimeEOFException e) { - if (!decodedAny && this.in.eof()) - continue; - } - - return beg+cp0 << 16 | up0; - } - } - } - - private int guessNextBAMPos(long cpVirt, int up, int cSize) { - // What we're actually searching for is what's at offset [4], not [0]. So - // skip ahead by 4, thus ensuring that whenever we find a valid [0] it's - // at position up or greater. - up += 4; - - try { - while (up + SHORTEST_POSSIBLE_BAM_RECORD - 4 < cSize) { - bgzf.seek(cpVirt | up); - IOUtils.readFully(bgzf, buf.array(), 0, 8); - - // If the first two checks fail we have what looks like a valid - // reference sequence ID. Assume we're at offset [4] or [24], i.e. - // the ID of either this read or its mate, respectively. So check - // the next integer ([8] or [28]) to make sure it's a 0-based - // leftmost coordinate. - final int id = buf.getInt(0); - final int pos = buf.getInt(4); - if (id < -1 || id > referenceSequenceCount || pos < -1) { - ++up; - continue; - } - - // Okay, we could be at [4] or [24]. Assuming we're at [4], check - // that [24] is valid. Assume [4] because we should hit it first: - // the only time we expect to hit [24] is at the beginning of the - // split, as part of the first read we should skip. - - bgzf.seek(cpVirt | up+20); - IOUtils.readFully(bgzf, buf.array(), 0, 8); - - final int nid = buf.getInt(0); - final int npos = buf.getInt(4); - if (nid < -1 || nid > referenceSequenceCount || npos < -1) { - ++up; - continue; - } - - // So far so good: [4] and [24] seem okay. Now do something a bit - // more involved: make sure that [36 + [12]&0xff - 1] == 0: that - // is, the name of the read should be null terminated. - - // Move up to 0 just to make it less likely that we get confused - // with offsets. Remember where we should continue from if we - // reject this up. 
- final int nextUP = up + 1; - up -= 4; - - bgzf.seek(cpVirt | up+12); - IOUtils.readFully(bgzf, buf.array(), 0, 4); - - final int nameLength = buf.getInt(0) & 0xff; - if (nameLength < 1) { - // Names are null-terminated so length must be at least one - up = nextUP; - continue; - } - - final int nullTerminator = up + 36 + nameLength-1; - - if (nullTerminator >= cSize) { - // This BAM record can't fit here. But maybe there's another in - // the remaining space, so try again. - up = nextUP; - continue; - } - - bgzf.seek(cpVirt | nullTerminator); - IOUtils.readFully(bgzf, buf.array(), 0, 1); - - if (buf.get(0) != 0) { - up = nextUP; - continue; - } - - // All of [4], [24], and [36 + [12]&0xff] look good. If [0] is also - // sensible, that's good enough for us. "Sensible" to us means the - // following: - // - // [0] >= 4*([16]&0xffff) + [20] + ([20]+1)/2 + 4*8 + ([12]&0xff) - - // Note that [0] is "length of the _remainder_ of the alignment - // record", which is why this uses 4*8 instead of 4*9. - int zeroMin = 4*8 + nameLength; - - bgzf.seek(cpVirt | up+16); - IOUtils.readFully(bgzf, buf.array(), 0, 8); - - zeroMin += (buf.getInt(0) & 0xffff) * 4; - zeroMin += buf.getInt(4) + (buf.getInt(4)+1)/2; - - bgzf.seek(cpVirt | up); - IOUtils.readFully(bgzf, buf.array(), 0, 4); - - if (buf.getInt(0) < zeroMin) { - up = nextUP; - continue; - } - return up; - } - } catch (IOException e) {} - return -1; - } - - public static void main(String[] args) throws IOException { - final GenericOptionsParser parser; - try { - parser = new GenericOptionsParser(args); - - // This should be IOException but Hadoop 0.20.2 doesn't throw it... - } catch (Exception e) { - System.err.printf("Error in Hadoop arguments: %s\n", e.getMessage()); - System.exit(1); - - // Hooray for javac - return; - } - - args = parser.getRemainingArgs(); - final Configuration conf = parser.getConfiguration(); - - long beg = 0; - - if (args.length < 2 || args.length > 3) { - System.err.println( - "Usage: BAMSplitGuesser path-or-uri header-path-or-uri [beg]"); - System.exit(2); - } - - try { - if (args.length > 2) beg = Long.decode(args[2]); - } catch (NumberFormatException e) { - System.err.println("Invalid beg offset."); - if (e.getMessage() != null) - System.err.println(e.getMessage()); - System.exit(2); - } - - SeekableStream ss = WrapSeekable.openPath(conf, new Path(args[0])); - SeekableStream hs = WrapSeekable.openPath(conf, new Path(args[1])); - - final long end = beg + MAX_BYTES_READ; - - System.out.printf( - "Will look for a BGZF block within: [%1$#x,%2$#x) = [%1$d,%2$d)\n"+ - "Will then verify BAM data within: [%1$#x,%3$#x) = [%1$d,%3$d)\n", - beg, beg + 0xffff, end); - - final long g = - new BAMSplitGuesser(ss, hs, conf).guessNextBAMRecordStart(beg, end); - - ss.close(); - - if (g == end) { - System.out.println( - "Didn't find any acceptable BAM record in any BGZF block."); - System.exit(1); - } - - System.out.printf( - "Accepted BGZF block at offset %1$#x (%1$d).\n"+ - "Accepted BAM record at offset %2$#x (%2$d) therein.\n", - g >> 16, g & 0xffff); - } + + // We want to go through this many BGZF blocks fully, checking that they + // contain valid BAM records, when guessing a BAM record position. + private static final byte BLOCKS_NEEDED_FOR_GUESS = 3; + // Since the max size of a BGZF block is 0xffff (64K), and we might be just + // one byte off from the start of the previous one, we need 0xfffe bytes for + // the start, and then 0xffff times the number of blocks we want to go + // through. 
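For scale, the constant declared on the next line is small and fixed; evaluating the expression is plain arithmetic, shown here only for orientation:

// MAX_BYTES_READ = BLOCKS_NEEDED_FOR_GUESS * 0xffff + 0xfffe
//                = 3 * 65_535 + 65_534
//                = 262_139 bytes, i.e. just under 256 KiB buffered per guess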
+ private static final int MAX_BYTES_READ = BLOCKS_NEEDED_FOR_GUESS * 0xffff + 0xfffe; + private static final int SHORTEST_POSSIBLE_BAM_RECORD = 4 * 9 + 1 + 1 + 1; + private final BAMRecordCodec bamCodec; + private final int referenceSequenceCount; + private final SAMFileHeader header; + private SeekableStream inFile; + private BlockCompressedInputStream bgzf; + + /** The stream must point to a valid BAM file, because the header is read from it. */ + public BAMSplitGuesser(SeekableStream ss, Configuration conf) throws IOException { + this(ss, ss, conf); + + // Secondary check that the header points to a BAM file: Picard can get + // things wrong due to its autodetection. + ss.seek(0); + if (ss.read(buf.array(), 0, 4) != 4 || buf.getInt(0) != BGZF_MAGIC) { + throw new SAMFormatException("Does not seem like a BAM file"); + } + } + + public BAMSplitGuesser(SeekableStream ss, InputStream headerStream, Configuration conf) + throws IOException { + inFile = ss; + + header = SAMHeaderReader.readSAMHeaderFrom(headerStream, conf); + referenceSequenceCount = header.getSequenceDictionary().size(); + + bamCodec = new BAMRecordCodec(null, new LazyBAMRecordFactory()); + } + + public static void main(String[] args) throws IOException { + final GenericOptionsParser parser; + try { + parser = new GenericOptionsParser(args); + + // This should be IOException but Hadoop 0.20.2 doesn't throw it... + } catch (Exception e) { + System.err.printf("Error in Hadoop arguments: %s\n", e.getMessage()); + System.exit(1); + + // Hooray for javac + return; + } + + args = parser.getRemainingArgs(); + final Configuration conf = parser.getConfiguration(); + + long beg = 0; + + if (args.length < 2 || args.length > 3) { + System.err.println("Usage: BAMSplitGuesser path-or-uri header-path-or-uri [beg]"); + System.exit(2); + } + + try { + if (args.length > 2) { + beg = Long.decode(args[2]); + } + } catch (NumberFormatException e) { + System.err.println("Invalid beg offset."); + if (e.getMessage() != null) { + System.err.println(e.getMessage()); + } + System.exit(2); + } + + SeekableStream ss = WrapSeekable.openPath(conf, new Path(args[0])); + SeekableStream hs = WrapSeekable.openPath(conf, new Path(args[1])); + + final long end = beg + MAX_BYTES_READ; + + System.out.printf( + "Will look for a BGZF block within: [%1$#x,%2$#x) = [%1$d,%2$d)\n" + + "Will then verify BAM data within: [%1$#x,%3$#x) = [%1$d,%3$d)\n", + beg, beg + 0xffff, end); + + final long g = new BAMSplitGuesser(ss, hs, conf).guessNextBAMRecordStart(beg, end); + + ss.close(); + + if (g == end) { + System.out.println("Didn't find any acceptable BAM record in any BGZF block."); + System.exit(1); + } + + System.out.printf( + "Accepted BGZF block at offset %1$#x (%1$d).\n" + + "Accepted BAM record at offset %2$#x (%2$d) therein.\n", + g >> 16, g & 0xffff); + } + + /** + * Finds a virtual BAM record position in the physical position range [beg,end). Returns end if no + * BAM record was found. + */ + public long guessNextBAMRecordStart(long beg, long end) throws IOException { + // Use a reader to skip through the headers at the beginning of a BAM file, since + // the headers may exceed MAX_BYTES_READ in length. Don't close the reader + // otherwise it will close the underlying stream, which we continue to read from + // on subsequent calls to this method. 
+ if (beg == 0) { + this.inFile.seek(beg); + SamReader open = + SamReaderFactory.makeDefault().setUseAsyncIo(false).open(SamInputResource.of(inFile)); + SAMFileSpan span = open.indexing().getFilePointerSpanningReads(); + if (span instanceof BAMFileSpan) { + return ((BAMFileSpan) span).getFirstOffset(); + } + } + + // Buffer what we need to go through. + + byte[] arr = new byte[MAX_BYTES_READ]; + + this.inFile.seek(beg); + int totalRead = 0; + for (int left = Math.min((int) (end - beg), arr.length); left > 0; ) { + final int r = inFile.read(arr, totalRead, left); + if (r < 0) { + break; + } + totalRead += r; + left -= r; + } + arr = Arrays.copyOf(arr, totalRead); + + this.in = new ByteArraySeekableStream(arr); + + this.bgzf = new BlockCompressedInputStream(this.in); + this.bgzf.setCheckCrcs(true); + + this.bamCodec.setInputStream(bgzf); + + final int firstBGZFEnd = Math.min((int) (end - beg), 0xffff); + + // cp: Compressed Position, indexes the entire BGZF input. + for (int cp = 0; ; ++cp) { + final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd); + if (psz == null) { + return end; + } + + final int cp0 = cp = psz.pos; + final long cp0Virt = (long) cp0 << 16; + try { + bgzf.seek(cp0Virt); + + // This has to catch Throwable, because it's possible to get an + // OutOfMemoryError due to an overly large size. + } catch (Throwable e) { + // Guessed BGZF position incorrectly: try the next guess. + continue; + } + + // up: Uncompressed Position, indexes the data inside the BGZF block. + for (int up = 0; ; ++up) { + final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size); + + if (up0 < 0) { + // No BAM records found in the BGZF block: try the next BGZF + // block. + break; + } + + // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth + // of records starting at (cp0,up0). + bgzf.seek(cp0Virt | up0); + boolean decodedAny = false; + try { + byte b = 0; + int prevCP = cp0; + while (b < BLOCKS_NEEDED_FOR_GUESS) { + SAMRecord record = bamCodec.decode(); + if (record == null) { + break; + } + record.setHeaderStrict(header); + SAMRecordHelper.eagerDecode(record); // force decoding of fields + decodedAny = true; + + final int cp2 = (int) (bgzf.getFilePointer() >>> 16); + if (cp2 != prevCP) { + // The compressed position changed so we must be in a new + // block. + assert cp2 > prevCP; + prevCP = cp2; + ++b; + } + } + + // Running out of records to verify is fine as long as we + // verified at least something. It should only happen if we + // couldn't fill the array. + if (b < BLOCKS_NEEDED_FOR_GUESS) { + assert arr.length < MAX_BYTES_READ; + if (!decodedAny) { + continue; + } + } + } catch (SAMFormatException e) { + continue; + } catch (OutOfMemoryError e) { + continue; + } catch (IllegalArgumentException e) { + continue; + } catch (IndexOutOfBoundsException e) { + continue; + } catch (RuntimeIOException e) { + continue; + } + // EOF can happen legitimately if the [beg,end) range is too + // small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut + // off in the middle of a record. In that case, our stream + // should have hit EOF as well. If we've then verified at least + // something, go ahead with it and hope for the best. 
+ catch (FileTruncatedException e) { + if (!decodedAny && this.in.eof()) { + continue; + } + } catch (RuntimeEOFException e) { + if (!decodedAny && this.in.eof()) { + continue; + } + } + + return beg + cp0 << 16 | up0; + } + } + } + + private int guessNextBAMPos(long cpVirt, int up, int cSize) { + // What we're actually searching for is what's at offset [4], not [0]. So + // skip ahead by 4, thus ensuring that whenever we find a valid [0] it's + // at position up or greater. + up += 4; + + try { + while (up + SHORTEST_POSSIBLE_BAM_RECORD - 4 < cSize) { + bgzf.seek(cpVirt | up); + IOUtils.readFully(bgzf, buf.array(), 0, 8); + + // If the first two checks fail we have what looks like a valid + // reference sequence ID. Assume we're at offset [4] or [24], i.e. + // the ID of either this read or its mate, respectively. So check + // the next integer ([8] or [28]) to make sure it's a 0-based + // leftmost coordinate. + final int id = buf.getInt(0); + final int pos = buf.getInt(4); + if (id < -1 || id > referenceSequenceCount || pos < -1) { + ++up; + continue; + } + + // Okay, we could be at [4] or [24]. Assuming we're at [4], check + // that [24] is valid. Assume [4] because we should hit it first: + // the only time we expect to hit [24] is at the beginning of the + // split, as part of the first read we should skip. + + bgzf.seek(cpVirt | up + 20); + IOUtils.readFully(bgzf, buf.array(), 0, 8); + + final int nid = buf.getInt(0); + final int npos = buf.getInt(4); + if (nid < -1 || nid > referenceSequenceCount || npos < -1) { + ++up; + continue; + } + + // So far so good: [4] and [24] seem okay. Now do something a bit + // more involved: make sure that [36 + [12]&0xff - 1] == 0: that + // is, the name of the read should be null terminated. + + // Move up to 0 just to make it less likely that we get confused + // with offsets. Remember where we should continue from if we + // reject this up. + final int nextUP = up + 1; + up -= 4; + + bgzf.seek(cpVirt | up + 12); + IOUtils.readFully(bgzf, buf.array(), 0, 4); + + final int nameLength = buf.getInt(0) & 0xff; + if (nameLength < 1) { + // Names are null-terminated so length must be at least one + up = nextUP; + continue; + } + + final int nullTerminator = up + 36 + nameLength - 1; + + if (nullTerminator >= cSize) { + // This BAM record can't fit here. But maybe there's another in + // the remaining space, so try again. + up = nextUP; + continue; + } + + bgzf.seek(cpVirt | nullTerminator); + IOUtils.readFully(bgzf, buf.array(), 0, 1); + + if (buf.get(0) != 0) { + up = nextUP; + continue; + } + + // All of [4], [24], and [36 + [12]&0xff] look good. If [0] is also + // sensible, that's good enough for us. "Sensible" to us means the + // following: + // + // [0] >= 4*([16]&0xffff) + [20] + ([20]+1)/2 + 4*8 + ([12]&0xff) + + // Note that [0] is "length of the _remainder_ of the alignment + // record", which is why this uses 4*8 instead of 4*9. 
+ int zeroMin = 4 * 8 + nameLength; + + bgzf.seek(cpVirt | up + 16); + IOUtils.readFully(bgzf, buf.array(), 0, 8); + + zeroMin += (buf.getInt(0) & 0xffff) * 4; + zeroMin += buf.getInt(4) + (buf.getInt(4) + 1) / 2; + + bgzf.seek(cpVirt | up); + IOUtils.readFully(bgzf, buf.array(), 0, 4); + + if (buf.getInt(0) < zeroMin) { + up = nextUP; + continue; + } + return up; + } + } catch (IOException e) { + } + return -1; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java index 5ed4a7c..6b6b626 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java @@ -22,13 +22,18 @@ package org.seqdoop.hadoop_bam; -import java.io.InputStream; +import htsjdk.samtools.util.BlockCompressedInputStream; +import htsjdk.tribble.readers.PositionalBufferedStream; +import htsjdk.variant.bcf2.BCF2Codec; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFContigHeaderLine; +import htsjdk.variant.vcf.VCFHeader; import java.io.IOException; +import java.io.InputStream; import java.util.HashMap; import java.util.Map; - -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.LongWritable; @@ -36,201 +41,213 @@ import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileSplit; - -import htsjdk.samtools.util.BlockCompressedInputStream; -import htsjdk.tribble.readers.PositionalBufferedStream; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.bcf2.BCF2Codec; -import htsjdk.variant.vcf.VCFContigHeaderLine; -import htsjdk.variant.vcf.VCFHeader; - import org.seqdoop.hadoop_bam.util.MurmurHash3; import org.seqdoop.hadoop_bam.util.WrapSeekable; /** See {@link VCFRecordReader} for the meaning of the key. */ -public class BCFRecordReader - extends RecordReader -{ - private final LongWritable key = new LongWritable(); - private final VariantContextWritable vc = new VariantContextWritable(); - - private BCF2Codec codec = new BCF2Codec(); - private PositionalBufferedStream in; - - private final Map contigDict = - new HashMap(); - - private boolean isBGZF; - private BlockCompressedInputStream bci; - - // If isBGZF, length refers only to the distance of the last BGZF block from - // the first. 
- private long fileStart, length; - - @Override public void initialize(InputSplit spl, TaskAttemptContext ctx) - throws IOException - { - isBGZF = spl instanceof FileVirtualSplit; - if (isBGZF) { - final FileVirtualSplit split = (FileVirtualSplit)spl; - - final Path file = split.getPath(); - final FileSystem fs = file.getFileSystem(ctx.getConfiguration()); - - final FSDataInputStream inFile = fs.open(file); - - bci = new BlockCompressedInputStream(inFile); - in = new PositionalBufferedStream(bci); - initContigDict(); - - inFile.seek(0); - bci = - new BlockCompressedInputStream( - new WrapSeekable( - inFile, fs.getFileStatus(file).getLen(), file)); - - final long virtualStart = split.getStartVirtualOffset(), - virtualEnd = split.getEndVirtualOffset(); - - this.fileStart = virtualStart >>> 16; - this.length = (virtualEnd >>> 16) - fileStart; - - bci.seek(virtualStart); - - // Since PositionalBufferedStream does its own buffering, we have to - // prevent it from going too far by using a BGZFLimitingStream. It - // also allows nextKeyValue() to simply check for EOF instead of - // looking at virtualEnd. - in = new PositionalBufferedStream( - new BGZFLimitingStream(bci, virtualEnd)); - } else { - final FileSplit split = (FileSplit)spl; - - this.fileStart = split.getStart(); - this.length = split.getLength(); - - final Path file = split.getPath(); - - in = new PositionalBufferedStream( - file.getFileSystem(ctx.getConfiguration()).open(file)); - - initContigDict(); - - IOUtils.skipFully(in, fileStart - in.getPosition()); - } - } - @Override public void close() throws IOException { in.close(); } - - private void initContigDict() { - final VCFHeader header = - (VCFHeader)codec.readHeader(in).getHeaderValue(); - - contigDict.clear(); - int i = 0; - for (final VCFContigHeaderLine contig : header.getContigLines()) - contigDict.put(contig.getID(), i++); - } - - /** For compressed BCF, unless the end has been reached, this is quite - * inaccurate. - */ - @Override public float getProgress() { - if (length == 0) - return 1; - - if (!isBGZF) - return (float)(in.getPosition() - fileStart) / length; - - try { - if (in.peek() == -1) - return 1; - } catch (IOException e) { - return 1; - } - - // Add 1 to the denominator to make sure that we never report 1 here. - return (float)((bci.getFilePointer() >>> 16) - fileStart) / (length + 1); - } - @Override public LongWritable getCurrentKey () { return key; } - @Override public VariantContextWritable getCurrentValue() { return vc; } - - @Override public boolean nextKeyValue() throws IOException { - if (in.peek() == -1) - return false; - - if (!isBGZF && in.getPosition() >= fileStart + length) - return false; - - final VariantContext v = codec.decode(in); - - Integer chromIdx = contigDict.get(v.getContig()); - if (chromIdx == null) - chromIdx = (int)MurmurHash3.murmurhash3(v.getContig(), 0); - - key.set((long)chromIdx << 32 | (long)(v.getStart() - 1)); - vc.set(v); - return true; - } +public class BCFRecordReader extends RecordReader { + + private final LongWritable key = new LongWritable(); + private final VariantContextWritable vc = new VariantContextWritable(); + private final Map contigDict = new HashMap(); + private BCF2Codec codec = new BCF2Codec(); + private PositionalBufferedStream in; + private boolean isBGZF; + private BlockCompressedInputStream bci; + + // If isBGZF, length refers only to the distance of the last BGZF block from + // the first. 
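  // Editor's note: an illustrative sketch of the BGZF virtual-offset convention used by the
  // fileStart and length fields declared just below (and by
  // BlockCompressedInputStream.getFilePointer()): the upper 48 bits hold the compressed offset
  // of a BGZF block within the file, the lower 16 bits hold the offset within that block's
  // uncompressed data. The helper names are assumptions for illustration only, not part of
  // Hadoop-BAM or htsjdk.
  static long packVirtualOffset(long compressedBlockStart, int withinBlockOffset) {
    return (compressedBlockStart << 16) | (withinBlockOffset & 0xffffL);
  }

  static long compressedOffset(long virtualOffset) {
    return virtualOffset >>> 16; // the same shift used for fileStart and length
  }

  static int withinBlockOffset(long virtualOffset) {
    return (int) (virtualOffset & 0xffff);
  }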
+ private long fileStart, length; + + @Override + public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { + isBGZF = spl instanceof FileVirtualSplit; + if (isBGZF) { + final FileVirtualSplit split = (FileVirtualSplit) spl; + + final Path file = split.getPath(); + final FileSystem fs = file.getFileSystem(ctx.getConfiguration()); + + final FSDataInputStream inFile = fs.open(file); + + bci = new BlockCompressedInputStream(inFile); + in = new PositionalBufferedStream(bci); + initContigDict(); + + inFile.seek(0); + bci = + new BlockCompressedInputStream( + new WrapSeekable(inFile, fs.getFileStatus(file).getLen(), file)); + + final long virtualStart = split.getStartVirtualOffset(), + virtualEnd = split.getEndVirtualOffset(); + + this.fileStart = virtualStart >>> 16; + this.length = (virtualEnd >>> 16) - fileStart; + + bci.seek(virtualStart); + + // Since PositionalBufferedStream does its own buffering, we have to + // prevent it from going too far by using a BGZFLimitingStream. It + // also allows nextKeyValue() to simply check for EOF instead of + // looking at virtualEnd. + in = new PositionalBufferedStream(new BGZFLimitingStream(bci, virtualEnd)); + } else { + final FileSplit split = (FileSplit) spl; + + this.fileStart = split.getStart(); + this.length = split.getLength(); + + final Path file = split.getPath(); + + in = new PositionalBufferedStream(file.getFileSystem(ctx.getConfiguration()).open(file)); + + initContigDict(); + + IOUtils.skipFully(in, fileStart - in.getPosition()); + } + } + + @Override + public void close() throws IOException { + in.close(); + } + + private void initContigDict() { + final VCFHeader header = (VCFHeader) codec.readHeader(in).getHeaderValue(); + + contigDict.clear(); + int i = 0; + for (final VCFContigHeaderLine contig : header.getContigLines()) { + contigDict.put(contig.getID(), i++); + } + } + + /** For compressed BCF, unless the end has been reached, this is quite inaccurate. */ + @Override + public float getProgress() { + if (length == 0) { + return 1; + } + + if (!isBGZF) { + return (float) (in.getPosition() - fileStart) / length; + } + + try { + if (in.peek() == -1) { + return 1; + } + } catch (IOException e) { + return 1; + } + + // Add 1 to the denominator to make sure that we never report 1 here. 
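    // Editor's worked example (illustrative numbers only): for a split covering compressed
    // bytes [1_000_000, 3_000_000) with the reader currently in the block starting at
    // 2_000_000, the expression below yields (2_000_000 - 1_000_000) / (2_000_000 + 1)
    // ~= 0.4999998, and the "+ 1" in the denominator keeps the estimate strictly below 1.0
    // until EOF is actually observed.
    final float exampleProgress = (float) (2_000_000L - 1_000_000L) / (2_000_000L + 1);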
+ return (float) ((bci.getFilePointer() >>> 16) - fileStart) / (length + 1); + } + + @Override + public LongWritable getCurrentKey() { + return key; + } + + @Override + public VariantContextWritable getCurrentValue() { + return vc; + } + + @Override + public boolean nextKeyValue() throws IOException { + if (in.peek() == -1) { + return false; + } + + if (!isBGZF && in.getPosition() >= fileStart + length) { + return false; + } + + final VariantContext v = codec.decode(in); + + Integer chromIdx = contigDict.get(v.getContig()); + if (chromIdx == null) { + chromIdx = (int) MurmurHash3.murmurhash3(v.getContig(), 0); + } + + key.set((long) chromIdx << 32 | (long) (v.getStart() - 1)); + vc.set(v); + return true; + } } class BGZFLimitingStream extends InputStream { - private final BlockCompressedInputStream bgzf; - private final long virtEnd; - - public BGZFLimitingStream( - BlockCompressedInputStream stream, long virtualEnd) - { - bgzf = stream; - virtEnd = virtualEnd; - } - - @Override public void close() throws IOException { bgzf.close(); } - - private byte[] readBuf = new byte[1]; - @Override public int read() throws IOException { - switch (read(readBuf)) { - case 1: return readBuf[0]; - case -1: return -1; - default: assert false; return -1; - } - } - - @Override public int read(byte[] buf, int off, int len) throws IOException { - - int totalRead = 0; - long virt; - - final int lastLen = (int)virtEnd & 0xffff; - - while ((virt = bgzf.getFilePointer()) >>> 16 != virtEnd >>> 16) { - // We're not in the last BGZF block yet. Unfortunately - // BlockCompressedInputStream doesn't expose the length of the current - // block, so we can't simply (possibly repeatedly) read the current - // block to the end. Instead, we read at most virtEnd & 0xffff at a - // time, which ensures that we can't overshoot virtEnd even if the - // next block starts immediately. - final int r = bgzf.read(buf, off, Math.min(len, lastLen)); - if (r == -1) - return totalRead == 0 ? -1 : totalRead; - - totalRead += r; - len -= r; - if (len == 0) - return totalRead; - off += r; - } - - // We're in the last BGZF block: read only up to lastLen. - len = Math.min(len, ((int)virt & 0xffff) - lastLen); - while (len > 0) { - final int r = bgzf.read(buf, off, len); - if (r == -1) - return totalRead == 0 ? -1 : totalRead; - - totalRead += r; - len -= r; - off += r; - } - return totalRead == 0 ? -1 : totalRead; - } + + private final BlockCompressedInputStream bgzf; + private final long virtEnd; + private byte[] readBuf = new byte[1]; + + public BGZFLimitingStream(BlockCompressedInputStream stream, long virtualEnd) { + bgzf = stream; + virtEnd = virtualEnd; + } + + @Override + public void close() throws IOException { + bgzf.close(); + } + + @Override + public int read() throws IOException { + switch (read(readBuf)) { + case 1: + return readBuf[0]; + case -1: + return -1; + default: + assert false; + return -1; + } + } + + @Override + public int read(byte[] buf, int off, int len) throws IOException { + + int totalRead = 0; + long virt; + + final int lastLen = (int) virtEnd & 0xffff; + + while ((virt = bgzf.getFilePointer()) >>> 16 != virtEnd >>> 16) { + // We're not in the last BGZF block yet. Unfortunately + // BlockCompressedInputStream doesn't expose the length of the current + // block, so we can't simply (possibly repeatedly) read the current + // block to the end. Instead, we read at most virtEnd & 0xffff at a + // time, which ensures that we can't overshoot virtEnd even if the + // next block starts immediately. 
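      // Editor's worked example (illustrative value only): for virtEnd = 0x2D000ABCL the final
      // block starts at compressed offset 0x2D00 and lastLen = 0x0ABC = 2748, so every read
      // issued before reaching that block is capped at 2748 bytes and cannot carry the stream
      // past virtEnd even if the final block starts at the very next byte.
      final int exampleLastLen = (int) 0x2D000ABCL & 0xffff; // 2748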
+ final int r = bgzf.read(buf, off, Math.min(len, lastLen)); + if (r == -1) { + return totalRead == 0 ? -1 : totalRead; + } + + totalRead += r; + len -= r; + if (len == 0) { + return totalRead; + } + off += r; + } + + // We're in the last BGZF block: read only up to lastLen. + len = Math.min(len, ((int) virt & 0xffff) - lastLen); + while (len > 0) { + final int r = bgzf.read(buf, off, len); + if (r == -1) { + return totalRead == 0 ? -1 : totalRead; + } + + totalRead += r; + len -= r; + off += r; + } + return totalRead == 0 ? -1 : totalRead; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java index 72733ee..0ec2664 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java @@ -22,119 +22,106 @@ package org.seqdoop.hadoop_bam; -import java.io.FilterOutputStream; -import java.io.IOException; -import java.io.OutputStream; - import htsjdk.samtools.util.BlockCompressedOutputStream; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.RecordWriter; -import org.apache.hadoop.mapreduce.TaskAttemptContext; import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import htsjdk.variant.vcf.VCFHeader; - +import java.io.FilterOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.RecordWriter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.seqdoop.hadoop_bam.util.VCFHeaderReader; import org.seqdoop.hadoop_bam.util.WrapSeekable; -/** A base {@link RecordWriter} for compressed BCF. +/** + * A base {@link RecordWriter} for compressed BCF. * - *

<p>Handles the output stream, writing the header if requested, and provides - * the {@link #writeRecord} function for subclasses.

+ *

Handles the output stream, writing the header if requested, and provides the {@link + * #writeRecord} function for subclasses. */ -public abstract class BCFRecordWriter - extends RecordWriter -{ - private VariantContextWriter writer; - private LazyVCFGenotypesContext.HeaderDataCache vcfHeaderDataCache = - new LazyVCFGenotypesContext.HeaderDataCache(); - private LazyBCFGenotypesContext.HeaderDataCache bcfHeaderDataCache = - new LazyBCFGenotypesContext.HeaderDataCache(); - - /** A VCF header is read from the input Path, which should refer to a VCF or - * BCF file. - */ - public BCFRecordWriter( - Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) - throws IOException - { - final WrapSeekable in = - WrapSeekable.openPath(ctx.getConfiguration(), input); - final VCFHeader header = VCFHeaderReader.readHeaderFrom(in); - in.close(); - - init(output, header, writeHeader, ctx); - } - public BCFRecordWriter( - Path output, VCFHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - init( - output.getFileSystem(ctx.getConfiguration()).create(output), - header, writeHeader); - } - public BCFRecordWriter( - OutputStream output, VCFHeader header, boolean writeHeader) - throws IOException - { - init(output, header, writeHeader); - } - - // Working around not being able to call a constructor other than as the - // first statement... - private void init( - Path output, VCFHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - init( - output.getFileSystem(ctx.getConfiguration()).create(output), - header, writeHeader); - } - private void init( - OutputStream output, VCFHeader header, final boolean writeHeader) - throws IOException - { - final BCFStoppableOutputStream stopOut = - new BCFStoppableOutputStream(!writeHeader, output); - - writer = new VariantContextWriterBuilder().clearOptions() - .setOption(Options.FORCE_BCF) - .setOutputBCFStream(stopOut).build(); - - writer.writeHeader(header); - stopOut.stopped = false; - - setInputHeader(header); - } - - @Override public void close(TaskAttemptContext ctx) throws IOException { - writer.close(); - } - - /** Used for lazy decoding of genotype data. Of course, each input record - * may have a different header, but we currently only support one header - * here... This is in part due to the fact that it's not clear what the best - * solution is. */ - public void setInputHeader(VCFHeader header) { - vcfHeaderDataCache.setHeader(header); - bcfHeaderDataCache.setHeader(header); - } - - protected void writeRecord(VariantContext vc) { - final GenotypesContext gc = vc.getGenotypes(); - if (gc instanceof LazyParsingGenotypesContext) - ((LazyParsingGenotypesContext)gc).getParser().setHeaderDataCache( - gc instanceof LazyVCFGenotypesContext ? vcfHeaderDataCache - : bcfHeaderDataCache); - - writer.add(vc); - } +public abstract class BCFRecordWriter extends RecordWriter { + + private VariantContextWriter writer; + private LazyVCFGenotypesContext.HeaderDataCache vcfHeaderDataCache = + new LazyVCFGenotypesContext.HeaderDataCache(); + private LazyBCFGenotypesContext.HeaderDataCache bcfHeaderDataCache = + new LazyBCFGenotypesContext.HeaderDataCache(); + + /** A VCF header is read from the input Path, which should refer to a VCF or BCF file. 
*/ + public BCFRecordWriter(Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + final WrapSeekable in = WrapSeekable.openPath(ctx.getConfiguration(), input); + final VCFHeader header = VCFHeaderReader.readHeaderFrom(in); + in.close(); + + init(output, header, writeHeader, ctx); + } + + public BCFRecordWriter(Path output, VCFHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + init(output.getFileSystem(ctx.getConfiguration()).create(output), header, writeHeader); + } + + public BCFRecordWriter(OutputStream output, VCFHeader header, boolean writeHeader) + throws IOException { + init(output, header, writeHeader); + } + + // Working around not being able to call a constructor other than as the + // first statement... + private void init(Path output, VCFHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + init(output.getFileSystem(ctx.getConfiguration()).create(output), header, writeHeader); + } + + private void init(OutputStream output, VCFHeader header, final boolean writeHeader) + throws IOException { + final BCFStoppableOutputStream stopOut = new BCFStoppableOutputStream(!writeHeader, output); + + writer = + new VariantContextWriterBuilder() + .clearOptions() + .setOption(Options.FORCE_BCF) + .setOutputBCFStream(stopOut) + .build(); + + writer.writeHeader(header); + stopOut.stopped = false; + + setInputHeader(header); + } + + @Override + public void close(TaskAttemptContext ctx) throws IOException { + writer.close(); + } + + /** + * Used for lazy decoding of genotype data. Of course, each input record may have a different + * header, but we currently only support one header here... This is in part due to the fact that + * it's not clear what the best solution is. + */ + public void setInputHeader(VCFHeader header) { + vcfHeaderDataCache.setHeader(header); + bcfHeaderDataCache.setHeader(header); + } + + protected void writeRecord(VariantContext vc) { + final GenotypesContext gc = vc.getGenotypes(); + if (gc instanceof LazyParsingGenotypesContext) { + ((LazyParsingGenotypesContext) gc) + .getParser() + .setHeaderDataCache( + gc instanceof LazyVCFGenotypesContext ? vcfHeaderDataCache : bcfHeaderDataCache); + } + + writer.add(vc); + } } // We must always call writer.writeHeader() because the writer requires @@ -148,31 +135,44 @@ protected void writeRecord(VariantContext vc) { // // In addition we do BGZF compression here, to simplify things. final class BCFStoppableOutputStream extends FilterOutputStream { - public boolean stopped; - private final OutputStream origOut; - - public BCFStoppableOutputStream(boolean startStopped, OutputStream out) { - super(new BlockCompressedOutputStream(out, null)); - origOut = out; - stopped = startStopped; - } - - @Override public void write(int b) throws IOException { - if (!stopped) super.write(b); - } - @Override public void write(byte[] b) throws IOException { - if (!stopped) super.write(b); - } - @Override public void write(byte[] b, int off, int len) throws IOException { - if (!stopped) super.write(b, off, len); - } - - @Override public void close() throws IOException { - // Don't close the BlockCompressedOutputStream, as we don't want - // the BGZF terminator. - this.out.flush(); - - // Instead, close the lower-level output stream directly. 
- origOut.close(); - } + + private final OutputStream origOut; + public boolean stopped; + + public BCFStoppableOutputStream(boolean startStopped, OutputStream out) { + super(new BlockCompressedOutputStream(out, null)); + origOut = out; + stopped = startStopped; + } + + @Override + public void write(int b) throws IOException { + if (!stopped) { + super.write(b); + } + } + + @Override + public void write(byte[] b) throws IOException { + if (!stopped) { + super.write(b); + } + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + if (!stopped) { + super.write(b, off, len); + } + } + + @Override + public void close() throws IOException { + // Don't close the BlockCompressedOutputStream, as we don't want + // the BGZF terminator. + this.out.flush(); + + // Instead, close the lower-level output stream directly. + origOut.close(); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BCFSplitGuesser.java b/src/main/java/org/seqdoop/hadoop_bam/BCFSplitGuesser.java index 70cb533..5071da8 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BCFSplitGuesser.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BCFSplitGuesser.java @@ -22,18 +22,8 @@ package org.seqdoop.hadoop_bam; -import htsjdk.samtools.seekablestream.ByteArraySeekableStream; -import java.io.BufferedInputStream; -import java.io.InputStream; -import java.io.IOException; -import java.util.Arrays; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IOUtils; -import org.apache.hadoop.util.GenericOptionsParser; - import htsjdk.samtools.FileTruncatedException; +import htsjdk.samtools.seekablestream.ByteArraySeekableStream; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.RuntimeEOFException; @@ -41,402 +31,421 @@ import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.variant.bcf2.BCF2Codec; import htsjdk.variant.vcf.VCFHeader; - +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.util.GenericOptionsParser; import org.seqdoop.hadoop_bam.util.WrapSeekable; -/** A class for heuristically finding BCF record positions inside an area of - * a BCF file. Handles both compressed and uncompressed BCF. +/** + * A class for heuristically finding BCF record positions inside an area of a BCF file. Handles both + * compressed and uncompressed BCF. */ public class BCFSplitGuesser extends BaseSplitGuesser { - // cin is the compressed input: a BlockCompressedInputStream for compressed - // BCF, otherwise equal to in. Unfortunately the closest common type is then - // InputStream, which is why we have the cinSeek() method. - private InputStream cin; - private SeekableStream inFile; - private final boolean bgzf; - private final BCF2Codec bcfCodec = new BCF2Codec(); - private final int contigDictionaryLength, genotypeSampleCount; - - // The amount of data we verify for uncompressed BCF. - private final static int UNCOMPRESSED_BYTES_NEEDED = 0x80000; - - // We want to go through this many BGZF blocks fully, checking that they - // contain valid BCF records, when guessing a BCF record position. 
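  // Editor's note: a minimal, self-contained sketch of the idea behind the
  // BCFStoppableOutputStream shown earlier in this patch. The class and field names here are
  // assumptions for illustration only, not part of Hadoop-BAM: while the flag is set, written
  // bytes are silently dropped, which lets a writer that insists on emitting a header be given
  // one without the header ever reaching the file.
  class DiscardingOutputStream extends java.io.FilterOutputStream {

    boolean discarding;

    DiscardingOutputStream(java.io.OutputStream out, boolean startDiscarding) {
      super(out);
      discarding = startDiscarding;
    }

    @Override
    public void write(int b) throws java.io.IOException {
      if (!discarding) {
        out.write(b);
      }
    }

    @Override
    public void write(byte[] b, int off, int len) throws java.io.IOException {
      if (!discarding) {
        out.write(b, off, len);
      }
    }
  }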
- private final static byte BGZF_BLOCKS_NEEDED_FOR_GUESS = 2; - - // Since the max size of a BGZF block is 0xffff (64K), and we might be just - // one byte off from the start of the previous one, we need 0xfffe bytes for - // the start, and then 0xffff times the number of blocks we want to go - // through. - private final static int BGZF_MAX_BYTES_READ = - BGZF_BLOCKS_NEEDED_FOR_GUESS * 0xffff + 0xfffe; - - // This is probably too conservative. - private final static int SHORTEST_POSSIBLE_BCF_RECORD = 4*8 + 1; - - /** The stream must point to a valid BCF file, because the header is read - * from it. - */ - public BCFSplitGuesser(SeekableStream ss) throws IOException { - this(ss, ss); - } - - public BCFSplitGuesser(SeekableStream ss, InputStream headerStream) - throws IOException - { - inFile = ss; - - InputStream bInFile = new BufferedInputStream(inFile); - - bgzf = BlockCompressedInputStream.isValidFile(bInFile); - if (bgzf) - bInFile = new BlockCompressedInputStream(bInFile); - - // Excess buffering here but it can't be helped that BCF2Codec only takes - // PositionalBufferedStream. - final VCFHeader header = - (VCFHeader)bcfCodec.readHeader( - new PositionalBufferedStream(bInFile)).getHeaderValue(); - - contigDictionaryLength = header.getContigLines().size(); - genotypeSampleCount = header.getNGenotypeSamples(); - } - - public boolean isBGZF() { return bgzf; } - - private void cinSeek(long virt) throws IOException { - if (bgzf) - ((BlockCompressedInputStream)cin).seek(virt); - else - ((SeekableStream)cin).seek(virt); - } - - /** Finds a (virtual in the case of BGZF) BCF record position in the - * physical position range [beg,end). Returns end if no BCF record was - * found. - */ - public long guessNextBCFRecordStart(long beg, long end) - throws IOException - { - // Buffer what we need to go through. - - byte[] arr = new byte[ - bgzf ? BGZF_MAX_BYTES_READ : UNCOMPRESSED_BYTES_NEEDED]; - - this.inFile.seek(beg); - int totalRead = 0; - for (int left = Math.min((int)(end - beg), arr.length); left > 0;) { - final int r = inFile.read(arr, totalRead, left); - if (r < 0) - break; - totalRead += r; - left -= r; - } - arr = Arrays.copyOf(arr, totalRead); - - this.in = new ByteArraySeekableStream(arr); - - final int firstBGZFEnd; - - if (this.bgzf) { - firstBGZFEnd = Math.min((int)(end - beg), 0xffff); - - BlockCompressedInputStream bgzfStream = - new BlockCompressedInputStream(this.in); - bgzfStream.setCheckCrcs(true); - this.cin = bgzfStream; - } else { - this.cin = this.in; - - firstBGZFEnd = 0; // Actually unused - } - - // cp: Compressed Position, indexes the entire BGZF input. If - // we have uncompressed BCF, this loop does nothing. - for (int cp = 0;; ++cp) { - - final int cp0; - final long cp0Virt; - final int blockLen; - - if (this.bgzf) { - final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd); - if (psz == null) - break; - - cp0 = cp = psz.pos; - cp0Virt = (long)cp0 << 16; - try { - cinSeek(cp0Virt); - - // This has to catch Throwable, because it's possible to get an - // OutOfMemoryError due to an overly large size. - } catch (Throwable e) { - // Guessed BGZF position incorrectly: try the next guess. - continue; - } - blockLen = psz.size; - } else { - cp0 = 0; // Actually unused - cp0Virt = 0; - blockLen = Math.max(arr.length, UNCOMPRESSED_BYTES_NEEDED); - } - - // up: Uncompressed Position, indexes the data inside the BGZF block. 
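  // Editor's note on the virtual offsets returned by guessNextBCFRecordStart below
  // ("beg+cp0 << 16 | up0" in the BGZF case): Java operator precedence parses it as
  // ((beg + cp0) << 16) | up0, i.e. the upper 48 bits carry the absolute compressed offset of
  // the guessed block and the lower 16 bits carry the record's offset inside that block.
  // Illustrative values only:
  final long exampleVirtualOffset = (1_000_000L + 37L) << 16 | 129L;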
- for (int up = 0;; ++up) { - final int up0 = up = guessNextBCFPos(cp0Virt, up, blockLen); - - if (up0 < 0) { - // No BCF records found in the BGZF block: try the next BGZF - // block. - break; - } - - // Verification time. - - cinSeek(cp0Virt | up0); - - final PositionalBufferedStream pbIn = - new PositionalBufferedStream(cin); - - boolean decodedAny = false; - try { - if (bgzf) { - byte b = 0; - int prevCP = cp0; - while (b < BGZF_BLOCKS_NEEDED_FOR_GUESS && pbIn.peek() != -1) - { - bcfCodec.decode(pbIn); - decodedAny = true; - - final int cp2 = (int) - (((BlockCompressedInputStream)cin).getFilePointer() - >>> 16); - if (cp2 != prevCP) { - // The compressed position changed so we must be in a - // new block. - assert cp2 > prevCP; - cp = cp2; - ++b; - } - } - - // Running out of records to verify is fine as long as we - // verified at least something. It should only happen if we - // couldn't fill the array. - if (b < BGZF_BLOCKS_NEEDED_FOR_GUESS) { - assert arr.length < BGZF_MAX_BYTES_READ; - if (!decodedAny) - continue; - } - } else { - while (pbIn.getPosition() - up0 < UNCOMPRESSED_BYTES_NEEDED - && pbIn.peek() != -1) - { - bcfCodec.decode(pbIn); - decodedAny = true; - } - - // As in the BGZF case. - if (pbIn.getPosition() - up0 < UNCOMPRESSED_BYTES_NEEDED) { - assert arr.length < UNCOMPRESSED_BYTES_NEEDED; - if (!decodedAny) - continue; - } - } - - } catch (FileTruncatedException e) { continue; } - catch (OutOfMemoryError e) { continue; } - catch (RuntimeEOFException e) { continue; } - catch (TribbleException e) { - // This is the way in which BCF2Codec reports unexpected EOF. - // Unfortunately, it also reports every other kind of error with - // the same exception. It even wraps IOException in - // TribbleException! - // - // We need to catch EOF in the middle of a record, which can - // happen legitimately if the [beg,end) range is too small and - // cuts off a record. First, require decodedAny, and then, assume - // that this exception means EOF if the stream has hit EOF. - if (!(decodedAny && pbIn.peek() == -1)) - continue; - } - - return this.bgzf ? beg+cp0 << 16 | up0 : beg + up0; - } - if (!this.bgzf) - break; - } - return end; - } - - private int guessNextBCFPos(long cpVirt, int up, int cSize) { - try { - for (; up + SHORTEST_POSSIBLE_BCF_RECORD < cSize; ++up) { - // Note! The BCF2 spec has a table listing the fields and their - // types, but QUAL is misplaced there! It should be before - // n_allele_info, not after n_fmt_sample! The "Putting it all - // together" section shows the correct field order. - - // Check that [0] and [4] are big enough to make sense. - - cinSeek(cpVirt | up); - IOUtils.readFully(cin, buf.array(), 0, 8); - - final long sharedLen = getUInt(0); - final long indivLen = getUInt(4); - if (sharedLen + indivLen < (long)SHORTEST_POSSIBLE_BCF_RECORD) - continue; - - // Check that [8] looks like a valid CHROM field and that [12] is a - // 0-based leftmost coordinate. - - cinSeek(cpVirt | up+8); - IOUtils.readFully(cin, buf.array(), 0, 8); - - final int chrom = buf.getInt(0); - final int pos = buf.getInt(4); - if (chrom < 0 || chrom >= contigDictionaryLength || pos < 0) - continue; - - // [24] and [26] are lengths and should thus be nonnegative. 
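        // Editor's worked example (illustrative value only): the 32-bit field read just below
        // packs n_allele in its upper 16 bits and n_info in its lower 16 bits, so 0x00020005
        // decodes to 2 alleles and 5 INFO entries.
        final int exampleAlleleCount = 0x00020005 >> 16; // 2
        final int exampleInfoCount = 0x00020005 & 0xffff; // 5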
- - cinSeek(cpVirt | up+24); - IOUtils.readFully(cin, buf.array(), 0, 4); - final int alleleInfo = buf.getInt(0); - - final int alleleCount = alleleInfo >> 16; - final int infoCount = alleleInfo & 0xffff; - if (alleleCount < 0) // don't check infoCount since it is always nonnegative - continue; - - // Make sure that [28] matches to the same value in the header. - - cinSeek(cpVirt | up+28); - IOUtils.readFully(cin, buf.array(), 0, 1); - - final short nSamples = getUByte(0); - if ((int)nSamples != genotypeSampleCount) - continue; - - // Check that the ID string has a sensible type encoding. That is, - // it should claim to be a character string: [32] & 0x0f == 0x07. - // Further, if it has length 15 or more, i.e. [32] & 0xf0 == 0xf0, - // then it should be followed by an integer, i.e. [33] & 0x0f - // should be in the range [1, 3], and the value of that integer - // should be in the range [15, [0] - x) where x is the guaranteed - // number of bytes in the first part of this record (before the - // genotype block). - - cinSeek(cpVirt | up+32); - IOUtils.readFully(cin, buf.array(), 0, 6); - - final byte idType = buf.get(0); - if ((idType & 0x0f) != 0x07) - continue; - - if ((idType & 0xf0) == 0xf0) { - final byte idLenType = buf.get(1); - final long idLen; - switch (idLenType & 0x0f) { - case 0x01: idLen = getUByte (2); break; - case 0x02: idLen = getUShort(2); break; - case 0x03: idLen = getUInt (2); break; - default: continue; - } - - if (idLen < 15 - || idLen > sharedLen - (4*8 + alleleCount + infoCount*2)) - continue; - } - - // Good enough. - return up; - } - } catch (IOException e) { - // fall through - } - return -1; - } - private long getUInt(final int idx) { - return (long)buf.getInt(idx) & 0xffffffff; - } - private short getUByte(final int idx) { - return (short)((short)buf.get(idx) & 0xff); - } - - public static void main(String[] args) throws IOException { - final GenericOptionsParser parser; - try { - parser = new GenericOptionsParser(args); - - // This should be IOException but Hadoop 0.20.2 doesn't throw it... 
- } catch (Exception e) { - System.err.printf("Error in Hadoop arguments: %s\n", e.getMessage()); - System.exit(1); - return; - } - - args = parser.getRemainingArgs(); - final Configuration conf = parser.getConfiguration(); - - long beg = 0; - - if (args.length < 2 || args.length > 3) { - System.err.println( - "Usage: BCFSplitGuesser path-or-uri header-path-or-uri [beg]"); - System.exit(2); - } - - try { - if (args.length > 2) beg = Long.decode(args[2]); - } catch (NumberFormatException e) { - System.err.println("Invalid beg offset."); - if (e.getMessage() != null) - System.err.println(e.getMessage()); - System.exit(2); - } - - SeekableStream ss = WrapSeekable.openPath(conf, new Path(args[0])); - SeekableStream hs = WrapSeekable.openPath(conf, new Path(args[1])); - - final BCFSplitGuesser guesser = new BCFSplitGuesser(ss, hs); - final long end; - - if (guesser.isBGZF()) { - end = beg + BGZF_MAX_BYTES_READ; - - System.out.printf( - "This looks like a BGZF-compressed BCF file.\n"+ - "Will look for a BGZF block within: [%1$#x,%2$#x) = [%1$d,%2$d)\n"+ - "Will then verify BCF data within: [%1$#x,%3$#x) = [%1$d,%3$d)\n", - beg, beg + 0xffff, end); - } else { - end = beg + UNCOMPRESSED_BYTES_NEEDED; - - System.out.printf( - "This looks like an uncompressed BCF file.\n"+ - "Will look for a BCF record within: [%1$#x,%2$#x) = [%1$d,%2$d)\n"+ - "And then will verify all following data in that range.\n", - beg, end); - } - - final long g = guesser.guessNextBCFRecordStart(beg, end); - - ss.close(); - - if (g == end) { - System.out.println( - "Didn't find any acceptable BCF record in any BGZF block."); - System.exit(1); - } - - if (guesser.isBGZF()) - System.out.printf( - "Accepted BGZF block at offset %1$#x (%1$d).\n"+ - "Accepted BCF record at offset %2$#x (%2$d) therein.\n", - g >> 16, g & 0xffff); - else - System.out.printf("Accepted BCF record at offset %1$#x (%1$d).\n", g); - } + + // The amount of data we verify for uncompressed BCF. + private static final int UNCOMPRESSED_BYTES_NEEDED = 0x80000; + // We want to go through this many BGZF blocks fully, checking that they + // contain valid BCF records, when guessing a BCF record position. + private static final byte BGZF_BLOCKS_NEEDED_FOR_GUESS = 2; + // Since the max size of a BGZF block is 0xffff (64K), and we might be just + // one byte off from the start of the previous one, we need 0xfffe bytes for + // the start, and then 0xffff times the number of blocks we want to go + // through. + private static final int BGZF_MAX_BYTES_READ = BGZF_BLOCKS_NEEDED_FOR_GUESS * 0xffff + 0xfffe; + // This is probably too conservative. + private static final int SHORTEST_POSSIBLE_BCF_RECORD = 4 * 8 + 1; + private final boolean bgzf; + private final BCF2Codec bcfCodec = new BCF2Codec(); + private final int contigDictionaryLength, genotypeSampleCount; + // cin is the compressed input: a BlockCompressedInputStream for compressed + // BCF, otherwise equal to in. Unfortunately the closest common type is then + // InputStream, which is why we have the cinSeek() method. + private InputStream cin; + private SeekableStream inFile; + + /** The stream must point to a valid BCF file, because the header is read from it. 
*/ + public BCFSplitGuesser(SeekableStream ss) throws IOException { + this(ss, ss); + } + + public BCFSplitGuesser(SeekableStream ss, InputStream headerStream) throws IOException { + inFile = ss; + + InputStream bInFile = new BufferedInputStream(inFile); + + bgzf = BlockCompressedInputStream.isValidFile(bInFile); + if (bgzf) { + bInFile = new BlockCompressedInputStream(bInFile); + } + + // Excess buffering here but it can't be helped that BCF2Codec only takes + // PositionalBufferedStream. + final VCFHeader header = + (VCFHeader) bcfCodec.readHeader(new PositionalBufferedStream(bInFile)).getHeaderValue(); + + contigDictionaryLength = header.getContigLines().size(); + genotypeSampleCount = header.getNGenotypeSamples(); + } + + public static void main(String[] args) throws IOException { + final GenericOptionsParser parser; + try { + parser = new GenericOptionsParser(args); + + // This should be IOException but Hadoop 0.20.2 doesn't throw it... + } catch (Exception e) { + System.err.printf("Error in Hadoop arguments: %s\n", e.getMessage()); + System.exit(1); + return; + } + + args = parser.getRemainingArgs(); + final Configuration conf = parser.getConfiguration(); + + long beg = 0; + + if (args.length < 2 || args.length > 3) { + System.err.println("Usage: BCFSplitGuesser path-or-uri header-path-or-uri [beg]"); + System.exit(2); + } + + try { + if (args.length > 2) { + beg = Long.decode(args[2]); + } + } catch (NumberFormatException e) { + System.err.println("Invalid beg offset."); + if (e.getMessage() != null) { + System.err.println(e.getMessage()); + } + System.exit(2); + } + + SeekableStream ss = WrapSeekable.openPath(conf, new Path(args[0])); + SeekableStream hs = WrapSeekable.openPath(conf, new Path(args[1])); + + final BCFSplitGuesser guesser = new BCFSplitGuesser(ss, hs); + final long end; + + if (guesser.isBGZF()) { + end = beg + BGZF_MAX_BYTES_READ; + + System.out.printf( + "This looks like a BGZF-compressed BCF file.\n" + + "Will look for a BGZF block within: [%1$#x,%2$#x) = [%1$d,%2$d)\n" + + "Will then verify BCF data within: [%1$#x,%3$#x) = [%1$d,%3$d)\n", + beg, beg + 0xffff, end); + } else { + end = beg + UNCOMPRESSED_BYTES_NEEDED; + + System.out.printf( + "This looks like an uncompressed BCF file.\n" + + "Will look for a BCF record within: [%1$#x,%2$#x) = [%1$d,%2$d)\n" + + "And then will verify all following data in that range.\n", + beg, end); + } + + final long g = guesser.guessNextBCFRecordStart(beg, end); + + ss.close(); + + if (g == end) { + System.out.println("Didn't find any acceptable BCF record in any BGZF block."); + System.exit(1); + } + + if (guesser.isBGZF()) { + System.out.printf( + "Accepted BGZF block at offset %1$#x (%1$d).\n" + + "Accepted BCF record at offset %2$#x (%2$d) therein.\n", + g >> 16, g & 0xffff); + } else { + System.out.printf("Accepted BCF record at offset %1$#x (%1$d).\n", g); + } + } + + public boolean isBGZF() { + return bgzf; + } + + private void cinSeek(long virt) throws IOException { + if (bgzf) { + ((BlockCompressedInputStream) cin).seek(virt); + } else { + ((SeekableStream) cin).seek(virt); + } + } + + /** + * Finds a (virtual in the case of BGZF) BCF record position in the physical position range + * [beg,end). Returns end if no BCF record was found. + */ + public long guessNextBCFRecordStart(long beg, long end) throws IOException { + // Buffer what we need to go through. + + byte[] arr = new byte[bgzf ? 
BGZF_MAX_BYTES_READ : UNCOMPRESSED_BYTES_NEEDED]; + + this.inFile.seek(beg); + int totalRead = 0; + for (int left = Math.min((int) (end - beg), arr.length); left > 0; ) { + final int r = inFile.read(arr, totalRead, left); + if (r < 0) { + break; + } + totalRead += r; + left -= r; + } + arr = Arrays.copyOf(arr, totalRead); + + this.in = new ByteArraySeekableStream(arr); + + final int firstBGZFEnd; + + if (this.bgzf) { + firstBGZFEnd = Math.min((int) (end - beg), 0xffff); + + BlockCompressedInputStream bgzfStream = new BlockCompressedInputStream(this.in); + bgzfStream.setCheckCrcs(true); + this.cin = bgzfStream; + } else { + this.cin = this.in; + + firstBGZFEnd = 0; // Actually unused + } + + // cp: Compressed Position, indexes the entire BGZF input. If + // we have uncompressed BCF, this loop does nothing. + for (int cp = 0; ; ++cp) { + + final int cp0; + final long cp0Virt; + final int blockLen; + + if (this.bgzf) { + final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd); + if (psz == null) { + break; + } + + cp0 = cp = psz.pos; + cp0Virt = (long) cp0 << 16; + try { + cinSeek(cp0Virt); + + // This has to catch Throwable, because it's possible to get an + // OutOfMemoryError due to an overly large size. + } catch (Throwable e) { + // Guessed BGZF position incorrectly: try the next guess. + continue; + } + blockLen = psz.size; + } else { + cp0 = 0; // Actually unused + cp0Virt = 0; + blockLen = Math.max(arr.length, UNCOMPRESSED_BYTES_NEEDED); + } + + // up: Uncompressed Position, indexes the data inside the BGZF block. + for (int up = 0; ; ++up) { + final int up0 = up = guessNextBCFPos(cp0Virt, up, blockLen); + + if (up0 < 0) { + // No BCF records found in the BGZF block: try the next BGZF + // block. + break; + } + + // Verification time. + + cinSeek(cp0Virt | up0); + + final PositionalBufferedStream pbIn = new PositionalBufferedStream(cin); + + boolean decodedAny = false; + try { + if (bgzf) { + byte b = 0; + int prevCP = cp0; + while (b < BGZF_BLOCKS_NEEDED_FOR_GUESS && pbIn.peek() != -1) { + bcfCodec.decode(pbIn); + decodedAny = true; + + final int cp2 = (int) (((BlockCompressedInputStream) cin).getFilePointer() >>> 16); + if (cp2 != prevCP) { + // The compressed position changed so we must be in a + // new block. + assert cp2 > prevCP; + cp = cp2; + ++b; + } + } + + // Running out of records to verify is fine as long as we + // verified at least something. It should only happen if we + // couldn't fill the array. + if (b < BGZF_BLOCKS_NEEDED_FOR_GUESS) { + assert arr.length < BGZF_MAX_BYTES_READ; + if (!decodedAny) { + continue; + } + } + } else { + while (pbIn.getPosition() - up0 < UNCOMPRESSED_BYTES_NEEDED && pbIn.peek() != -1) { + bcfCodec.decode(pbIn); + decodedAny = true; + } + + // As in the BGZF case. + if (pbIn.getPosition() - up0 < UNCOMPRESSED_BYTES_NEEDED) { + assert arr.length < UNCOMPRESSED_BYTES_NEEDED; + if (!decodedAny) { + continue; + } + } + } + + } catch (FileTruncatedException e) { + continue; + } catch (OutOfMemoryError e) { + continue; + } catch (RuntimeEOFException e) { + continue; + } catch (TribbleException e) { + // This is the way in which BCF2Codec reports unexpected EOF. + // Unfortunately, it also reports every other kind of error with + // the same exception. It even wraps IOException in + // TribbleException! + // + // We need to catch EOF in the middle of a record, which can + // happen legitimately if the [beg,end) range is too small and + // cuts off a record. 
First, require decodedAny, and then, assume + // that this exception means EOF if the stream has hit EOF. + if (!(decodedAny && pbIn.peek() == -1)) { + continue; + } + } + + return this.bgzf ? beg + cp0 << 16 | up0 : beg + up0; + } + if (!this.bgzf) { + break; + } + } + return end; + } + + private int guessNextBCFPos(long cpVirt, int up, int cSize) { + try { + for (; up + SHORTEST_POSSIBLE_BCF_RECORD < cSize; ++up) { + // Note! The BCF2 spec has a table listing the fields and their + // types, but QUAL is misplaced there! It should be before + // n_allele_info, not after n_fmt_sample! The "Putting it all + // together" section shows the correct field order. + + // Check that [0] and [4] are big enough to make sense. + + cinSeek(cpVirt | up); + IOUtils.readFully(cin, buf.array(), 0, 8); + + final long sharedLen = getUInt(0); + final long indivLen = getUInt(4); + if (sharedLen + indivLen < (long) SHORTEST_POSSIBLE_BCF_RECORD) { + continue; + } + + // Check that [8] looks like a valid CHROM field and that [12] is a + // 0-based leftmost coordinate. + + cinSeek(cpVirt | up + 8); + IOUtils.readFully(cin, buf.array(), 0, 8); + + final int chrom = buf.getInt(0); + final int pos = buf.getInt(4); + if (chrom < 0 || chrom >= contigDictionaryLength || pos < 0) { + continue; + } + + // [24] and [26] are lengths and should thus be nonnegative. + + cinSeek(cpVirt | up + 24); + IOUtils.readFully(cin, buf.array(), 0, 4); + final int alleleInfo = buf.getInt(0); + + final int alleleCount = alleleInfo >> 16; + final int infoCount = alleleInfo & 0xffff; + if (alleleCount < 0) // don't check infoCount since it is always nonnegative + { + continue; + } + + // Make sure that [28] matches to the same value in the header. + + cinSeek(cpVirt | up + 28); + IOUtils.readFully(cin, buf.array(), 0, 1); + + final short nSamples = getUByte(0); + if ((int) nSamples != genotypeSampleCount) { + continue; + } + + // Check that the ID string has a sensible type encoding. That is, + // it should claim to be a character string: [32] & 0x0f == 0x07. + // Further, if it has length 15 or more, i.e. [32] & 0xf0 == 0xf0, + // then it should be followed by an integer, i.e. [33] & 0x0f + // should be in the range [1, 3], and the value of that integer + // should be in the range [15, [0] - x) where x is the guaranteed + // number of bytes in the first part of this record (before the + // genotype block). + + cinSeek(cpVirt | up + 32); + IOUtils.readFully(cin, buf.array(), 0, 6); + + final byte idType = buf.get(0); + if ((idType & 0x0f) != 0x07) { + continue; + } + + if ((idType & 0xf0) == 0xf0) { + final byte idLenType = buf.get(1); + final long idLen; + switch (idLenType & 0x0f) { + case 0x01: + idLen = getUByte(2); + break; + case 0x02: + idLen = getUShort(2); + break; + case 0x03: + idLen = getUInt(2); + break; + default: + continue; + } + + if (idLen < 15 || idLen > sharedLen - (4 * 8 + alleleCount + infoCount * 2)) { + continue; + } + } + + // Good enough. 
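        // Editor's note, before the return below: in BCF2 a typing byte packs the element count
        // in its upper 4 bits and the atomic type in its lower 4 bits; type 7 is a character
        // string, and a count of 0xf means the true count follows as a typed integer, which is
        // the case handled by the (idType & 0xf0) == 0xf0 branch above. Worked example with an
        // illustrative byte value:
        final int exampleIdType = 0x37 & 0x0f; // 7: character string
        final int exampleIdCount = (0x37 >> 4) & 0x0f; // 3 characters inline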
+ return up; + } + } catch (IOException e) { + // fall through + } + return -1; + } + + private long getUInt(final int idx) { + return (long) buf.getInt(idx) & 0xffffffff; + } + + private short getUByte(final int idx) { + return (short) ((short) buf.get(idx) & 0xff); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BaseSplitGuesser.java b/src/main/java/org/seqdoop/hadoop_bam/BaseSplitGuesser.java index 6f96b39..99cad61 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BaseSplitGuesser.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BaseSplitGuesser.java @@ -8,106 +8,114 @@ class BaseSplitGuesser { - protected final static int BGZF_MAGIC = 0x04088b1f; - protected final static int BGZF_MAGIC_SUB = 0x00024342; - protected final static int BGZF_SUB_SIZE = 4 + 2; - - protected SeekableStream in; + protected static final int BGZF_MAGIC = 0x04088b1f; + protected static final int BGZF_MAGIC_SUB = 0x00024342; + protected static final int BGZF_SUB_SIZE = 4 + 2; protected final ByteBuffer buf; + protected SeekableStream in; public BaseSplitGuesser() { buf = ByteBuffer.allocate(8); buf.order(ByteOrder.LITTLE_ENDIAN); } - protected static class PosSize { - public int pos; - public int size; - public PosSize(int p, int s) { pos = p; size = s; } - } - // Gives the compressed size on the side. Returns null if it doesn't find // anything. protected PosSize guessNextBGZFPos(int p, int end) { - try { for (;;) { - for (;;) { - in.seek(p); - IOUtils.readFully(in, buf.array(), 0, 4); - int n = buf.getInt(0); - - if (n == BGZF_MAGIC) - break; - - // Skip ahead a bit more than 1 byte if you can. - if (n >>> 8 == BGZF_MAGIC << 8 >>> 8) - ++p; - else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16) - p += 2; - else - p += 3; - - if (p >= end) - return null; - } - // Found what looks like a gzip block header: now get XLEN and - // search for the BGZF subfield. - final int p0 = p; - p += 10; - in.seek(p); - IOUtils.readFully(in, buf.array(), 0, 2); - p += 2; - final int xlen = getUShort(0); - final int subEnd = p + xlen; - - while (p < subEnd) { - IOUtils.readFully(in, buf.array(), 0, 4); - - if (buf.getInt(0) != BGZF_MAGIC_SUB) { - p += 4 + getUShort(2); + try { + for (; ; ) { + for (; ; ) { in.seek(p); - continue; + IOUtils.readFully(in, buf.array(), 0, 4); + int n = buf.getInt(0); + + if (n == BGZF_MAGIC) { + break; + } + + // Skip ahead a bit more than 1 byte if you can. + if (n >>> 8 == BGZF_MAGIC << 8 >>> 8) { + ++p; + } else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16) { + p += 2; + } else { + p += 3; + } + + if (p >= end) { + return null; + } } - - // Found it: this is close enough to a BGZF block, make it - // our guess. - - // But find out the size before returning. First, grab bsize: - // we'll need it later. + // Found what looks like a gzip block header: now get XLEN and + // search for the BGZF subfield. + final int p0 = p; + p += 10; + in.seek(p); IOUtils.readFully(in, buf.array(), 0, 2); - int bsize = getUShort(0); + p += 2; + final int xlen = getUShort(0); + final int subEnd = p + xlen; - // Then skip the rest of the subfields. - p += BGZF_SUB_SIZE; while (p < subEnd) { + IOUtils.readFully(in, buf.array(), 0, 4); + + if (buf.getInt(0) != BGZF_MAGIC_SUB) { + p += 4 + getUShort(2); + in.seek(p); + continue; + } + + // Found it: this is close enough to a BGZF block, make it + // our guess. + + // But find out the size before returning. First, grab bsize: + // we'll need it later. + IOUtils.readFully(in, buf.array(), 0, 2); + int bsize = getUShort(0); + + // Then skip the rest of the subfields. 
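          // Editor's worked example (illustrative numbers, not from any real file): for a block
          // whose gzip header starts at p0 with XLEN = 6 and BSIZE = 4660, the compressed
          // payload occupies BSIZE - XLEN - 19 = 4635 bytes, the 4-byte uncompressed-size field
          // read at the end of this method starts at p0 + BSIZE - 3, and the next BGZF block
          // (if any) begins at p0 + BSIZE + 1.
          final int exampleNextBlockStart = 0 /* p0 */ + 4660 /* BSIZE */ + 1; // 4661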
+ p += BGZF_SUB_SIZE; + while (p < subEnd) { + in.seek(p); + IOUtils.readFully(in, buf.array(), 0, 4); + p += 4 + getUShort(2); + } + if (p != subEnd) { + // Cancel our guess because the xlen field didn't match the + // data. + break; + } + + // Now skip past the compressed data and the CRC-32. + p += bsize - xlen - 19 + 4; in.seek(p); IOUtils.readFully(in, buf.array(), 0, 4); - p += 4 + getUShort(2); + return new PosSize(p0, buf.getInt(0)); } - if (p != subEnd) { - // Cancel our guess because the xlen field didn't match the - // data. - break; - } - - // Now skip past the compressed data and the CRC-32. - p += bsize - xlen - 19 + 4; - in.seek(p); - IOUtils.readFully(in, buf.array(), 0, 4); - return new PosSize(p0, buf.getInt(0)); + // No luck: look for the next gzip block header. Start right after + // where we last saw the identifiers, although we could probably + // safely skip further ahead. (If we find the correct one right + // now, the previous block contained 0x1f8b0804 bytes of data: that + // seems... unlikely.) + p = p0 + 4; } - // No luck: look for the next gzip block header. Start right after - // where we last saw the identifiers, although we could probably - // safely skip further ahead. (If we find the correct one right - // now, the previous block contained 0x1f8b0804 bytes of data: that - // seems... unlikely.) - p = p0 + 4; - - }} catch (IOException e) { + } catch (IOException e) { return null; } } protected int getUShort(final int idx) { - return (int)buf.getShort(idx) & 0xffff; + return (int) buf.getShort(idx) & 0xffff; + } + + protected static class PosSize { + + public int pos; + public int size; + + public PosSize(int p, int s) { + pos = p; + size = s; + } } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/CRAMInputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/CRAMInputFormat.java index 7909310..5e1b8d8 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/CRAMInputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/CRAMInputFormat.java @@ -23,6 +23,30 @@ public class CRAMInputFormat extends FileInputFormat getContainerOffsets(Configuration conf, Path cramFile) + throws IOException { + SeekableStream seekableStream = WrapSeekable.openPath(conf, cramFile); + CramContainerIterator cci = new CramContainerIterator(seekableStream); + List containerOffsets = new ArrayList(); + containerOffsets.add(seekableStream.position()); + while (cci.hasNext()) { + cci.next(); + containerOffsets.add(seekableStream.position()); + } + containerOffsets.add(seekableStream.length()); + return containerOffsets; + } + + private static long nextContainerOffset(List containerOffsets, long position) { + for (long offset : containerOffsets) { + if (offset >= position) { + return offset; + } + } + throw new IllegalStateException( + "Could not find position " + position + " in " + "container offsets: " + containerOffsets); + } + @Override public List getSplits(JobContext job) throws IOException { return getSplits(super.getSplits(job), job.getConfiguration()); @@ -42,45 +66,22 @@ public List getSplits(List splits, Configuration conf) fileToOffsets.put(path, containerOffsets); } long newStart = nextContainerOffset(containerOffsets, fileSplit.getStart()); - long newEnd = nextContainerOffset(containerOffsets, fileSplit.getStart() + - fileSplit.getLength()); + long newEnd = + nextContainerOffset(containerOffsets, fileSplit.getStart() + fileSplit.getLength()); long newLength = newEnd - newStart; if (newLength == 0) { // split is wholly within a container continue; } - FileSplit newSplit = new 
FileSplit(fileSplit.getPath(), newStart, newLength, - fileSplit.getLocations()); + FileSplit newSplit = + new FileSplit(fileSplit.getPath(), newStart, newLength, fileSplit.getLocations()); newSplits.add(newSplit); } return newSplits; } - private static List getContainerOffsets(Configuration conf, Path cramFile) - throws IOException { - SeekableStream seekableStream = WrapSeekable.openPath(conf, cramFile); - CramContainerIterator cci = new CramContainerIterator(seekableStream); - List containerOffsets = new ArrayList(); - containerOffsets.add(seekableStream.position()); - while (cci.hasNext()) { - cci.next(); - containerOffsets.add(seekableStream.position()); - } - containerOffsets.add(seekableStream.length()); - return containerOffsets; - } - - private static long nextContainerOffset(List containerOffsets, long position) { - for (long offset : containerOffsets) { - if (offset >= position) { - return offset; - } - } - throw new IllegalStateException("Could not find position " + position + " in " + - "container offsets: " + containerOffsets); - } - @Override - public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { + public RecordReader createRecordReader( + InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { RecordReader rr = new CRAMRecordReader(); rr.initialize(split, context); return rr; diff --git a/src/main/java/org/seqdoop/hadoop_bam/CRAMOutputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/CRAMOutputFormat.java index abaf734..4b0e841 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/CRAMOutputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/CRAMOutputFormat.java @@ -2,9 +2,8 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -/** Currently this only locks down the value type of the {@link +/** + * Currently this only locks down the value type of the {@link * org.apache.hadoop.mapreduce.OutputFormat}: contains no functionality. */ -public abstract class CRAMOutputFormat - extends FileOutputFormat -{} +public abstract class CRAMOutputFormat extends FileOutputFormat {} diff --git a/src/main/java/org/seqdoop/hadoop_bam/CRAMRecordReader.java b/src/main/java/org/seqdoop/hadoop_bam/CRAMRecordReader.java index 577e97d..49ca269 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/CRAMRecordReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/CRAMRecordReader.java @@ -6,8 +6,6 @@ import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.seekablestream.SeekableStream; import java.io.IOException; -import java.net.URI; -import java.nio.file.Paths; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; @@ -31,18 +29,18 @@ public class CRAMRecordReader extends RecordReaderHandles the output stream, writing the header if requested, and provides the {@link + * #writeAlignment} function for subclasses. * - *

<p>Handles the output stream, writing the header if requested, and provides - * the {@link #writeAlignment} function for subclasses.

- *

<p>Note that each file created by this class consists of a fragment of a - * complete CRAM file containing only one or more CRAM containers that do not - * include a CRAM file header, a SAMFileHeader, or a CRAM EOF container.

+ *

Note that each file created by this class consists of a fragment of a complete CRAM file + * containing only one or more CRAM containers that do not include a CRAM file header, a + * SAMFileHeader, or a CRAM EOF container. */ -public abstract class CRAMRecordWriter - extends RecordWriter -{ - // generic ID passed to CRAM code for internal error reporting - private static final String HADOOP_BAM_PART_ID= "Hadoop-BAM-Part"; - private OutputStream origOutput; - private CRAMContainerStreamWriter cramContainerStream = null; - private ReferenceSource refSource = null; - private boolean writeHeader = true; +public abstract class CRAMRecordWriter extends RecordWriter { - /** A SAMFileHeader is read from the input Path. */ - public CRAMRecordWriter( - final Path output, - final Path input, - final boolean writeHeader, - final TaskAttemptContext ctx) throws IOException - { - init( - output, - SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()), - writeHeader, ctx); - } + // generic ID passed to CRAM code for internal error reporting + private static final String HADOOP_BAM_PART_ID = "Hadoop-BAM-Part"; + private OutputStream origOutput; + private CRAMContainerStreamWriter cramContainerStream = null; + private ReferenceSource refSource = null; + private boolean writeHeader = true; - public CRAMRecordWriter( - final Path output, final SAMFileHeader header, final boolean writeHeader, - final TaskAttemptContext ctx) - throws IOException - { - init( - output.getFileSystem(ctx.getConfiguration()).create(output), - header, writeHeader, ctx); - } + /** A SAMFileHeader is read from the input Path. */ + public CRAMRecordWriter( + final Path output, final Path input, final boolean writeHeader, final TaskAttemptContext ctx) + throws IOException { + init( + output, SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()), writeHeader, ctx); + } - // Working around not being able to call a constructor other than as the - // first statement... - private void init( - final Path output, final SAMFileHeader header, final boolean writeHeader, - final TaskAttemptContext ctx) - throws IOException - { - init( - output.getFileSystem(ctx.getConfiguration()).create(output), - header, writeHeader, ctx); - } + public CRAMRecordWriter( + final Path output, + final SAMFileHeader header, + final boolean writeHeader, + final TaskAttemptContext ctx) + throws IOException { + init(output.getFileSystem(ctx.getConfiguration()).create(output), header, writeHeader, ctx); + } - private void init( - final OutputStream output, final SAMFileHeader header, final boolean writeHeader, - final TaskAttemptContext ctx) - throws IOException - { - origOutput = output; - this.writeHeader = writeHeader; + // Working around not being able to call a constructor other than as the + // first statement... + private void init( + final Path output, + final SAMFileHeader header, + final boolean writeHeader, + final TaskAttemptContext ctx) + throws IOException { + init(output.getFileSystem(ctx.getConfiguration()).create(output), header, writeHeader, ctx); + } - final String referenceURI = - ctx.getConfiguration().get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY); - refSource = new ReferenceSource(referenceURI == null ? 
null : - NIOFileUtil.asPath(referenceURI)); + private void init( + final OutputStream output, + final SAMFileHeader header, + final boolean writeHeader, + final TaskAttemptContext ctx) + throws IOException { + origOutput = output; + this.writeHeader = writeHeader; - // A SAMFileHeader must be supplied at CRAMContainerStreamWriter creation time; if - // we don't have one then delay creation until we do - if (header != null) { - cramContainerStream = new CRAMContainerStreamWriter( - origOutput, null, refSource, header, HADOOP_BAM_PART_ID); - if (writeHeader) { - this.writeHeader(header); - } - } - } + final String referenceURI = + ctx.getConfiguration().get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY); + refSource = new ReferenceSource(referenceURI == null ? null : NIOFileUtil.asPath(referenceURI)); - @Override public void close(TaskAttemptContext ctx) throws IOException { - cramContainerStream.finish(false); // Close, but suppress CRAM EOF container - origOutput.close(); // And close the original output. + // A SAMFileHeader must be supplied at CRAMContainerStreamWriter creation time; if + // we don't have one then delay creation until we do + if (header != null) { + cramContainerStream = + new CRAMContainerStreamWriter(origOutput, null, refSource, header, HADOOP_BAM_PART_ID); + if (writeHeader) { + this.writeHeader(header); + } } + } - protected void writeAlignment(final SAMRecord rec) { - if (null == cramContainerStream) { - final SAMFileHeader header = rec.getHeader(); - if (header == null) { - throw new RuntimeException("Cannot write record to CRAM: null header in SAM record"); - } - if (writeHeader) { - this.writeHeader(header); - } - cramContainerStream = new CRAMContainerStreamWriter( - origOutput, null, refSource, header, HADOOP_BAM_PART_ID); - } - cramContainerStream.writeAlignment(rec); - } + @Override + public void close(TaskAttemptContext ctx) throws IOException { + cramContainerStream.finish(false); // Close, but suppress CRAM EOF container + origOutput.close(); // And close the original output. + } - private void writeHeader(final SAMFileHeader header) { - cramContainerStream.writeHeader(header); + protected void writeAlignment(final SAMRecord rec) { + if (null == cramContainerStream) { + final SAMFileHeader header = rec.getHeader(); + if (header == null) { + throw new RuntimeException("Cannot write record to CRAM: null header in SAM record"); + } + if (writeHeader) { + this.writeHeader(header); + } + cramContainerStream = + new CRAMContainerStreamWriter(origOutput, null, refSource, header, HADOOP_BAM_PART_ID); } + cramContainerStream.writeAlignment(rec); + } + + private void writeHeader(final SAMFileHeader header) { + cramContainerStream.writeHeader(header); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/FastaInputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/FastaInputFormat.java index 19768eb..6037c32 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/FastaInputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/FastaInputFormat.java @@ -29,7 +29,6 @@ import java.util.Collections; import java.util.Comparator; import java.util.List; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -47,343 +46,335 @@ import org.slf4j.LoggerFactory; /** - * Reads the FASTA reference sequence format. - * Key: sequence description and position offset, delimited by ':' characters. - * Value: a ReferenceFragment object representing the entry. + * Reads the FASTA reference sequence format. 
Key: sequence description and position offset, + * delimited by ':' characters. Value: a ReferenceFragment object representing the entry. * - * Note: here sections in the input file are assumed to be delimited by single - * line descriptions that start with '>'. + *

Note: here sections in the input file are assumed to be delimited by single line descriptions + * that start with '>'. */ -public class FastaInputFormat extends FileInputFormat -{ - private static final Logger logger = LoggerFactory.getLogger(FastaInputFormat.class); - public static final Charset UTF8 = Charset.forName("UTF8"); - - @Override public List getSplits(JobContext job) throws IOException - { - - // Note: We generate splits that correspond to different sections in the FASTA - // input (which here are called "chromosomes", delimited by '>' and - // followed by a single line description. - // Some locality is preserved since the locations are formed from the input - // splits, although no special attention is given to this issues (FASTA files - // are assumed to be smallish). - // The splits are generated on the client. In the future the split generation - // should be only performed once and an index file stored inside HDFS for - // peformance reasons. Currently this is not attempted (again: FASTA files - // aren't all that big). - - // we first make sure we are given only a single file - - List splits = super.getSplits(job); - - // first sort by input path - Collections.sort(splits, new Comparator() - { - public int compare(InputSplit a, InputSplit b) { - FileSplit fa = (FileSplit)a, fb = (FileSplit)b; - return fa.getPath().compareTo(fb.getPath()); - } - }); - - for (int i = 0; i < splits.size()-1; i++) { - FileSplit fa = (FileSplit)splits.get(i); - FileSplit fb = (FileSplit)splits.get(i+1); - - if(fa.getPath().compareTo(fb.getPath()) != 0) - throw new IOException("FastaInputFormat assumes single FASTA input file!"); +public class FastaInputFormat extends FileInputFormat { + + public static final Charset UTF8 = Charset.forName("UTF8"); + private static final Logger logger = LoggerFactory.getLogger(FastaInputFormat.class); + + @Override + public List getSplits(JobContext job) throws IOException { + + // Note: We generate splits that correspond to different sections in the FASTA + // input (which here are called "chromosomes", delimited by '>' and + // followed by a single line description. + // Some locality is preserved since the locations are formed from the input + // splits, although no special attention is given to this issues (FASTA files + // are assumed to be smallish). + // The splits are generated on the client. In the future the split generation + // should be only performed once and an index file stored inside HDFS for + // peformance reasons. Currently this is not attempted (again: FASTA files + // aren't all that big). 
+ + // we first make sure we are given only a single file + + List splits = super.getSplits(job); + + // first sort by input path + Collections.sort( + splits, + new Comparator() { + public int compare(InputSplit a, InputSplit b) { + FileSplit fa = (FileSplit) a, fb = (FileSplit) b; + return fa.getPath().compareTo(fb.getPath()); + } + }); + + for (int i = 0; i < splits.size() - 1; i++) { + FileSplit fa = (FileSplit) splits.get(i); + FileSplit fb = (FileSplit) splits.get(i + 1); + + if (fa.getPath().compareTo(fb.getPath()) != 0) { + throw new IOException("FastaInputFormat assumes single FASTA input file!"); + } + } + + // now we are sure we only have one FASTA input file + + final List newSplits = new ArrayList(splits.size()); + FileSplit fileSplit = (FileSplit) splits.get(0); + Path path = fileSplit.getPath(); + + FileSystem fs = path.getFileSystem(job.getConfiguration()); + FSDataInputStream fis = fs.open(path); + byte[] buffer = new byte[1024]; + + long byte_counter = 0; + long prev_chromosome_byte_offset = 0; + boolean first_chromosome = true; + + for (int j = 0; j < splits.size(); j++) { + FileSplit origsplit = (FileSplit) splits.get(j); + + while (byte_counter < origsplit.getStart() + origsplit.getLength()) { + long bytes_read = + fis.read( + byte_counter, + buffer, + 0, + (int) + Math.min( + buffer.length, + origsplit.getStart() + origsplit.getLength() - byte_counter)); + if (logger.isDebugEnabled()) { + logger.debug("bytes_read: {} of {} splits", bytes_read, splits.size()); + } + if (bytes_read > 0) { + for (int i = 0; i < bytes_read; i++) { + if (buffer[i] == (byte) '>') { + if (logger.isDebugEnabled()) { + logger.debug("found chromosome at position {}", byte_counter + i); + } + + if (!first_chromosome) { + FileSplit fsplit = + new FileSplit( + path, + prev_chromosome_byte_offset, + byte_counter + i - 1 - prev_chromosome_byte_offset, + origsplit.getLocations()); + + if (logger.isDebugEnabled()) { + logger.debug( + "adding split: start: {}, length: {}", fsplit.getStart(), fsplit.getLength()); + } + newSplits.add(fsplit); + } + first_chromosome = false; + prev_chromosome_byte_offset = byte_counter + i; } - - // now we are sure we only have one FASTA input file - - final List newSplits = new ArrayList(splits.size()); - FileSplit fileSplit = (FileSplit)splits.get(0); - Path path = fileSplit.getPath(); - - FileSystem fs = path.getFileSystem(job.getConfiguration()); - FSDataInputStream fis = fs.open(path); - byte[] buffer = new byte[1024]; - - long byte_counter = 0; - long prev_chromosome_byte_offset = 0; - boolean first_chromosome = true; - - for(int j = 0; j < splits.size(); j++) { - FileSplit origsplit = (FileSplit)splits.get(j); - - while(byte_counter < origsplit.getStart()+origsplit.getLength()) { - long bytes_read = fis.read(byte_counter, buffer, 0, (int)Math.min(buffer.length, - origsplit.getStart()+origsplit.getLength()- byte_counter)); - if (logger.isDebugEnabled()) { - logger.debug("bytes_read: {} of {} splits", bytes_read, splits.size()); - } - if(bytes_read > 0) { - for(int i=0;i') { - if (logger.isDebugEnabled()) { - logger.debug("found chromosome at position {}", byte_counter + i); - } - - if(!first_chromosome) { - FileSplit fsplit = new FileSplit(path, prev_chromosome_byte_offset, byte_counter + i-1 - prev_chromosome_byte_offset, origsplit.getLocations()); - - if (logger.isDebugEnabled()) { - logger.debug("adding split: start: {}, length: {}", fsplit.getStart(), fsplit.getLength()); - } - newSplits.add(fsplit); - } - first_chromosome = false; - prev_chromosome_byte_offset 
= byte_counter + i; - } - } - byte_counter += bytes_read; - } - } - - if(j == splits.size()-1) { - FileSplit fsplit = new FileSplit(path, prev_chromosome_byte_offset, byte_counter - prev_chromosome_byte_offset, origsplit.getLocations()); - newSplits.add(fsplit); - if (logger.isDebugEnabled()) { - logger.debug("adding split: {}", fsplit); - } - break; - } - } - - return newSplits; - } - - public static class FastaRecordReader extends RecordReader - { - - // start: first valid data index - private long start; - // end: first index value beyond the slice, i.e. slice is in range [start,end) - private long end; - // pos: current position in file - private long pos; - // file: the file being read - private Path file; - - // current_split_pos: the current (chromosome) position within the split - private int current_split_pos; - // current_split_indexseq: the description/chromosome name - private String current_split_indexseq = null; - - private LineReader lineReader; - private InputStream inputStream; - private Text currentKey = new Text(); - private ReferenceFragment currentValue = new ReferenceFragment(); - - private Text buffer = new Text(); - - // How long can a FASTA line get? - public static final int MAX_LINE_LENGTH = 20000; - - public FastaRecordReader(Configuration conf, FileSplit split) throws IOException - { - setConf(conf); - file = split.getPath(); - start = split.getStart(); - end = start + split.getLength(); - current_split_pos = 1; - - FileSystem fs = file.getFileSystem(conf); - FSDataInputStream fileIn = fs.open(file); - - CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); - CompressionCodec codec = codecFactory.getCodec(file); - - if (codec == null) // no codec. Uncompressed file. - { - positionAtFirstRecord(fileIn); - inputStream = fileIn; - } - else - { // compressed file - if (start != 0) - throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")"); - - inputStream = codec.createInputStream(fileIn); - end = Long.MAX_VALUE; // read until the end of the file - } - - lineReader = new LineReader(inputStream); - } - - /* - * Position the input stream at the start of the first record. - */ - private void positionAtFirstRecord(FSDataInputStream stream) throws IOException - { - if (start > 0) - { - stream.seek(start); - } - - // we are now in a new chromosome/fragment, so read its name/index sequence - // and reset position counter - - // index sequence - LineReader reader = new LineReader(stream); - int bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); - - current_split_indexseq = buffer.toString(); - // now get rid of '>' character - current_split_indexseq = current_split_indexseq.substring(1,current_split_indexseq.length()); - - // initialize position counter - current_split_pos = 1; - - if (logger.isDebugEnabled()) { - logger.debug("read index sequence: {}", current_split_indexseq); - } - start = start + bytesRead; - stream.seek(start); - pos = start; - } - - protected void setConf(Configuration conf) - { - } - - /** - * Added to use mapreduce API. - */ - public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException - { - } - - /** - * Added to use mapreduce API. - */ - public Text getCurrentKey() - { - return currentKey; - } - - /** - * Added to use mapreduce API. - */ - public ReferenceFragment getCurrentValue() - { - return currentValue; - } - - /** - * Added to use mapreduce API. 
- */ - public boolean nextKeyValue() throws IOException, InterruptedException - { - return next(currentKey, currentValue); - } - - /** - * Close this RecordReader to future operations. - */ - public void close() throws IOException - { - inputStream.close(); - } - - /** - * Create an object of the appropriate type to be used as a key. - */ - public Text createKey() - { - return new Text(); - } - - /** - * Create an object of the appropriate type to be used as a value. - */ - public ReferenceFragment createValue() - { - return new ReferenceFragment(); - } - - /** - * Returns the current position in the input. - */ - public long getPos() { return pos; } - - /** - * How much of the input has the RecordReader consumed i.e. - */ - public float getProgress() - { - if (start == end) - return 1.0f; - else - return Math.min(1.0f, (pos - start) / (float)(end - start)); - } - - public String makePositionMessage(long pos) - { - return file.toString() + ":" + pos; - } - - public String makePositionMessage() - { - return file.toString() + ":" + pos; - } - - /** - * Reads the next key/value pair from the input for processing. - */ - public boolean next(Text key, ReferenceFragment value) throws IOException - { - if (pos >= end) - return false; // past end of slice - - int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH); - pos += bytesRead; - if (bytesRead >= MAX_LINE_LENGTH) - throw new RuntimeException("found abnormally large line (length " + bytesRead + ") at " + makePositionMessage(pos - bytesRead) + ": " + Text.decode(buffer.getBytes(), 0, 500)); - else if (bytesRead <= 0) - return false; // EOF - else - { - scanFastaLine(buffer, key, value); - current_split_pos += bytesRead; - return true; - } - } - - private void scanFastaLine(Text line, Text key, ReferenceFragment fragment) - { - // Build the key. We concatenate the chromosome/fragment descripion and - // the start position of the FASTA sequence line, replacing the tabs with colons. 
- key.clear(); - - key.append(current_split_indexseq.getBytes(UTF8), 0, current_split_indexseq.getBytes(UTF8).length); - key.append(Integer.toString(current_split_pos).getBytes(UTF8), 0, Integer.toString(current_split_pos).getBytes(UTF8).length); - // replace tabs with : - byte[] bytes = key.getBytes(); - int temporaryEnd = key.getLength(); - for (int i = 0; i < temporaryEnd; ++i) - if (bytes[i] == '\t') - bytes[i] = ':'; - - fragment.clear(); - fragment.setPosition(current_split_pos); - fragment.setIndexSequence(current_split_indexseq); - fragment.getSequence().append(line.getBytes(), 0, line.getBytes().length); - } - } - - @Override - public boolean isSplitable(JobContext context, Path path) - { - CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(path); - return codec == null; - } - - public RecordReader createRecordReader( - InputSplit genericSplit, - TaskAttemptContext context) throws IOException, InterruptedException - { - context.setStatus(genericSplit.toString()); - return new FastaRecordReader(context.getConfiguration(), (FileSplit)genericSplit); // cast as per example in TextInputFormat - } + } + byte_counter += bytes_read; + } + } + + if (j == splits.size() - 1) { + FileSplit fsplit = + new FileSplit( + path, + prev_chromosome_byte_offset, + byte_counter - prev_chromosome_byte_offset, + origsplit.getLocations()); + newSplits.add(fsplit); + if (logger.isDebugEnabled()) { + logger.debug("adding split: {}", fsplit); + } + break; + } + } + + return newSplits; + } + + @Override + public boolean isSplitable(JobContext context, Path path) { + CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(path); + return codec == null; + } + + public RecordReader createRecordReader( + InputSplit genericSplit, TaskAttemptContext context) + throws IOException, InterruptedException { + context.setStatus(genericSplit.toString()); + return new FastaRecordReader( + context.getConfiguration(), + (FileSplit) genericSplit); // cast as per example in TextInputFormat + } + + public static class FastaRecordReader extends RecordReader { + + // How long can a FASTA line get? + public static final int MAX_LINE_LENGTH = 20000; + // start: first valid data index + private long start; + // end: first index value beyond the slice, i.e. slice is in range [start,end) + private long end; + // pos: current position in file + private long pos; + // file: the file being read + private Path file; + // current_split_pos: the current (chromosome) position within the split + private int current_split_pos; + // current_split_indexseq: the description/chromosome name + private String current_split_indexseq = null; + private LineReader lineReader; + private InputStream inputStream; + private Text currentKey = new Text(); + private ReferenceFragment currentValue = new ReferenceFragment(); + private Text buffer = new Text(); + + public FastaRecordReader(Configuration conf, FileSplit split) throws IOException { + setConf(conf); + file = split.getPath(); + start = split.getStart(); + end = start + split.getLength(); + current_split_pos = 1; + + FileSystem fs = file.getFileSystem(conf); + FSDataInputStream fileIn = fs.open(file); + + CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); + CompressionCodec codec = codecFactory.getCodec(file); + + if (codec == null) // no codec. Uncompressed file. 
+ { + positionAtFirstRecord(fileIn); + inputStream = fileIn; + } else { // compressed file + if (start != 0) { + throw new RuntimeException( + "Start position for compressed file is not 0! (found " + start + ")"); + } + + inputStream = codec.createInputStream(fileIn); + end = Long.MAX_VALUE; // read until the end of the file + } + + lineReader = new LineReader(inputStream); + } + + /* + * Position the input stream at the start of the first record. + */ + private void positionAtFirstRecord(FSDataInputStream stream) throws IOException { + if (start > 0) { + stream.seek(start); + } + + // we are now in a new chromosome/fragment, so read its name/index sequence + // and reset position counter + + // index sequence + LineReader reader = new LineReader(stream); + int bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); + + current_split_indexseq = buffer.toString(); + // now get rid of '>' character + current_split_indexseq = current_split_indexseq.substring(1, current_split_indexseq.length()); + + // initialize position counter + current_split_pos = 1; + + if (logger.isDebugEnabled()) { + logger.debug("read index sequence: {}", current_split_indexseq); + } + start = start + bytesRead; + stream.seek(start); + pos = start; + } + + protected void setConf(Configuration conf) {} + + /** Added to use mapreduce API. */ + public void initialize(InputSplit split, TaskAttemptContext context) + throws IOException, InterruptedException {} + + /** Added to use mapreduce API. */ + public Text getCurrentKey() { + return currentKey; + } + + /** Added to use mapreduce API. */ + public ReferenceFragment getCurrentValue() { + return currentValue; + } + + /** Added to use mapreduce API. */ + public boolean nextKeyValue() throws IOException, InterruptedException { + return next(currentKey, currentValue); + } + + /** Close this RecordReader to future operations. */ + public void close() throws IOException { + inputStream.close(); + } + + /** Create an object of the appropriate type to be used as a key. */ + public Text createKey() { + return new Text(); + } + + /** Create an object of the appropriate type to be used as a value. */ + public ReferenceFragment createValue() { + return new ReferenceFragment(); + } + + /** Returns the current position in the input. */ + public long getPos() { + return pos; + } + + /** How much of the input has the RecordReader consumed i.e. */ + public float getProgress() { + if (start == end) { + return 1.0f; + } else { + return Math.min(1.0f, (pos - start) / (float) (end - start)); + } + } + + public String makePositionMessage(long pos) { + return file.toString() + ":" + pos; + } + + public String makePositionMessage() { + return file.toString() + ":" + pos; + } + + /** Reads the next key/value pair from the input for processing. */ + public boolean next(Text key, ReferenceFragment value) throws IOException { + if (pos >= end) { + return false; // past end of slice + } + + int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH); + pos += bytesRead; + if (bytesRead >= MAX_LINE_LENGTH) { + throw new RuntimeException( + "found abnormally large line (length " + + bytesRead + + ") at " + + makePositionMessage(pos - bytesRead) + + ": " + + Text.decode(buffer.getBytes(), 0, 500)); + } else if (bytesRead <= 0) { + return false; // EOF + } else { + scanFastaLine(buffer, key, value); + current_split_pos += bytesRead; + return true; + } + } + + private void scanFastaLine(Text line, Text key, ReferenceFragment fragment) { + // Build the key. 
We concatenate the chromosome/fragment descripion and + // the start position of the FASTA sequence line, replacing the tabs with colons. + key.clear(); + + key.append( + current_split_indexseq.getBytes(UTF8), 0, current_split_indexseq.getBytes(UTF8).length); + key.append( + Integer.toString(current_split_pos).getBytes(UTF8), + 0, + Integer.toString(current_split_pos).getBytes(UTF8).length); + // replace tabs with : + byte[] bytes = key.getBytes(); + int temporaryEnd = key.getLength(); + for (int i = 0; i < temporaryEnd; ++i) { + if (bytes[i] == '\t') { + bytes[i] = ':'; + } + } + + fragment.clear(); + fragment.setPosition(current_split_pos); + fragment.setIndexSequence(current_split_indexseq); + fragment.getSequence().append(line.getBytes(), 0, line.getBytes().length); + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/FastqInputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/FastqInputFormat.java index 7c008c7..f9131ed 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/FastqInputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/FastqInputFormat.java @@ -22,386 +22,367 @@ package org.seqdoop.hadoop_bam; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.util.regex.*; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.compress.*; import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.JobContext; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.io.compress.*; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; - -import java.io.InputStream; -import java.io.IOException; -import java.io.EOFException; - -import java.util.regex.*; - +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding; import org.seqdoop.hadoop_bam.util.ConfHelper; -public class FastqInputFormat extends FileInputFormat -{ - public static final String CONF_BASE_QUALITY_ENCODING = "hbam.fastq-input.base-quality-encoding"; - public static final String CONF_FILTER_FAILED_QC = "hbam.fastq-input.filter-failed-qc"; - public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "sanger"; - - public static class FastqRecordReader extends RecordReader - { - /* - * fastq format: - * := + - * := @\n\n+[]\n\n - * := [A-Za-z0-9_.:-]+ - * := [A-Za-z\n\.~]+ - * := [!-~\n]+ - * - * LP: this format is broken, no? You can have multi-line sequence and quality strings, - * and the quality encoding includes '@' in its valid character range. So how should one - * distinguish between \n@ as a record delimiter and and \n@ as part of a multi-line - * quality string? - * - * For now I'm going to assume single-line sequences. This works for our sequencing - * application. We'll see if someone complains in other applications. - */ - - // start: first valid data index - private long start; - // end: first index value beyond the slice, i.e. 
slice is in range [start,end) - private long end; - // pos: current position in file - private long pos; - // file: the file being read - private Path file; - - private LineReader lineReader; - private InputStream inputStream; - private Text currentKey = new Text(); - private SequencedFragment currentValue = new SequencedFragment(); - - /* If true, will scan the identifier for read data as specified in the Casava - * users' guide v1.8: - * @:::::: ::: - * After the first name that doesn't match lookForIlluminaIdentifier will be - * set to false and no further scanning will be done. - */ - private boolean lookForIlluminaIdentifier = true; - private static final Pattern ILLUMINA_PATTERN = Pattern.compile("([^:]+):(\\d+):([^:]*):(\\d+):(\\d+):(-?\\d+):(-?\\d+)\\s+([123]):([YN]):(\\d+):(.*)"); - - private Text buffer = new Text(); - - private BaseQualityEncoding qualityEncoding; - private boolean filterFailedQC = false; - - // How long can a read get? - private static final int MAX_LINE_LENGTH = 10000; - - public FastqRecordReader(Configuration conf, FileSplit split) throws IOException - { - setConf(conf); - file = split.getPath(); - start = split.getStart(); - end = start + split.getLength(); - - FileSystem fs = file.getFileSystem(conf); - FSDataInputStream fileIn = fs.open(file); - - CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); - CompressionCodec codec = codecFactory.getCodec(file); - - if (codec == null) // no codec. Uncompressed file. - { - positionAtFirstRecord(fileIn); - inputStream = fileIn; - } - else - { // compressed file - if (start != 0) - throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")"); - - inputStream = codec.createInputStream(fileIn); - end = Long.MAX_VALUE; // read until the end of the file - } - - lineReader = new LineReader(inputStream); - } - - protected void setConf(Configuration conf) - { - String encoding = - conf.get(FastqInputFormat.CONF_BASE_QUALITY_ENCODING, - conf.get(FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING, - FastqInputFormat.CONF_BASE_QUALITY_ENCODING_DEFAULT)); - - if ("illumina".equals(encoding)) - qualityEncoding = BaseQualityEncoding.Illumina; - else if ("sanger".equals(encoding)) - qualityEncoding = BaseQualityEncoding.Sanger; - else - throw new RuntimeException("Unknown input base quality encoding value " + encoding); - - filterFailedQC = ConfHelper.parseBoolean( - conf.get(FastqInputFormat.CONF_FILTER_FAILED_QC, - conf.get(FormatConstants.CONF_INPUT_FILTER_FAILED_QC)), - false); - } - - /* - * Position the input stream at the start of the first record. - */ - private void positionAtFirstRecord(FSDataInputStream stream) throws IOException - { - if (start > 0) - { - // Advance to the start of the first record - // We use a temporary LineReader to read lines until we find the - // position of the right one. We then seek the file to that position. - stream.seek(start); - LineReader reader = new LineReader(stream); - - int bytesRead = 0; - do - { - bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); - if (bytesRead > 0 && (buffer.getLength() <= 0 || buffer.getBytes()[0] != '@')) - start += bytesRead; - else - { - // line starts with @. 
Read two more and verify that it starts with a + - // - // If this isn't the start of a record, we want to backtrack to its end - long backtrackPosition = start + bytesRead; - - bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); - bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); - if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') - break; // all good! - else - { - // backtrack to the end of the record we thought was the start. - start = backtrackPosition; - stream.seek(start); - reader = new LineReader(stream); - } - } - } while (bytesRead > 0); - - stream.seek(start); - } - // else - // if start == 0 we presume it starts with a valid fastq record - pos = start; - } - - /** - * Added to use mapreduce API. - */ - public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException - { - } - - /** - * Added to use mapreduce API. - */ - public Text getCurrentKey() - { - return currentKey; - } - - /** - * Added to use mapreduce API. - */ - public SequencedFragment getCurrentValue() - { - return currentValue; - } - - /** - * Added to use mapreduce API. - */ - public boolean nextKeyValue() throws IOException, InterruptedException - { - return next(currentKey, currentValue); - } - - /** - * Close this RecordReader to future operations. - */ - public void close() throws IOException - { - inputStream.close(); - } - - /** - * Create an object of the appropriate type to be used as a key. - */ - public Text createKey() - { - return new Text(); - } - - /** - * Create an object of the appropriate type to be used as a value. - */ - public SequencedFragment createValue() - { - return new SequencedFragment(); - } - - /** - * Returns the current position in the input. - */ - public long getPos() { return pos; } - - /** - * How much of the input has the RecordReader consumed i.e. - */ - public float getProgress() - { - if (start == end) - return 1.0f; - else - return Math.min(1.0f, (pos - start) / (float)(end - start)); - } - - public String makePositionMessage() - { - return file.toString() + ":" + pos; - } - - protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException - { - // ID line - long skipped = lineReader.skip(1); // skip @ - pos += skipped; - if (skipped == 0) - return false; // EOF - - // ID - readLineInto(key); - // sequence - value.clear(); - readLineInto(value.getSequence()); - readLineInto(buffer); - if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') - throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); - readLineInto(value.getQuality()); - - // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false - lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); - if (!lookForIlluminaIdentifier) - scanNameForReadNumber(key, value); - return true; - } - - - /** - * Reads the next key/value pair from the input for processing. 
- */ - public boolean next(Text key, SequencedFragment value) throws IOException - { - if (pos >= end) - return false; // past end of slice - try - { - boolean gotData; - boolean goodRecord; - do { - gotData = lowLevelFastqRead(key, value); - goodRecord = gotData && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); - } while (gotData && !goodRecord); - - if (goodRecord) // goodRecord falso also when we couldn't read any more data - { - if (qualityEncoding == BaseQualityEncoding.Illumina) - { - try - { - // convert illumina to sanger scale - SequencedFragment.convertQuality(value.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger); - } catch (FormatException e) { - throw new FormatException(e.getMessage() + " Position: " + makePositionMessage() + "; Sequence ID: " + key); - } - } - else // sanger qualities. - { - int outOfRangeElement = SequencedFragment.verifyQuality(value.getQuality(), BaseQualityEncoding.Sanger); - if (outOfRangeElement >= 0) - { - throw new FormatException("fastq base quality score out of range for Sanger Phred+33 format (found " + - (value.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" + - "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n" + - "Position: " + makePositionMessage() + "; Sequence ID: " + key); - } - } - } - return goodRecord; - } - catch (EOFException e) { - throw new RuntimeException("unexpected end of file in fastq record at " + makePositionMessage() + ". Id: " + key.toString()); - } - } - - private void scanNameForReadNumber(Text name, SequencedFragment fragment) - { - // look for a /[0-9] at the end of the name - if (name.getLength() >= 2) - { - byte[] bytes = name.getBytes(); - int last = name.getLength() - 1; - - if (bytes[last-1] == '/' && bytes[last] >= '0' && bytes[last] <= '9') - fragment.setRead(bytes[last] - '0'); - } - } - - private boolean scanIlluminaId(Text name, SequencedFragment fragment) - { - Matcher m = ILLUMINA_PATTERN.matcher(name.toString()); - boolean matches = m.matches(); - if (matches) - { - fragment.setInstrument(m.group(1)); - fragment.setRunNumber(Integer.parseInt(m.group(2))); - fragment.setFlowcellId(m.group(3)); - fragment.setLane(Integer.parseInt(m.group(4))); - fragment.setTile(Integer.parseInt(m.group(5))); - fragment.setXpos(Integer.parseInt(m.group(6))); - fragment.setYpos(Integer.parseInt(m.group(7))); - fragment.setRead(Integer.parseInt(m.group(8))); - fragment.setFilterPassed("N".equals(m.group(9))); - fragment.setControlNumber(Integer.parseInt(m.group(10))); - fragment.setIndexSequence(m.group(11)); - } - return matches; - } - - private int readLineInto(Text dest) throws EOFException, IOException - { - int bytesRead = lineReader.readLine(dest, MAX_LINE_LENGTH); - if (bytesRead <= 0) - throw new EOFException(); - pos += bytesRead; - return bytesRead; - } - } - - @Override - public boolean isSplitable(JobContext context, Path path) - { - CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(path); - return codec == null; - } - - public RecordReader createRecordReader( - InputSplit genericSplit, - TaskAttemptContext context) throws IOException, InterruptedException - { - context.setStatus(genericSplit.toString()); - return new FastqRecordReader(context.getConfiguration(), (FileSplit)genericSplit); // cast as per example in TextInputFormat - } +public class FastqInputFormat extends FileInputFormat { + + public static final String 
CONF_BASE_QUALITY_ENCODING = "hbam.fastq-input.base-quality-encoding"; + public static final String CONF_FILTER_FAILED_QC = "hbam.fastq-input.filter-failed-qc"; + public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "sanger"; + + @Override + public boolean isSplitable(JobContext context, Path path) { + CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(path); + return codec == null; + } + + public RecordReader createRecordReader( + InputSplit genericSplit, TaskAttemptContext context) + throws IOException, InterruptedException { + context.setStatus(genericSplit.toString()); + return new FastqRecordReader( + context.getConfiguration(), + (FileSplit) genericSplit); // cast as per example in TextInputFormat + } + + public static class FastqRecordReader extends RecordReader { + /* + * fastq format: + * := + + * := @\n\n+[]\n\n + * := [A-Za-z0-9_.:-]+ + * := [A-Za-z\n\.~]+ + * := [!-~\n]+ + * + * LP: this format is broken, no? You can have multi-line sequence and quality strings, + * and the quality encoding includes '@' in its valid character range. So how should one + * distinguish between \n@ as a record delimiter and and \n@ as part of a multi-line + * quality string? + * + * For now I'm going to assume single-line sequences. This works for our sequencing + * application. We'll see if someone complains in other applications. + */ + + private static final Pattern ILLUMINA_PATTERN = + Pattern.compile( + "([^:]+):(\\d+):([^:]*):(\\d+):(\\d+):(-?\\d+):(-?\\d+)\\s+([123]):([YN]):(\\d+):(.*)"); + // How long can a read get? + private static final int MAX_LINE_LENGTH = 10000; + // start: first valid data index + private long start; + // end: first index value beyond the slice, i.e. slice is in range [start,end) + private long end; + // pos: current position in file + private long pos; + // file: the file being read + private Path file; + private LineReader lineReader; + private InputStream inputStream; + private Text currentKey = new Text(); + private SequencedFragment currentValue = new SequencedFragment(); + /* If true, will scan the identifier for read data as specified in the Casava + * users' guide v1.8: + * @:::::: ::: + * After the first name that doesn't match lookForIlluminaIdentifier will be + * set to false and no further scanning will be done. + */ + private boolean lookForIlluminaIdentifier = true; + private Text buffer = new Text(); + private BaseQualityEncoding qualityEncoding; + private boolean filterFailedQC = false; + + public FastqRecordReader(Configuration conf, FileSplit split) throws IOException { + setConf(conf); + file = split.getPath(); + start = split.getStart(); + end = start + split.getLength(); + + FileSystem fs = file.getFileSystem(conf); + FSDataInputStream fileIn = fs.open(file); + + CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); + CompressionCodec codec = codecFactory.getCodec(file); + + if (codec == null) // no codec. Uncompressed file. + { + positionAtFirstRecord(fileIn); + inputStream = fileIn; + } else { // compressed file + if (start != 0) { + throw new RuntimeException( + "Start position for compressed file is not 0! 
(found " + start + ")"); + } + + inputStream = codec.createInputStream(fileIn); + end = Long.MAX_VALUE; // read until the end of the file + } + + lineReader = new LineReader(inputStream); + } + + protected void setConf(Configuration conf) { + String encoding = + conf.get( + FastqInputFormat.CONF_BASE_QUALITY_ENCODING, + conf.get( + FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING, + FastqInputFormat.CONF_BASE_QUALITY_ENCODING_DEFAULT)); + + if ("illumina".equals(encoding)) { + qualityEncoding = BaseQualityEncoding.Illumina; + } else if ("sanger".equals(encoding)) { + qualityEncoding = BaseQualityEncoding.Sanger; + } else { + throw new RuntimeException("Unknown input base quality encoding value " + encoding); + } + + filterFailedQC = + ConfHelper.parseBoolean( + conf.get( + FastqInputFormat.CONF_FILTER_FAILED_QC, + conf.get(FormatConstants.CONF_INPUT_FILTER_FAILED_QC)), + false); + } + + /* + * Position the input stream at the start of the first record. + */ + private void positionAtFirstRecord(FSDataInputStream stream) throws IOException { + if (start > 0) { + // Advance to the start of the first record + // We use a temporary LineReader to read lines until we find the + // position of the right one. We then seek the file to that position. + stream.seek(start); + LineReader reader = new LineReader(stream); + + int bytesRead = 0; + do { + bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); + if (bytesRead > 0 && (buffer.getLength() <= 0 || buffer.getBytes()[0] != '@')) { + start += bytesRead; + } else { + // line starts with @. Read two more and verify that it starts with a + + // + // If this isn't the start of a record, we want to backtrack to its end + long backtrackPosition = start + bytesRead; + + bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); + bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); + if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') { + break; // all good! + } else { + // backtrack to the end of the record we thought was the start. + start = backtrackPosition; + stream.seek(start); + reader = new LineReader(stream); + } + } + } while (bytesRead > 0); + + stream.seek(start); + } + // else + // if start == 0 we presume it starts with a valid fastq record + pos = start; + } + + /** Added to use mapreduce API. */ + public void initialize(InputSplit split, TaskAttemptContext context) + throws IOException, InterruptedException {} + + /** Added to use mapreduce API. */ + public Text getCurrentKey() { + return currentKey; + } + + /** Added to use mapreduce API. */ + public SequencedFragment getCurrentValue() { + return currentValue; + } + + /** Added to use mapreduce API. */ + public boolean nextKeyValue() throws IOException, InterruptedException { + return next(currentKey, currentValue); + } + + /** Close this RecordReader to future operations. */ + public void close() throws IOException { + inputStream.close(); + } + + /** Create an object of the appropriate type to be used as a key. */ + public Text createKey() { + return new Text(); + } + + /** Create an object of the appropriate type to be used as a value. */ + public SequencedFragment createValue() { + return new SequencedFragment(); + } + + /** Returns the current position in the input. */ + public long getPos() { + return pos; + } + + /** How much of the input has the RecordReader consumed i.e. 
*/ + public float getProgress() { + if (start == end) { + return 1.0f; + } else { + return Math.min(1.0f, (pos - start) / (float) (end - start)); + } + } + + public String makePositionMessage() { + return file.toString() + ":" + pos; + } + + protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { + // ID line + long skipped = lineReader.skip(1); // skip @ + pos += skipped; + if (skipped == 0) { + return false; // EOF + } + + // ID + readLineInto(key); + // sequence + value.clear(); + readLineInto(value.getSequence()); + readLineInto(buffer); + if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') { + throw new RuntimeException( + "unexpected fastq line separating sequence and quality at " + + makePositionMessage() + + ". Line: " + + buffer + + ". \nSequence ID: " + + key); + } + readLineInto(value.getQuality()); + + // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will + // be set to false + lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); + if (!lookForIlluminaIdentifier) { + scanNameForReadNumber(key, value); + } + return true; + } + + /** Reads the next key/value pair from the input for processing. */ + public boolean next(Text key, SequencedFragment value) throws IOException { + if (pos >= end) { + return false; // past end of slice + } + try { + boolean gotData; + boolean goodRecord; + do { + gotData = lowLevelFastqRead(key, value); + goodRecord = + gotData + && (!filterFailedQC + || value.getFilterPassed() == null + || value.getFilterPassed()); + } while (gotData && !goodRecord); + + if (goodRecord) // goodRecord falso also when we couldn't read any more data + { + if (qualityEncoding == BaseQualityEncoding.Illumina) { + try { + // convert illumina to sanger scale + SequencedFragment.convertQuality( + value.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger); + } catch (FormatException e) { + throw new FormatException( + e.getMessage() + " Position: " + makePositionMessage() + "; Sequence ID: " + key); + } + } else // sanger qualities. + { + int outOfRangeElement = + SequencedFragment.verifyQuality(value.getQuality(), BaseQualityEncoding.Sanger); + if (outOfRangeElement >= 0) { + throw new FormatException( + "fastq base quality score out of range for Sanger Phred+33 format (found " + + (value.getQuality().getBytes()[outOfRangeElement] + - FormatConstants.SANGER_OFFSET) + + ").\n" + + "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n" + + "Position: " + + makePositionMessage() + + "; Sequence ID: " + + key); + } + } + } + return goodRecord; + } catch (EOFException e) { + throw new RuntimeException( + "unexpected end of file in fastq record at " + + makePositionMessage() + + ". 
Id: " + + key.toString()); + } + } + + private void scanNameForReadNumber(Text name, SequencedFragment fragment) { + // look for a /[0-9] at the end of the name + if (name.getLength() >= 2) { + byte[] bytes = name.getBytes(); + int last = name.getLength() - 1; + + if (bytes[last - 1] == '/' && bytes[last] >= '0' && bytes[last] <= '9') { + fragment.setRead(bytes[last] - '0'); + } + } + } + + private boolean scanIlluminaId(Text name, SequencedFragment fragment) { + Matcher m = ILLUMINA_PATTERN.matcher(name.toString()); + boolean matches = m.matches(); + if (matches) { + fragment.setInstrument(m.group(1)); + fragment.setRunNumber(Integer.parseInt(m.group(2))); + fragment.setFlowcellId(m.group(3)); + fragment.setLane(Integer.parseInt(m.group(4))); + fragment.setTile(Integer.parseInt(m.group(5))); + fragment.setXpos(Integer.parseInt(m.group(6))); + fragment.setYpos(Integer.parseInt(m.group(7))); + fragment.setRead(Integer.parseInt(m.group(8))); + fragment.setFilterPassed("N".equals(m.group(9))); + fragment.setControlNumber(Integer.parseInt(m.group(10))); + fragment.setIndexSequence(m.group(11)); + } + return matches; + } + + private int readLineInto(Text dest) throws EOFException, IOException { + int bytesRead = lineReader.readLine(dest, MAX_LINE_LENGTH); + if (bytesRead <= 0) { + throw new EOFException(); + } + pos += bytesRead; + return bytesRead; + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/FastqOutputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/FastqOutputFormat.java index 3550c89..d6ed1d7 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/FastqOutputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/FastqOutputFormat.java @@ -23,9 +23,8 @@ package org.seqdoop.hadoop_bam; import java.io.DataOutputStream; -import java.io.OutputStream; import java.io.IOException; - +import java.io.OutputStream; import java.nio.charset.Charset; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; @@ -38,147 +37,150 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.ReflectionUtils; - import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding; -/** - * Output format for the fastq format. - */ +/** Output format for the fastq format. */ // If a key is provided with the SequencedFragment, the key is used as the sequence // id and the meta-info from the SequencedFragment (if any) is dropped. 
// If the key is null, then the format will attempt to create an // Illumina-style fastq id as specified in the Casava users' guide v1.8: -// @instrument:run number:flowcell ID:lane:tile:x-pos:y-pos \s+ read:is filtered:control number:index sequence +// @instrument:run number:flowcell ID:lane:tile:x-pos:y-pos \s+ read:is filtered:control +// number:index sequence // -public class FastqOutputFormat extends TextOutputFormat -{ - public static final String CONF_BASE_QUALITY_ENCODING = "hbam.fastq-output.base-quality-encoding"; - public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "sanger"; - public static final Charset UTF8 = Charset.forName("UTF8"); - - static final byte[] PLUS_LINE; - static { - try { - PLUS_LINE = "\n+\n".getBytes("us-ascii"); - } catch (java.io.UnsupportedEncodingException e) { - throw new RuntimeException("us-ascii encoding not supported!"); - } - } - - public static class FastqRecordWriter extends RecordWriter - { - protected StringBuilder sBuilder = new StringBuilder(800); - protected Text buffer = new Text(); - protected OutputStream out; - protected BaseQualityEncoding baseQualityFormat; - - public FastqRecordWriter(Configuration conf, OutputStream out) - { - this.out = out; - setConf(conf); - } - - public void setConf(Configuration conf) - { - String setting = conf.get(CONF_BASE_QUALITY_ENCODING, CONF_BASE_QUALITY_ENCODING_DEFAULT); - if ("illumina".equals(setting)) - baseQualityFormat = BaseQualityEncoding.Illumina; - else if ("sanger".equals(setting)) - baseQualityFormat = BaseQualityEncoding.Sanger; - else - throw new RuntimeException("Invalid property value '" + setting + "' for " + CONF_BASE_QUALITY_ENCODING + ". Valid values are 'illumina' or 'sanger'"); - } - - protected String makeId(SequencedFragment seq) throws IOException - { - String delim = ":"; - sBuilder.delete(0, sBuilder.length()); // clear - - sBuilder.append( seq.getInstrument() == null ? "" : seq.getInstrument() ).append(delim); - sBuilder.append( seq.getRunNumber() == null ? "" : seq.getRunNumber().toString() ).append(delim); - sBuilder.append( seq.getFlowcellId() == null ? "" : seq.getFlowcellId() ).append(delim); - sBuilder.append( seq.getLane() == null ? "" : seq.getLane().toString() ).append(delim); - sBuilder.append( seq.getTile() == null ? "" : seq.getTile().toString() ).append(delim); - sBuilder.append( seq.getXpos() == null ? "" : seq.getXpos().toString() ).append(delim); - sBuilder.append( seq.getYpos() == null ? "" : seq.getYpos().toString() ); - - sBuilder.append(" "); // space - - sBuilder.append( seq.getRead() == null ? "" : seq.getRead().toString() ).append(delim); - sBuilder.append(seq.getFilterPassed() == null || seq.getFilterPassed() ? "N" : "Y"); - sBuilder.append(delim); - - sBuilder.append( seq.getControlNumber() == null ? "0" : seq.getControlNumber().toString()).append(delim); - sBuilder.append( seq.getIndexSequence() == null ? 
"" : seq.getIndexSequence()); - - return sBuilder.toString(); - } - - public void write(Text key, SequencedFragment seq) throws IOException - { - // write the id line - out.write('@'); - if (key != null) - out.write(key.getBytes(), 0, key.getLength()); - else - out.write(makeId(seq).getBytes(UTF8)); - out.write('\n'); - - // write the sequence and separator - out.write(seq.getSequence().getBytes(), 0, seq.getSequence().getLength()); - out.write(PLUS_LINE); - - // now the quality - if (baseQualityFormat == BaseQualityEncoding.Sanger) - out.write(seq.getQuality().getBytes(), 0, seq.getQuality().getLength()); - else if (baseQualityFormat == BaseQualityEncoding.Illumina) - { - buffer.set(seq.getQuality()); - SequencedFragment.convertQuality(buffer, BaseQualityEncoding.Sanger, baseQualityFormat); - out.write(buffer.getBytes(), 0, buffer.getLength()); - } - else - throw new RuntimeException("FastqOutputFormat: unknown base quality format " + baseQualityFormat); - - // and the final newline - out.write('\n'); - } - - public void close(TaskAttemptContext task) throws IOException - { - out.close(); - } +public class FastqOutputFormat extends TextOutputFormat { + + public static final String CONF_BASE_QUALITY_ENCODING = "hbam.fastq-output.base-quality-encoding"; + public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "sanger"; + public static final Charset UTF8 = Charset.forName("UTF8"); + + static final byte[] PLUS_LINE; + + static { + try { + PLUS_LINE = "\n+\n".getBytes("us-ascii"); + } catch (java.io.UnsupportedEncodingException e) { + throw new RuntimeException("us-ascii encoding not supported!"); + } + } + + public RecordWriter getRecordWriter(TaskAttemptContext task) + throws IOException { + Configuration conf = task.getConfiguration(); + boolean isCompressed = getCompressOutput(task); + + CompressionCodec codec = null; + String extension = ""; + + if (isCompressed) { + Class codecClass = + getOutputCompressorClass(task, GzipCodec.class); + codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); + extension = codec.getDefaultExtension(); + } + + Path file = getDefaultWorkFile(task, extension); + FileSystem fs = file.getFileSystem(conf); + + OutputStream output; + + if (isCompressed) { + FSDataOutputStream fileOut = fs.create(file, false); + output = new DataOutputStream(codec.createOutputStream(fileOut)); + } else { + output = fs.create(file, false); + } + + return new FastqRecordWriter(conf, output); } - public RecordWriter getRecordWriter(TaskAttemptContext task) - throws IOException - { - Configuration conf = task.getConfiguration(); - boolean isCompressed = getCompressOutput(task); - - CompressionCodec codec = null; - String extension = ""; - - if (isCompressed) - { - Class codecClass = getOutputCompressorClass(task, GzipCodec.class); - codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); - extension = codec.getDefaultExtension(); - } - - Path file = getDefaultWorkFile(task, extension); - FileSystem fs = file.getFileSystem(conf); - - OutputStream output; - - if (isCompressed) - { - FSDataOutputStream fileOut = fs.create(file, false); - output = new DataOutputStream(codec.createOutputStream(fileOut)); - } - else - output = fs.create(file, false); - - return new FastqRecordWriter(conf, output); - } + public static class FastqRecordWriter extends RecordWriter { + + protected StringBuilder sBuilder = new StringBuilder(800); + protected Text buffer = new Text(); + protected OutputStream out; + protected BaseQualityEncoding baseQualityFormat; + + 
public FastqRecordWriter(Configuration conf, OutputStream out) { + this.out = out; + setConf(conf); + } + + public void setConf(Configuration conf) { + String setting = conf.get(CONF_BASE_QUALITY_ENCODING, CONF_BASE_QUALITY_ENCODING_DEFAULT); + if ("illumina".equals(setting)) { + baseQualityFormat = BaseQualityEncoding.Illumina; + } else if ("sanger".equals(setting)) { + baseQualityFormat = BaseQualityEncoding.Sanger; + } else { + throw new RuntimeException( + "Invalid property value '" + + setting + + "' for " + + CONF_BASE_QUALITY_ENCODING + + ". Valid values are 'illumina' or 'sanger'"); + } + } + + protected String makeId(SequencedFragment seq) throws IOException { + String delim = ":"; + sBuilder.delete(0, sBuilder.length()); // clear + + sBuilder.append(seq.getInstrument() == null ? "" : seq.getInstrument()).append(delim); + sBuilder + .append(seq.getRunNumber() == null ? "" : seq.getRunNumber().toString()) + .append(delim); + sBuilder.append(seq.getFlowcellId() == null ? "" : seq.getFlowcellId()).append(delim); + sBuilder.append(seq.getLane() == null ? "" : seq.getLane().toString()).append(delim); + sBuilder.append(seq.getTile() == null ? "" : seq.getTile().toString()).append(delim); + sBuilder.append(seq.getXpos() == null ? "" : seq.getXpos().toString()).append(delim); + sBuilder.append(seq.getYpos() == null ? "" : seq.getYpos().toString()); + + sBuilder.append(" "); // space + + sBuilder.append(seq.getRead() == null ? "" : seq.getRead().toString()).append(delim); + sBuilder.append(seq.getFilterPassed() == null || seq.getFilterPassed() ? "N" : "Y"); + sBuilder.append(delim); + + sBuilder + .append(seq.getControlNumber() == null ? "0" : seq.getControlNumber().toString()) + .append(delim); + sBuilder.append(seq.getIndexSequence() == null ? "" : seq.getIndexSequence()); + + return sBuilder.toString(); + } + + public void write(Text key, SequencedFragment seq) throws IOException { + // write the id line + out.write('@'); + if (key != null) { + out.write(key.getBytes(), 0, key.getLength()); + } else { + out.write(makeId(seq).getBytes(UTF8)); + } + out.write('\n'); + + // write the sequence and separator + out.write(seq.getSequence().getBytes(), 0, seq.getSequence().getLength()); + out.write(PLUS_LINE); + + // now the quality + if (baseQualityFormat == BaseQualityEncoding.Sanger) { + out.write(seq.getQuality().getBytes(), 0, seq.getQuality().getLength()); + } else if (baseQualityFormat == BaseQualityEncoding.Illumina) { + buffer.set(seq.getQuality()); + SequencedFragment.convertQuality(buffer, BaseQualityEncoding.Sanger, baseQualityFormat); + out.write(buffer.getBytes(), 0, buffer.getLength()); + } else { + throw new RuntimeException( + "FastqOutputFormat: unknown base quality format " + baseQualityFormat); + } + + // and the final newline + out.write('\n'); + } + + public void close(TaskAttemptContext task) throws IOException { + out.close(); + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/FileVirtualSplit.java b/src/main/java/org/seqdoop/hadoop_bam/FileVirtualSplit.java index 5aa2a8a..506fabe 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/FileVirtualSplit.java +++ b/src/main/java/org/seqdoop/hadoop_bam/FileVirtualSplit.java @@ -22,105 +22,127 @@ package org.seqdoop.hadoop_bam; -import java.io.DataOutput; import java.io.DataInput; +import java.io.DataOutput; import java.io.IOException; - import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.InputSplit; -/** Like a {@link 
org.apache.hadoop.mapreduce.lib.input.FileSplit}, but uses - * BGZF virtual offsets to fit with {@link - * htsjdk.samtools.util.BlockCompressedInputStream}. +/** + * Like a {@link org.apache.hadoop.mapreduce.lib.input.FileSplit}, but uses BGZF virtual offsets to + * fit with {@link htsjdk.samtools.util.BlockCompressedInputStream}. */ public class FileVirtualSplit extends InputSplit implements Writable { - private Path file; - private long vStart; - private long vEnd; - private final String[] locations; - private long[] intervalFilePointers; - - private static final String[] NO_LOCATIONS = {}; - - public FileVirtualSplit() { locations = NO_LOCATIONS; } - - public FileVirtualSplit(Path f, long vs, long ve, String[] locs) { - file = f; - vStart = vs; - vEnd = ve; - locations = locs; - } - - public FileVirtualSplit(Path f, long vs, long ve, String[] locs, long[] intervalFilePointers) { - file = f; - vStart = vs; - vEnd = ve; - locations = locs; - this.intervalFilePointers = intervalFilePointers; - } - - @Override public String[] getLocations() { return locations; } - - /** Inexact due to the nature of virtual offsets. - * - * We can't know how many blocks there are in between two file offsets, nor - * how large those blocks are. So this uses only the difference between the - * file offsets—unless that difference is zero, in which case the split is - * wholly contained in one block and thus we can give an exact result. - */ - @Override public long getLength() { - final long vsHi = vStart & ~0xffff; - final long veHi = vEnd & ~0xffff; - final long hiDiff = veHi - vsHi; - return hiDiff == 0 ? ((vEnd & 0xffff) - (vStart & 0xffff)) : hiDiff; - } - - public Path getPath() { return file; } - - /** Inclusive. */ - public long getStartVirtualOffset() { return vStart; } - - /** Exclusive. */ - public long getEndVirtualOffset() { return vEnd; } - - public void setStartVirtualOffset(long vo) { vStart = vo; } - public void setEndVirtualOffset(long vo) { vEnd = vo; } - - /** - * @return pairs of virtual file pointers for all intervals that should be used for - * filtering the split, or null if there are none. These correspond to - * BAMFileSpan chunk start/stop pointers in htsjdk. 
- */ - public long[] getIntervalFilePointers() { - return intervalFilePointers; - } - - @Override public void write(DataOutput out) throws IOException { - Text.writeString(out, file.toString()); - out.writeLong(vStart); - out.writeLong(vEnd); - out.writeBoolean(intervalFilePointers != null); - if (intervalFilePointers != null) { - out.writeInt(intervalFilePointers.length); - for (int i = 0; i < intervalFilePointers.length; i++) { - out.writeLong(intervalFilePointers[i]); - } - } - } - @Override public void readFields(DataInput in) throws IOException { - file = new Path(Text.readString(in)); - vStart = in.readLong(); - vEnd = in.readLong(); - if (in.readBoolean()) { - intervalFilePointers = new long[in.readInt()]; - for (int i = 0; i < intervalFilePointers.length; i++) { - intervalFilePointers[i] = in.readLong(); - } - } - } - - @Override - public String toString() { return file + ":" + vStart + "-" + vEnd; } + + private static final String[] NO_LOCATIONS = {}; + private final String[] locations; + private Path file; + private long vStart; + private long vEnd; + private long[] intervalFilePointers; + + public FileVirtualSplit() { + locations = NO_LOCATIONS; + } + + public FileVirtualSplit(Path f, long vs, long ve, String[] locs) { + file = f; + vStart = vs; + vEnd = ve; + locations = locs; + } + + public FileVirtualSplit(Path f, long vs, long ve, String[] locs, long[] intervalFilePointers) { + file = f; + vStart = vs; + vEnd = ve; + locations = locs; + this.intervalFilePointers = intervalFilePointers; + } + + @Override + public String[] getLocations() { + return locations; + } + + /** + * Inexact due to the nature of virtual offsets. + * + *
<p>
We can't know how many blocks there are in between two file offsets, nor how large those + * blocks are. So this uses only the difference between the file offsets—unless that difference is + * zero, in which case the split is wholly contained in one block and thus we can give an exact + * result. + */ + @Override + public long getLength() { + final long vsHi = vStart & ~0xffff; + final long veHi = vEnd & ~0xffff; + final long hiDiff = veHi - vsHi; + return hiDiff == 0 ? ((vEnd & 0xffff) - (vStart & 0xffff)) : hiDiff; + } + + public Path getPath() { + return file; + } + + /** Inclusive. */ + public long getStartVirtualOffset() { + return vStart; + } + + public void setStartVirtualOffset(long vo) { + vStart = vo; + } + + /** Exclusive. */ + public long getEndVirtualOffset() { + return vEnd; + } + + public void setEndVirtualOffset(long vo) { + vEnd = vo; + } + + /** + * @return pairs of virtual file pointers for all intervals that should be used for filtering the + * split, or null if there are none. These correspond to BAMFileSpan chunk + * start/stop pointers in htsjdk. + */ + public long[] getIntervalFilePointers() { + return intervalFilePointers; + } + + @Override + public void write(DataOutput out) throws IOException { + Text.writeString(out, file.toString()); + out.writeLong(vStart); + out.writeLong(vEnd); + out.writeBoolean(intervalFilePointers != null); + if (intervalFilePointers != null) { + out.writeInt(intervalFilePointers.length); + for (int i = 0; i < intervalFilePointers.length; i++) { + out.writeLong(intervalFilePointers[i]); + } + } + } + + @Override + public void readFields(DataInput in) throws IOException { + file = new Path(Text.readString(in)); + vStart = in.readLong(); + vEnd = in.readLong(); + if (in.readBoolean()) { + intervalFilePointers = new long[in.readInt()]; + for (int i = 0; i < intervalFilePointers.length; i++) { + intervalFilePointers[i] = in.readLong(); + } + } + } + + @Override + public String toString() { + return file + ":" + vStart + "-" + vEnd; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/FormatConstants.java b/src/main/java/org/seqdoop/hadoop_bam/FormatConstants.java index ec54531..bc02670 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/FormatConstants.java +++ b/src/main/java/org/seqdoop/hadoop_bam/FormatConstants.java @@ -22,38 +22,33 @@ package org.seqdoop.hadoop_bam; -public class FormatConstants -{ - /** - * Offset by which Sanger-style ASCII-encoded quality scores are shifted. - */ - public static final int SANGER_OFFSET = 33; - - /** - * Maximum encodable quality score for Sanger Phred+33 encoded base qualities. - * - * Range of legal values is [0,93], according to wikipedia on 10/9/2013: - * http://en.wikipedia.org/wiki/FASTQ_format#Quality - */ - public static final int SANGER_MAX = 93; - - /** - * Offset by which Illumina-style ASCII-encoded quality scores are shifted. - */ - public static final int ILLUMINA_OFFSET = 64; - - /** - * Maximum encodable quality score for Illumina Phred+64 encoded base qualities. - */ - public static final int ILLUMINA_MAX = 62; - - /** - * Encodings for base quality formats. - */ - public enum BaseQualityEncoding { Illumina, Sanger }; - - private FormatConstants() {} // no instantiation - - public static final String CONF_INPUT_BASE_QUALITY_ENCODING = "hbam.input.base-quality-encoding"; - public static final String CONF_INPUT_FILTER_FAILED_QC = "hbam.input.filter-failed-qc"; +public class FormatConstants { + + /** Offset by which Sanger-style ASCII-encoded quality scores are shifted. 
*/ + public static final int SANGER_OFFSET = 33; + + /** + * Maximum encodable quality score for Sanger Phred+33 encoded base qualities. + * + *
<p>
Range of legal values is [0,93], according to wikipedia on 10/9/2013: + * http://en.wikipedia.org/wiki/FASTQ_format#Quality + */ + public static final int SANGER_MAX = 93; + + /** Offset by which Illumina-style ASCII-encoded quality scores are shifted. */ + public static final int ILLUMINA_OFFSET = 64; + + /** Maximum encodable quality score for Illumina Phred+64 encoded base qualities. */ + public static final int ILLUMINA_MAX = 62; + + public static final String CONF_INPUT_BASE_QUALITY_ENCODING = "hbam.input.base-quality-encoding";; + + public static final String CONF_INPUT_FILTER_FAILED_QC = "hbam.input.filter-failed-qc"; + + private FormatConstants() {} // no instantiation + /** Encodings for base quality formats. */ + public enum BaseQualityEncoding { + Illumina, + Sanger + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/FormatException.java b/src/main/java/org/seqdoop/hadoop_bam/FormatException.java index 208904b..ad9671c 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/FormatException.java +++ b/src/main/java/org/seqdoop/hadoop_bam/FormatException.java @@ -22,11 +22,11 @@ package org.seqdoop.hadoop_bam; -public class FormatException extends RuntimeException -{ - private static final long serialVersionUID = 1L; - public FormatException(String msg) - { - super(msg); - } +public class FormatException extends RuntimeException { + + private static final long serialVersionUID = 1L; + + public FormatException(String msg) { + super(msg); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringAnySAMOutputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringAnySAMOutputFormat.java index 8093de0..89d01a7 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringAnySAMOutputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringAnySAMOutputFormat.java @@ -22,107 +22,102 @@ package org.seqdoop.hadoop_bam; +import htsjdk.samtools.SAMFileHeader; import java.io.IOException; import java.io.InputStream; - -import htsjdk.samtools.SAMFileHeader; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; - import org.seqdoop.hadoop_bam.util.SAMHeaderReader; -/** Writes only the SAM records, not the key. +/** + * Writes only the SAM records, not the key. * - *
<p>
A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or - * {@link #readSAMHeaderFrom} before {@link #getRecordWriter} is called.
</p>
+ *
<p>
A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or {@link + * #readSAMHeaderFrom} before {@link #getRecordWriter} is called. * - *
<p>
By default, writes the SAM header to the output file(s). This - * can be disabled, because in distributed usage one often ends up with (and, - * for decent performance, wants to end up with) the output split into multiple - * parts, which are easier to concatenate if the header is not present in each - * file.
</p>
+ *
<p>
By default, writes the SAM header to the output file(s). This can be disabled, because in + * distributed usage one often ends up with (and, for decent performance, wants to end up with) the + * output split into multiple parts, which are easier to concatenate if the header is not present in + * each file. */ public class KeyIgnoringAnySAMOutputFormat extends AnySAMOutputFormat { - protected SAMFileHeader header; - - /** Whether the header will be written, defaults to true.. - */ - public static final String WRITE_HEADER_PROPERTY = - "hadoopbam.anysam.write-header"; - - public KeyIgnoringAnySAMOutputFormat(SAMFormat fmt) { - super(fmt); - } - public KeyIgnoringAnySAMOutputFormat(Configuration conf) { - super(conf); - - if (format == null) - throw new IllegalArgumentException( - "unknown SAM format: OUTPUT_SAM_FORMAT_PROPERTY not set"); - } - public KeyIgnoringAnySAMOutputFormat(Configuration conf, Path path) { - super(conf); - - if (format == null) { - format = SAMFormat.inferFromFilePath(path); - - if (format == null) - throw new IllegalArgumentException("unknown SAM format: " + path); - } - } - - public SAMFileHeader getSAMHeader() { return header; } - public void setSAMHeader(SAMFileHeader header) { this.header = header; } - - public void readSAMHeaderFrom(Path path, Configuration conf) - throws IOException - { - this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf); - } - public void readSAMHeaderFrom(InputStream in, Configuration conf) { - this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf); - } - - /** setSAMHeader or readSAMHeaderFrom must have - * been called first. - */ - @Override public RecordWriter getRecordWriter( - TaskAttemptContext ctx) - throws IOException - { - return getRecordWriter(ctx, getDefaultWorkFile(ctx, "")); - } - - // Allows wrappers to provide their own work file. - public RecordWriter getRecordWriter( - TaskAttemptContext ctx, Path out) - throws IOException - { - if (this.header == null) - throw new IOException( - "Can't create a RecordWriter without the SAM header"); - - final boolean writeHeader = ctx.getConfiguration().getBoolean( - WRITE_HEADER_PROPERTY, true); - - switch (format) { - case BAM: - return new KeyIgnoringBAMRecordWriter( - out, header, writeHeader, ctx); - - case SAM: - return new KeyIgnoringSAMRecordWriter( - out, header, writeHeader, ctx); - - case CRAM: - return new KeyIgnoringCRAMRecordWriter( - out, header, writeHeader, ctx); - - default: assert false; return null; - } - } + /** Whether the header will be written, defaults to true.. 
*/ + public static final String WRITE_HEADER_PROPERTY = "hadoopbam.anysam.write-header"; + + protected SAMFileHeader header; + + public KeyIgnoringAnySAMOutputFormat(SAMFormat fmt) { + super(fmt); + } + + public KeyIgnoringAnySAMOutputFormat(Configuration conf) { + super(conf); + + if (format == null) { + throw new IllegalArgumentException("unknown SAM format: OUTPUT_SAM_FORMAT_PROPERTY not set"); + } + } + + public KeyIgnoringAnySAMOutputFormat(Configuration conf, Path path) { + super(conf); + + if (format == null) { + format = SAMFormat.inferFromFilePath(path); + + if (format == null) { + throw new IllegalArgumentException("unknown SAM format: " + path); + } + } + } + + public SAMFileHeader getSAMHeader() { + return header; + } + + public void setSAMHeader(SAMFileHeader header) { + this.header = header; + } + + public void readSAMHeaderFrom(Path path, Configuration conf) throws IOException { + this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf); + } + + public void readSAMHeaderFrom(InputStream in, Configuration conf) { + this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf); + } + + /** setSAMHeader or readSAMHeaderFrom must have been called first. */ + @Override + public RecordWriter getRecordWriter(TaskAttemptContext ctx) + throws IOException { + return getRecordWriter(ctx, getDefaultWorkFile(ctx, "")); + } + + // Allows wrappers to provide their own work file. + public RecordWriter getRecordWriter(TaskAttemptContext ctx, Path out) + throws IOException { + if (this.header == null) { + throw new IOException("Can't create a RecordWriter without the SAM header"); + } + + final boolean writeHeader = ctx.getConfiguration().getBoolean(WRITE_HEADER_PROPERTY, true); + + switch (format) { + case BAM: + return new KeyIgnoringBAMRecordWriter(out, header, writeHeader, ctx); + + case SAM: + return new KeyIgnoringSAMRecordWriter(out, header, writeHeader, ctx); + + case CRAM: + return new KeyIgnoringCRAMRecordWriter(out, header, writeHeader, ctx); + + default: + assert false; + return null; + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMOutputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMOutputFormat.java index 4670214..ea62148 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMOutputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMOutputFormat.java @@ -22,72 +22,73 @@ package org.seqdoop.hadoop_bam; +import htsjdk.samtools.SAMFileHeader; import java.io.IOException; import java.io.InputStream; - -import htsjdk.samtools.SAMFileHeader; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; - import org.seqdoop.hadoop_bam.util.SAMHeaderReader; -/** Writes only the BAM records, not the key. +/** + * Writes only the BAM records, not the key. * - *
<p>
A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or - * {@link #readSAMHeaderFrom} before {@link #getRecordWriter} is called.
</p>
+ *
<p>
A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or {@link + * #readSAMHeaderFrom} before {@link #getRecordWriter} is called. * - *
<p>
By default, writes the SAM header to the output file(s). This - * can be disabled, because in distributed usage one often ends up with (and, - * for decent performance, wants to end up with) the output split into multiple - * parts, which are easier to concatenate if the header is not present in each - * file.
</p>
+ *
<p>
By default, writes the SAM header to the output file(s). This can be disabled, because in + * distributed usage one often ends up with (and, for decent performance, wants to end up with) the + * output split into multiple parts, which are easier to concatenate if the header is not present in + * each file. */ public class KeyIgnoringBAMOutputFormat extends BAMOutputFormat { - protected SAMFileHeader header; - private boolean writeHeader = true; - public KeyIgnoringBAMOutputFormat() {} + protected SAMFileHeader header; + private boolean writeHeader = true; + + public KeyIgnoringBAMOutputFormat() {} + + /** Whether the header will be written or not. */ + public boolean getWriteHeader() { + return writeHeader; + } + + /** Set whether the header will be written or not. */ + public void setWriteHeader(boolean b) { + writeHeader = b; + } - /** Whether the header will be written or not. */ - public boolean getWriteHeader() { return writeHeader; } + public SAMFileHeader getSAMHeader() { + return header; + } - /** Set whether the header will be written or not. */ - public void setWriteHeader(boolean b) { writeHeader = b; } + public void setSAMHeader(SAMFileHeader header) { + this.header = header; + } - public SAMFileHeader getSAMHeader() { return header; } - public void setSAMHeader(SAMFileHeader header) { this.header = header; } + public void readSAMHeaderFrom(Path path, Configuration conf) throws IOException { + this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf); + } - public void readSAMHeaderFrom(Path path, Configuration conf) - throws IOException - { - this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf); - } - public void readSAMHeaderFrom(InputStream in, Configuration conf) { - this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf); - } + public void readSAMHeaderFrom(InputStream in, Configuration conf) { + this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf); + } - /** setSAMHeader or readSAMHeaderFrom must have - * been called first. - */ - @Override public RecordWriter getRecordWriter( - TaskAttemptContext ctx) - throws IOException - { - return getRecordWriter(ctx, getDefaultWorkFile(ctx, "")); - } + /** setSAMHeader or readSAMHeaderFrom must have been called first. */ + @Override + public RecordWriter getRecordWriter(TaskAttemptContext ctx) + throws IOException { + return getRecordWriter(ctx, getDefaultWorkFile(ctx, "")); + } - // Allows wrappers to provide their own work file. - public RecordWriter getRecordWriter( - TaskAttemptContext ctx, Path out) - throws IOException - { - if (this.header == null) - throw new IOException( - "Can't create a RecordWriter without the SAM header"); + // Allows wrappers to provide their own work file. 
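As an aside, the path-taking overload that follows is what wrappers call once a header is available. A minimal, hypothetical caller might look like the sketch below; the NullWritable key type, the method and class names, and the paths are illustrative assumptions, not code introduced by this commit.

import htsjdk.samtools.SAMFileHeader;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.seqdoop.hadoop_bam.KeyIgnoringBAMOutputFormat;
import org.seqdoop.hadoop_bam.SAMRecordWritable;

class BamRecordWriterSketch {
  // Opens a BAM record writer for an explicit work file; the header must be set first.
  static RecordWriter<NullWritable, SAMRecordWritable> open(
      TaskAttemptContext ctx, SAMFileHeader header, Path workFile) throws Exception {
    KeyIgnoringBAMOutputFormat<NullWritable> format = new KeyIgnoringBAMOutputFormat<>();
    format.setSAMHeader(header); // or format.readSAMHeaderFrom(headerPath, ctx.getConfiguration())
    format.setWriteHeader(true); // the default; use false when output parts are concatenated later
    return format.getRecordWriter(ctx, workFile);
  }
}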
+ public RecordWriter getRecordWriter(TaskAttemptContext ctx, Path out) + throws IOException { + if (this.header == null) { + throw new IOException("Can't create a RecordWriter without the SAM header"); + } - return new KeyIgnoringBAMRecordWriter(out, header, writeHeader, ctx); - } + return new KeyIgnoringBAMRecordWriter(out, header, writeHeader, ctx); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMRecordWriter.java index 270f095..c630be9 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBAMRecordWriter.java @@ -22,34 +22,31 @@ package org.seqdoop.hadoop_bam; -import java.io.IOException; -import java.io.OutputStream; - import htsjdk.samtools.SAMFileHeader; - +import java.io.IOException; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.TaskAttemptContext; -/** A convenience class that you can use as a RecordWriter for BAM files. +/** + * A convenience class that you can use as a RecordWriter for BAM files. * - *
<p>
The write function ignores the key, just outputting the SAMRecord.
</p>
+ *
<p>
The write function ignores the key, just outputting the SAMRecord. */ public class KeyIgnoringBAMRecordWriter extends BAMRecordWriter { - public KeyIgnoringBAMRecordWriter( - Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) - throws IOException - { - super(output, input, writeHeader, ctx); - } - public KeyIgnoringBAMRecordWriter( - Path output, SAMFileHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - super(output, header, writeHeader, ctx); - } - @Override public void write(K ignored, SAMRecordWritable rec) throws IOException { - writeAlignment(rec.get()); - } + public KeyIgnoringBAMRecordWriter( + Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) throws IOException { + super(output, input, writeHeader, ctx); + } + + public KeyIgnoringBAMRecordWriter( + Path output, SAMFileHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + super(output, header, writeHeader, ctx); + } + + @Override + public void write(K ignored, SAMRecordWritable rec) throws IOException { + writeAlignment(rec.get()); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBCFRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBCFRecordWriter.java index 17080e9..f51d888 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBCFRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringBCFRecordWriter.java @@ -22,41 +22,37 @@ package org.seqdoop.hadoop_bam; +import htsjdk.variant.vcf.VCFHeader; import java.io.IOException; import java.io.OutputStream; - import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.TaskAttemptContext; -import htsjdk.variant.vcf.VCFHeader; - -/** A convenience class that you can use as a RecordWriter for BCF files. +/** + * A convenience class that you can use as a RecordWriter for BCF files. * - *
<p>
The write function ignores the key, just outputting the - * VariantContext.
</p>
+ *
<p>
The write function ignores the key, just outputting the VariantContext. */ public class KeyIgnoringBCFRecordWriter extends BCFRecordWriter { - public KeyIgnoringBCFRecordWriter( - Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) - throws IOException - { - super(output, input, writeHeader, ctx); - } - public KeyIgnoringBCFRecordWriter( - Path output, VCFHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - super(output, header, writeHeader, ctx); - } - public KeyIgnoringBCFRecordWriter( - OutputStream output, VCFHeader header, boolean writeHeader) - throws IOException - { - super(output, header, writeHeader); - } - - @Override public void write(K ignored, VariantContextWritable vc) { - writeRecord(vc.get()); - } + + public KeyIgnoringBCFRecordWriter( + Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) throws IOException { + super(output, input, writeHeader, ctx); + } + + public KeyIgnoringBCFRecordWriter( + Path output, VCFHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + super(output, header, writeHeader, ctx); + } + + public KeyIgnoringBCFRecordWriter(OutputStream output, VCFHeader header, boolean writeHeader) + throws IOException { + super(output, header, writeHeader); + } + + @Override + public void write(K ignored, VariantContextWritable vc) { + writeRecord(vc.get()); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringCRAMOutputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringCRAMOutputFormat.java index c7a4126..79d8eb0 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringCRAMOutputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringCRAMOutputFormat.java @@ -1,71 +1,72 @@ package org.seqdoop.hadoop_bam; +import htsjdk.samtools.SAMFileHeader; import java.io.IOException; import java.io.InputStream; - -import htsjdk.samtools.SAMFileHeader; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; - import org.seqdoop.hadoop_bam.util.SAMHeaderReader; -/** Writes only the BAM records, not the key. +/** + * Writes only the BAM records, not the key. * - *
<p>
A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or - * {@link #readSAMHeaderFrom} before {@link #getRecordWriter} is called.
</p>
+ *
<p>
A {@link SAMFileHeader} must be provided via {@link #setSAMHeader} or {@link + * #readSAMHeaderFrom} before {@link #getRecordWriter} is called. * - *
<p>
By default, writes the SAM header to the output file(s). This - * can be disabled, because in distributed usage one often ends up with (and, - * for decent performance, wants to end up with) the output split into multiple - * parts, which are easier to concatenate if the header is not present in each - * file.
</p>
+ *
<p>
By default, writes the SAM header to the output file(s). This can be disabled, because in + * distributed usage one often ends up with (and, for decent performance, wants to end up with) the + * output split into multiple parts, which are easier to concatenate if the header is not present in + * each file. */ public class KeyIgnoringCRAMOutputFormat extends CRAMOutputFormat { - protected SAMFileHeader header; - private boolean writeHeader = true; - public KeyIgnoringCRAMOutputFormat() {} + protected SAMFileHeader header; + private boolean writeHeader = true; - /** Whether the header will be written or not. */ - public boolean getWriteHeader() { return writeHeader; } + public KeyIgnoringCRAMOutputFormat() {} - /** Set whether the header will be written or not. */ - public void setWriteHeader(boolean b) { writeHeader = b; } + /** Whether the header will be written or not. */ + public boolean getWriteHeader() { + return writeHeader; + } - public SAMFileHeader getSAMHeader() { return header; } - public void setSAMHeader(SAMFileHeader header) { this.header = header; } + /** Set whether the header will be written or not. */ + public void setWriteHeader(boolean b) { + writeHeader = b; + } - public void readSAMHeaderFrom(Path path, Configuration conf) - throws IOException - { - this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf); - } - public void readSAMHeaderFrom(InputStream in, Configuration conf) { - this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf); - } + public SAMFileHeader getSAMHeader() { + return header; + } - /** setSAMHeader or readSAMHeaderFrom must have - * been called first. - */ - @Override public RecordWriter getRecordWriter( - TaskAttemptContext ctx) - throws IOException - { - return getRecordWriter(ctx, getDefaultWorkFile(ctx, "")); - } + public void setSAMHeader(SAMFileHeader header) { + this.header = header; + } + + public void readSAMHeaderFrom(Path path, Configuration conf) throws IOException { + this.header = SAMHeaderReader.readSAMHeaderFrom(path, conf); + } - // Allows wrappers to provide their own work file. - public RecordWriter getRecordWriter( - TaskAttemptContext ctx, Path out) - throws IOException - { - if (this.header == null) - throw new IOException( - "Can't create a RecordWriter without the SAM header"); + public void readSAMHeaderFrom(InputStream in, Configuration conf) { + this.header = SAMHeaderReader.readSAMHeaderFrom(in, conf); + } - return new KeyIgnoringCRAMRecordWriter(out, header, writeHeader, ctx); + /** setSAMHeader or readSAMHeaderFrom must have been called first. */ + @Override + public RecordWriter getRecordWriter(TaskAttemptContext ctx) + throws IOException { + return getRecordWriter(ctx, getDefaultWorkFile(ctx, "")); + } + + // Allows wrappers to provide their own work file. 
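For the CRAM variant the pattern is the same; the sketch below is illustrative only, not part of this commit, and the key type and paths are placeholders. Writing CRAM additionally requires a reference to be configured for the underlying writer; that setting is defined elsewhere in hadoop-bam and is omitted here.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.seqdoop.hadoop_bam.KeyIgnoringCRAMOutputFormat;
import org.seqdoop.hadoop_bam.SAMRecordWritable;

class CramRecordWriterSketch {
  // Reads the SAM header from an existing file, then opens a CRAM record writer.
  static RecordWriter<NullWritable, SAMRecordWritable> open(
      TaskAttemptContext ctx, Path headerSource, Path workFile) throws IOException {
    Configuration conf = ctx.getConfiguration();
    KeyIgnoringCRAMOutputFormat<NullWritable> format = new KeyIgnoringCRAMOutputFormat<>();
    format.readSAMHeaderFrom(headerSource, conf); // or format.setSAMHeader(header)
    return format.getRecordWriter(ctx, workFile);
  }
}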
+ public RecordWriter getRecordWriter(TaskAttemptContext ctx, Path out) + throws IOException { + if (this.header == null) { + throw new IOException("Can't create a RecordWriter without the SAM header"); } + + return new KeyIgnoringCRAMRecordWriter(out, header, writeHeader, ctx); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringCRAMRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringCRAMRecordWriter.java index c190741..1657742 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringCRAMRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringCRAMRecordWriter.java @@ -1,33 +1,30 @@ package org.seqdoop.hadoop_bam; import htsjdk.samtools.SAMFileHeader; +import java.io.IOException; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.TaskAttemptContext; -import java.io.IOException; -import java.io.OutputStream; - -/** A convenience class that you can use as a RecordWriter for CRAM files. +/** + * A convenience class that you can use as a RecordWriter for CRAM files. * - *
<p>
The write function ignores the key, just outputting the SAMRecord.
</p>
+ *
<p>
The write function ignores the key, just outputting the SAMRecord. */ public class KeyIgnoringCRAMRecordWriter extends CRAMRecordWriter { - public KeyIgnoringCRAMRecordWriter( - Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) - throws IOException - { - super(output, input, writeHeader, ctx); - } - public KeyIgnoringCRAMRecordWriter( - Path output, SAMFileHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - super(output, header, writeHeader, ctx); - } + public KeyIgnoringCRAMRecordWriter( + Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) throws IOException { + super(output, input, writeHeader, ctx); + } + + public KeyIgnoringCRAMRecordWriter( + Path output, SAMFileHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + super(output, header, writeHeader, ctx); + } - @Override public void write(K ignored, SAMRecordWritable rec) { - writeAlignment(rec.get()); - } + @Override + public void write(K ignored, SAMRecordWritable rec) { + writeAlignment(rec.get()); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringSAMRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringSAMRecordWriter.java index 9fc3249..ac8c787 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringSAMRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringSAMRecordWriter.java @@ -22,40 +22,37 @@ package org.seqdoop.hadoop_bam; +import htsjdk.samtools.SAMFileHeader; import java.io.IOException; import java.io.OutputStream; - -import htsjdk.samtools.SAMFileHeader; - import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.TaskAttemptContext; -/** A convenience class that you can use as a RecordWriter for SAM files. +/** + * A convenience class that you can use as a RecordWriter for SAM files. * - *
<p>
The write function ignores the key, just outputting the SAMRecord.
</p>
+ *
<p>
The write function ignores the key, just outputting the SAMRecord. */ public class KeyIgnoringSAMRecordWriter extends SAMRecordWriter { - public KeyIgnoringSAMRecordWriter( - Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) - throws IOException - { - super(output, input, writeHeader, ctx); - } - public KeyIgnoringSAMRecordWriter( - Path output, SAMFileHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - super(output, header, writeHeader, ctx); - } - public KeyIgnoringSAMRecordWriter( - OutputStream output, SAMFileHeader header, boolean writeHeader) - throws IOException - { - super(output, header, writeHeader); - } - - @Override public void write(K ignored, SAMRecordWritable rec) { - writeAlignment(rec.get()); - } + + public KeyIgnoringSAMRecordWriter( + Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) throws IOException { + super(output, input, writeHeader, ctx); + } + + public KeyIgnoringSAMRecordWriter( + Path output, SAMFileHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + super(output, header, writeHeader, ctx); + } + + public KeyIgnoringSAMRecordWriter(OutputStream output, SAMFileHeader header, boolean writeHeader) + throws IOException { + super(output, header, writeHeader); + } + + @Override + public void write(K ignored, SAMRecordWritable rec) { + writeAlignment(rec.get()); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringVCFOutputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringVCFOutputFormat.java index 04282c8..9674c92 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringVCFOutputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringVCFOutputFormat.java @@ -22,8 +22,9 @@ package org.seqdoop.hadoop_bam; +import htsjdk.samtools.seekablestream.SeekableStream; +import htsjdk.variant.vcf.VCFHeader; import java.io.IOException; - import java.io.OutputStream; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -31,122 +32,127 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; - -import htsjdk.samtools.seekablestream.SeekableStream; -import htsjdk.variant.vcf.VCFHeader; - import org.apache.hadoop.util.ReflectionUtils; import org.seqdoop.hadoop_bam.util.BGZFCodec; import org.seqdoop.hadoop_bam.util.VCFHeaderReader; import org.seqdoop.hadoop_bam.util.WrapSeekable; -/** Writes only the VCF records, not the key. +/** + * Writes only the VCF records, not the key. * - *
<p>
A {@link VCFHeader} must be provided via {@link #setHeader} or {@link - * #readHeaderFrom} before {@link #getRecordWriter} is called.
</p>
+ *
<p>
A {@link VCFHeader} must be provided via {@link #setHeader} or {@link #readHeaderFrom} before + * {@link #getRecordWriter} is called. * - *
<p>
By default, writes the VCF header to the output file(s). This can be - * disabled, because in distributed usage one often ends up with (and, for - * decent performance, wants to end up with) the output split into multiple - * parts, which are easier to concatenate if the header is not present in each - * file.
</p>
+ *
<p>
By default, writes the VCF header to the output file(s). This can be disabled, because in + * distributed usage one often ends up with (and, for decent performance, wants to end up with) the + * output split into multiple parts, which are easier to concatenate if the header is not present in + * each file. */ public class KeyIgnoringVCFOutputFormat extends VCFOutputFormat { - protected VCFHeader header; - - public KeyIgnoringVCFOutputFormat(VCFFormat fmt) { super(fmt); } - public KeyIgnoringVCFOutputFormat(Configuration conf) { - super(conf); - if (format == null) - throw new IllegalArgumentException( - "unknown VCF format: OUTPUT_VCF_FORMAT_PROPERTY not set"); - } - public KeyIgnoringVCFOutputFormat(Configuration conf, Path path) { - super(conf); - if (format == null) { - format = VCFFormat.inferFromFilePath(path); - - if (format == null) - throw new IllegalArgumentException("unknown VCF format: " + path); - } - } - - /** Whether the header will be written, defaults to true. */ - public static final String WRITE_HEADER_PROPERTY = - "hadoopbam.vcf.write-header"; - - public VCFHeader getHeader() { return header; } - public void setHeader(VCFHeader header) { this.header = header; } - - public void readHeaderFrom(Path path, FileSystem fs) throws IOException { - SeekableStream i = WrapSeekable.openPath(fs, path); - readHeaderFrom(i); - i.close(); - } - public void readHeaderFrom(SeekableStream in) throws IOException { - this.header = VCFHeaderReader.readHeaderFrom(in); - } - - /** setHeader or readHeaderFrom must have been - * called first. - */ - @Override public RecordWriter getRecordWriter( - TaskAttemptContext ctx) - throws IOException - { - Configuration conf = ctx.getConfiguration(); - boolean isCompressed = getCompressOutput(ctx); - CompressionCodec codec = null; - String extension = ""; - if (isCompressed) { - Class codecClass = - getOutputCompressorClass(ctx, BGZFCodec.class); - codec = ReflectionUtils.newInstance(codecClass, conf); - extension = codec.getDefaultExtension(); - } - Path file = getDefaultWorkFile(ctx, extension); - if (!isCompressed) { - return getRecordWriter(ctx, file); - } else { - FileSystem fs = file.getFileSystem(conf); - return getRecordWriter(ctx, codec.createOutputStream(fs.create(file))); - } - } - - // Allows wrappers to provide their own work file. - public RecordWriter getRecordWriter( - TaskAttemptContext ctx, Path out) - throws IOException - { - if (this.header == null) - throw new IOException( - "Can't create a RecordWriter without the VCF header"); - - final boolean wh = ctx.getConfiguration().getBoolean( - WRITE_HEADER_PROPERTY, true); - - switch (format) { - case BCF: return new KeyIgnoringBCFRecordWriter(out,header,wh,ctx); - case VCF: return new KeyIgnoringVCFRecordWriter(out,header,wh,ctx); - default: assert false; return null; - } - } - - private RecordWriter getRecordWriter( - TaskAttemptContext ctx, OutputStream outputStream) - throws IOException - { - if (this.header == null) - throw new IOException( - "Can't create a RecordWriter without the VCF header"); - - final boolean wh = ctx.getConfiguration().getBoolean( - WRITE_HEADER_PROPERTY, true); - - switch (format) { - case BCF: return new KeyIgnoringBCFRecordWriter(outputStream,header,wh); - case VCF: return new KeyIgnoringVCFRecordWriter(outputStream,header,wh); - default: assert false; return null; - } - } + + /** Whether the header will be written, defaults to true. 
*/ + public static final String WRITE_HEADER_PROPERTY = "hadoopbam.vcf.write-header"; + + protected VCFHeader header; + + public KeyIgnoringVCFOutputFormat(VCFFormat fmt) { + super(fmt); + } + + public KeyIgnoringVCFOutputFormat(Configuration conf) { + super(conf); + if (format == null) { + throw new IllegalArgumentException("unknown VCF format: OUTPUT_VCF_FORMAT_PROPERTY not set"); + } + } + + public KeyIgnoringVCFOutputFormat(Configuration conf, Path path) { + super(conf); + if (format == null) { + format = VCFFormat.inferFromFilePath(path); + + if (format == null) { + throw new IllegalArgumentException("unknown VCF format: " + path); + } + } + } + + public VCFHeader getHeader() { + return header; + } + + public void setHeader(VCFHeader header) { + this.header = header; + } + + public void readHeaderFrom(Path path, FileSystem fs) throws IOException { + SeekableStream i = WrapSeekable.openPath(fs, path); + readHeaderFrom(i); + i.close(); + } + + public void readHeaderFrom(SeekableStream in) throws IOException { + this.header = VCFHeaderReader.readHeaderFrom(in); + } + + /** setHeader or readHeaderFrom must have been called first. */ + @Override + public RecordWriter getRecordWriter(TaskAttemptContext ctx) + throws IOException { + Configuration conf = ctx.getConfiguration(); + boolean isCompressed = getCompressOutput(ctx); + CompressionCodec codec = null; + String extension = ""; + if (isCompressed) { + Class codecClass = getOutputCompressorClass(ctx, BGZFCodec.class); + codec = ReflectionUtils.newInstance(codecClass, conf); + extension = codec.getDefaultExtension(); + } + Path file = getDefaultWorkFile(ctx, extension); + if (!isCompressed) { + return getRecordWriter(ctx, file); + } else { + FileSystem fs = file.getFileSystem(conf); + return getRecordWriter(ctx, codec.createOutputStream(fs.create(file))); + } + } + + // Allows wrappers to provide their own work file. 
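The overload that follows mirrors the SAM/BAM case for VCF and BCF; when output compression is enabled, the single-argument getRecordWriter above wraps the stream with the BGZF codec instead. A hedged usage sketch, where the key type, helper name, and paths are placeholders rather than anything defined in this patch:

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.seqdoop.hadoop_bam.KeyIgnoringVCFOutputFormat;
import org.seqdoop.hadoop_bam.VCFFormat;
import org.seqdoop.hadoop_bam.VariantContextWritable;

class VcfRecordWriterSketch {
  // Reads the VCF header from a template file, then opens a plain-VCF record writer.
  static RecordWriter<NullWritable, VariantContextWritable> open(
      TaskAttemptContext ctx, FileSystem fs, Path headerSource, Path workFile) throws IOException {
    KeyIgnoringVCFOutputFormat<NullWritable> format =
        new KeyIgnoringVCFOutputFormat<>(VCFFormat.VCF); // or VCFFormat.BCF
    format.readHeaderFrom(headerSource, fs); // or format.setHeader(existingHeader)
    return format.getRecordWriter(ctx, workFile);
  }
}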
+ public RecordWriter getRecordWriter(TaskAttemptContext ctx, Path out) + throws IOException { + if (this.header == null) { + throw new IOException("Can't create a RecordWriter without the VCF header"); + } + + final boolean wh = ctx.getConfiguration().getBoolean(WRITE_HEADER_PROPERTY, true); + + switch (format) { + case BCF: + return new KeyIgnoringBCFRecordWriter(out, header, wh, ctx); + case VCF: + return new KeyIgnoringVCFRecordWriter(out, header, wh, ctx); + default: + assert false; + return null; + } + } + + private RecordWriter getRecordWriter( + TaskAttemptContext ctx, OutputStream outputStream) throws IOException { + if (this.header == null) { + throw new IOException("Can't create a RecordWriter without the VCF header"); + } + + final boolean wh = ctx.getConfiguration().getBoolean(WRITE_HEADER_PROPERTY, true); + + switch (format) { + case BCF: + return new KeyIgnoringBCFRecordWriter(outputStream, header, wh); + case VCF: + return new KeyIgnoringVCFRecordWriter(outputStream, header, wh); + default: + assert false; + return null; + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringVCFRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringVCFRecordWriter.java index d3fb1c5..f89a506 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringVCFRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/KeyIgnoringVCFRecordWriter.java @@ -22,41 +22,37 @@ package org.seqdoop.hadoop_bam; +import htsjdk.variant.vcf.VCFHeader; import java.io.IOException; import java.io.OutputStream; - import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.TaskAttemptContext; -import htsjdk.variant.vcf.VCFHeader; - -/** A convenience class that you can use as a RecordWriter for VCF files. +/** + * A convenience class that you can use as a RecordWriter for VCF files. * - *
<p>
The write function ignores the key, just outputting the - * VariantContext.
</p>
+ *
<p>
The write function ignores the key, just outputting the VariantContext. */ public class KeyIgnoringVCFRecordWriter extends VCFRecordWriter { - public KeyIgnoringVCFRecordWriter( - Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) - throws IOException - { - super(output, input, writeHeader, ctx); - } - public KeyIgnoringVCFRecordWriter( - Path output, VCFHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - super(output, header, writeHeader, ctx); - } - public KeyIgnoringVCFRecordWriter( - OutputStream output, VCFHeader header, boolean writeHeader) - throws IOException - { - super(output, header, writeHeader); - } - - @Override public void write(K ignored, VariantContextWritable vc) { - writeRecord(vc.get()); - } + + public KeyIgnoringVCFRecordWriter( + Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) throws IOException { + super(output, input, writeHeader, ctx); + } + + public KeyIgnoringVCFRecordWriter( + Path output, VCFHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + super(output, header, writeHeader, ctx); + } + + public KeyIgnoringVCFRecordWriter(OutputStream output, VCFHeader header, boolean writeHeader) + throws IOException { + super(output, header, writeHeader); + } + + @Override + public void write(K ignored, VariantContextWritable vc) { + writeRecord(vc.get()); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java b/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java index 3b9b0d3..3a908b1 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java +++ b/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java @@ -29,83 +29,125 @@ /** A factory for the kind of lazy {@link BAMRecord} used internally. 
*/ public class LazyBAMRecordFactory implements SAMRecordFactory { - @Override public SAMRecord createSAMRecord(SAMFileHeader hdr) { - throw new UnsupportedOperationException( - "LazyBAMRecordFactory can only create BAM records"); - } - - @Override public BAMRecord createBAMRecord( - SAMFileHeader hdr, - int referenceSequenceIndex, int alignmentStart, - short readNameLength, short mappingQuality, - int indexingBin, int cigarLen, int flags, int readLen, - int mateReferenceSequenceIndex, int mateAlignmentStart, - int insertSize, byte[] variableLengthBlock) - { - return new LazyBAMRecord( - hdr, referenceSequenceIndex, alignmentStart, readNameLength, - mappingQuality, indexingBin, cigarLen, flags, readLen, - mateReferenceSequenceIndex, mateAlignmentStart, insertSize, - variableLengthBlock); - } + + @Override + public SAMRecord createSAMRecord(SAMFileHeader hdr) { + throw new UnsupportedOperationException("LazyBAMRecordFactory can only create BAM records"); + } + + @Override + public BAMRecord createBAMRecord( + SAMFileHeader hdr, + int referenceSequenceIndex, + int alignmentStart, + short readNameLength, + short mappingQuality, + int indexingBin, + int cigarLen, + int flags, + int readLen, + int mateReferenceSequenceIndex, + int mateAlignmentStart, + int insertSize, + byte[] variableLengthBlock) { + return new LazyBAMRecord( + hdr, + referenceSequenceIndex, + alignmentStart, + readNameLength, + mappingQuality, + indexingBin, + cigarLen, + flags, + readLen, + mateReferenceSequenceIndex, + mateAlignmentStart, + insertSize, + variableLengthBlock); + } } class LazyBAMRecord extends BAMRecord { - private boolean decodedRefIdx = false; - private boolean decodedMateRefIdx = false; - - public LazyBAMRecord( - SAMFileHeader hdr, int referenceID, int coordinate, short readNameLength, - short mappingQuality, int indexingBin, int cigarLen, int flags, - int readLen, int mateReferenceID, int mateCoordinate, int insertSize, - byte[] restOfData) - { - super( - hdr, referenceID, coordinate, readNameLength, mappingQuality, - indexingBin, cigarLen, flags, readLen, mateReferenceID, - mateCoordinate, insertSize, restOfData); - } - - @Override public void setReferenceIndex(final int referenceIndex) { - mReferenceIndex = referenceIndex; - decodedRefIdx = false; - } - @Override public void setMateReferenceIndex(final int referenceIndex) { - mMateReferenceIndex = referenceIndex; - decodedMateRefIdx = false; - } - - @Override public String getReferenceName() { - if (mReferenceIndex != null && !decodedRefIdx) { - decodedRefIdx = true; - super.setReferenceIndex(mReferenceIndex); - } - return super.getReferenceName(); - } - - @Override public String getMateReferenceName() { - if (mMateReferenceIndex != null && !decodedMateRefIdx) { - decodedMateRefIdx = true; - super.setMateReferenceIndex(mMateReferenceIndex); - } - return super.getMateReferenceName(); - } - - @Override protected void eagerDecode() { - getReferenceName(); - getMateReferenceName(); - super.eagerDecode(); - } - - @Override - public boolean equals(Object o) { - // don't use decoded flags for equality check - return super.equals(o); - } - - @Override - public int hashCode() { - // don't use decoded flags for hash code - return super.hashCode(); - } + + private boolean decodedRefIdx = false; + private boolean decodedMateRefIdx = false; + + public LazyBAMRecord( + SAMFileHeader hdr, + int referenceID, + int coordinate, + short readNameLength, + short mappingQuality, + int indexingBin, + int cigarLen, + int flags, + int readLen, + int mateReferenceID, + int 
mateCoordinate, + int insertSize, + byte[] restOfData) { + super( + hdr, + referenceID, + coordinate, + readNameLength, + mappingQuality, + indexingBin, + cigarLen, + flags, + readLen, + mateReferenceID, + mateCoordinate, + insertSize, + restOfData); + } + + @Override + public void setReferenceIndex(final int referenceIndex) { + mReferenceIndex = referenceIndex; + decodedRefIdx = false; + } + + @Override + public void setMateReferenceIndex(final int referenceIndex) { + mMateReferenceIndex = referenceIndex; + decodedMateRefIdx = false; + } + + @Override + public String getReferenceName() { + if (mReferenceIndex != null && !decodedRefIdx) { + decodedRefIdx = true; + super.setReferenceIndex(mReferenceIndex); + } + return super.getReferenceName(); + } + + @Override + public String getMateReferenceName() { + if (mMateReferenceIndex != null && !decodedMateRefIdx) { + decodedMateRefIdx = true; + super.setMateReferenceIndex(mMateReferenceIndex); + } + return super.getMateReferenceName(); + } + + @Override + protected void eagerDecode() { + getReferenceName(); + getMateReferenceName(); + super.eagerDecode(); + } + + @Override + public boolean equals(Object o) { + // don't use decoded flags for equality check + return super.equals(o); + } + + @Override + public int hashCode() { + // don't use decoded flags for hash code + return super.hashCode(); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/LazyBCFGenotypesContext.java b/src/main/java/org/seqdoop/hadoop_bam/LazyBCFGenotypesContext.java index 0de8cb6..f244823 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/LazyBCFGenotypesContext.java +++ b/src/main/java/org/seqdoop/hadoop_bam/LazyBCFGenotypesContext.java @@ -22,11 +22,6 @@ package org.seqdoop.hadoop_bam; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; - import htsjdk.tribble.TribbleException; import htsjdk.variant.bcf2.BCF2Decoder; import htsjdk.variant.bcf2.BCF2GenotypeFieldDecoders; @@ -36,114 +31,123 @@ import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.LazyGenotypesContext; import htsjdk.variant.vcf.VCFHeader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; // XXX: Since we cannot use BCF2LazyGenotypesDecoder, the parsing functionality // is, unfortunately, simply copied from there. public class LazyBCFGenotypesContext extends LazyParsingGenotypesContext { - /** Takes ownership of the given byte[]: don't modify its contents. 
*/ - public LazyBCFGenotypesContext( - List alleles, int fields, byte[] unparsed, int count) - { - super(new Parser(alleles, fields), unparsed, count); - } - - public static class HeaderDataCache - implements LazyParsingGenotypesContext.HeaderDataCache - { - public static final BCF2Decoder decoder = new BCF2Decoder(); - - private BCF2GenotypeFieldDecoders genoFieldDecoders; - private List fieldDict; - private GenotypeBuilder[] builders; - - private ArrayList sampleNamesInOrder; - private HashMap sampleNameToOffset; - - @Override public void setHeader(VCFHeader header) { - genoFieldDecoders = new BCF2GenotypeFieldDecoders(header); - fieldDict = BCF2Utils.makeDictionary(header); - - builders = new GenotypeBuilder[header.getNGenotypeSamples()]; - final List genotypeSamples = header.getGenotypeSamples(); - for (int i = 0; i < builders.length; ++i) - builders[i] = new GenotypeBuilder(genotypeSamples.get(i)); - - sampleNamesInOrder = header.getSampleNamesInOrder(); - sampleNameToOffset = header.getSampleNameToOffset(); - } - - public BCF2GenotypeFieldDecoders getGenoFieldDecoders() { - return genoFieldDecoders; - } - public List getFieldDict() { return fieldDict; } - public GenotypeBuilder[] getBuilders () { return builders; } - - public ArrayList getSampleNamesInOrder() { - return sampleNamesInOrder; - } - public HashMap getSampleNameToOffset() { - return sampleNameToOffset; - } - } - - public static class Parser extends LazyParsingGenotypesContext.Parser { - private final List alleles; - private final int fields; - - private HeaderDataCache hd = null; - - public Parser(List alleles, int fields) { - this.alleles = alleles; - this.fields = fields; - } - - @Override public void setHeaderDataCache( - LazyParsingGenotypesContext.HeaderDataCache data) - { - this.hd = (HeaderDataCache)data; - } - - @Override public LazyGenotypesContext.LazyData parse(final Object data) { - if (hd == null) - throw new IllegalStateException( - "Cannot decode genotypes without HeaderDataCache"); - - final GenotypeBuilder[] builders = hd.getBuilders(); - - // The following is essentially the contents of - // BCF2LazyGenotypesDecoder.parse(). - - try { - hd.decoder.setRecordBytes((byte[])data); - - for (final GenotypeBuilder gb : builders) - gb.reset(true); - - for (int i = 0; i < fields; ++i) { - final String field = - hd.getFieldDict().get( - (Integer)hd.decoder.decodeTypedValue()); - - final byte type = hd.decoder.readTypeDescriptor(); - final int numElems = hd.decoder.decodeNumberOfElements(type); - - hd.getGenoFieldDecoders().getDecoder(field).decode( - alleles, field, hd.decoder, type, numElems, builders); - } - - final ArrayList genotypes = - new ArrayList(builders.length); - for (final GenotypeBuilder gb : builders) - genotypes.add(gb.make()); - - return new LazyGenotypesContext.LazyData( - genotypes, - hd.getSampleNamesInOrder(), hd.getSampleNameToOffset()); - } catch (IOException e) { - throw new TribbleException( - "Unexpected IOException parsing genotypes data block", e); - } - } - } + /** Takes ownership of the given byte[]: don't modify its contents. 
*/ + public LazyBCFGenotypesContext(List alleles, int fields, byte[] unparsed, int count) { + super(new Parser(alleles, fields), unparsed, count); + } + + public static class HeaderDataCache implements LazyParsingGenotypesContext.HeaderDataCache { + + public static final BCF2Decoder decoder = new BCF2Decoder(); + + private BCF2GenotypeFieldDecoders genoFieldDecoders; + private List fieldDict; + private GenotypeBuilder[] builders; + + private ArrayList sampleNamesInOrder; + private HashMap sampleNameToOffset; + + @Override + public void setHeader(VCFHeader header) { + genoFieldDecoders = new BCF2GenotypeFieldDecoders(header); + fieldDict = BCF2Utils.makeDictionary(header); + + builders = new GenotypeBuilder[header.getNGenotypeSamples()]; + final List genotypeSamples = header.getGenotypeSamples(); + for (int i = 0; i < builders.length; ++i) { + builders[i] = new GenotypeBuilder(genotypeSamples.get(i)); + } + + sampleNamesInOrder = header.getSampleNamesInOrder(); + sampleNameToOffset = header.getSampleNameToOffset(); + } + + public BCF2GenotypeFieldDecoders getGenoFieldDecoders() { + return genoFieldDecoders; + } + + public List getFieldDict() { + return fieldDict; + } + + public GenotypeBuilder[] getBuilders() { + return builders; + } + + public ArrayList getSampleNamesInOrder() { + return sampleNamesInOrder; + } + + public HashMap getSampleNameToOffset() { + return sampleNameToOffset; + } + } + + public static class Parser extends LazyParsingGenotypesContext.Parser { + + private final List alleles; + private final int fields; + + private HeaderDataCache hd = null; + + public Parser(List alleles, int fields) { + this.alleles = alleles; + this.fields = fields; + } + + @Override + public void setHeaderDataCache(LazyParsingGenotypesContext.HeaderDataCache data) { + this.hd = (HeaderDataCache) data; + } + + @Override + public LazyGenotypesContext.LazyData parse(final Object data) { + if (hd == null) { + throw new IllegalStateException("Cannot decode genotypes without HeaderDataCache"); + } + + final GenotypeBuilder[] builders = hd.getBuilders(); + + // The following is essentially the contents of + // BCF2LazyGenotypesDecoder.parse(). 
+ + try { + hd.decoder.setRecordBytes((byte[]) data); + + for (final GenotypeBuilder gb : builders) { + gb.reset(true); + } + + for (int i = 0; i < fields; ++i) { + final String field = hd.getFieldDict().get((Integer) hd.decoder.decodeTypedValue()); + + final byte type = hd.decoder.readTypeDescriptor(); + final int numElems = hd.decoder.decodeNumberOfElements(type); + + hd.getGenoFieldDecoders() + .getDecoder(field) + .decode(alleles, field, hd.decoder, type, numElems, builders); + } + + final ArrayList genotypes = new ArrayList(builders.length); + for (final GenotypeBuilder gb : builders) { + genotypes.add(gb.make()); + } + + return new LazyGenotypesContext.LazyData( + genotypes, hd.getSampleNamesInOrder(), hd.getSampleNameToOffset()); + } catch (IOException e) { + throw new TribbleException("Unexpected IOException parsing genotypes data block", e); + } + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/LazyParsingGenotypesContext.java b/src/main/java/org/seqdoop/hadoop_bam/LazyParsingGenotypesContext.java index 27e6fde..1b29426 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/LazyParsingGenotypesContext.java +++ b/src/main/java/org/seqdoop/hadoop_bam/LazyParsingGenotypesContext.java @@ -25,11 +25,11 @@ import htsjdk.variant.variantcontext.LazyGenotypesContext; import htsjdk.variant.vcf.VCFHeader; -/** You need to call getParser().setHeader() here before trying to decode() a - * GenotypesContext in any VariantContext that came about via - * VariantContextWritable.readFields(). That includes calling - * VariantContext.fullyDecode() or almost any of the GenotypesContext methods. - * The RecordReader provided by VCFInputFormat does this for you. +/** + * You need to call getParser().setHeader() here before trying to decode() a GenotypesContext in any + * VariantContext that came about via VariantContextWritable.readFields(). That includes calling + * VariantContext.fullyDecode() or almost any of the GenotypesContext methods. The RecordReader + * provided by VCFInputFormat does this for you. */ // There's no public LazyGenotypesContext.LazyParser in Picard so we need to // provide our own. Since we need to have the header in the parser set @@ -38,24 +38,27 @@ // // And since VCF and BCF have different kinds of lazy data, we have separate // classes implementing the actual parsing for each. -public abstract class LazyParsingGenotypesContext - extends LazyGenotypesContext -{ - // super.parser is inaccessible to us so we keep a copy that we can access. - private final Parser parserCopy; - - protected LazyParsingGenotypesContext(Parser p, byte[] data, int count) { - super(p, data, count); - parserCopy = p; - } - - public Parser getParser() { return parserCopy; } - - public static interface HeaderDataCache { - public void setHeader(VCFHeader header); - } - - public static abstract class Parser implements LazyParser { - public abstract void setHeaderDataCache(HeaderDataCache data); - } +public abstract class LazyParsingGenotypesContext extends LazyGenotypesContext { + + // super.parser is inaccessible to us so we keep a copy that we can access. 
+ private final Parser parserCopy; + + protected LazyParsingGenotypesContext(Parser p, byte[] data, int count) { + super(p, data, count); + parserCopy = p; + } + + public Parser getParser() { + return parserCopy; + } + + public static interface HeaderDataCache { + + public void setHeader(VCFHeader header); + } + + public abstract static class Parser implements LazyParser { + + public abstract void setHeaderDataCache(HeaderDataCache data); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java b/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java index ccb061c..3fca4a9 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java +++ b/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java @@ -20,9 +20,6 @@ package org.seqdoop.hadoop_bam; -import java.io.UnsupportedEncodingException; -import java.util.List; - import htsjdk.tribble.readers.LineIterator; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.LazyGenotypesContext; @@ -30,99 +27,108 @@ import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderVersion; +import java.io.UnsupportedEncodingException; +import java.util.List; // File created: 2013-07-03 15:41:21 // The actual parsing is delegated to AbstractVCFCodec. public class LazyVCFGenotypesContext extends LazyParsingGenotypesContext { - /** Takes ownership of the given byte[]: don't modify its contents. */ - public LazyVCFGenotypesContext( - List alleles, String chrom, int start, - byte[] utf8Unparsed, int count) - { - super(new Parser(alleles, chrom, start), utf8Unparsed, count); - } - - public static class HeaderDataCache - implements LazyParsingGenotypesContext.HeaderDataCache - { - private HeaderSettableVCFCodec codec = new HeaderSettableVCFCodec(); - - @Override public void setHeader(VCFHeader header) { - VCFHeaderVersion version = null; - - // Normally AbstractVCFCodec parses the header and thereby sets the - // version field. It gets used later on so we need to set it. - for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { - if (VCFHeaderVersion.isFormatString(line.getKey())) { - version = VCFHeaderVersion.toHeaderVersion(line.getValue()); - break; - } - } - - codec.setHeaderAndVersion(header, version); - } - - public AbstractVCFCodec getCodec() { return codec; } - } - - public static class Parser extends LazyParsingGenotypesContext.Parser { - private HeaderSettableVCFCodec codec = null; - private final List alleles; - private final String chrom; - private final int start; - - public Parser(List alleles, String chrom, int start) { - this.alleles = alleles; - this.chrom = chrom; - this.start = start; - } - - @Override public void setHeaderDataCache( - LazyParsingGenotypesContext.HeaderDataCache data) - { - codec = (HeaderSettableVCFCodec)((HeaderDataCache)data).getCodec(); - } - - @Override public LazyGenotypesContext.LazyData parse(final Object data) { - if (codec == null || !codec.hasHeader()) - throw new IllegalStateException( - "Cannot decode genotypes without a codec with a VCFHeader"); - - final String str; - try { - str = new String((byte[])data, "UTF-8"); - } catch (UnsupportedEncodingException absurd) { - throw new RuntimeException( - "Can never happen on a compliant Java implementation because "+ - "UTF-8 is guaranteed to be supported"); - } - return codec.createGenotypeMap(str, alleles, chrom, start); - } - } + /** Takes ownership of the given byte[]: don't modify its contents. 
*/ + public LazyVCFGenotypesContext( + List alleles, String chrom, int start, byte[] utf8Unparsed, int count) { + super(new Parser(alleles, chrom, start), utf8Unparsed, count); + } + + public static class HeaderDataCache implements LazyParsingGenotypesContext.HeaderDataCache { + + private HeaderSettableVCFCodec codec = new HeaderSettableVCFCodec(); + + @Override + public void setHeader(VCFHeader header) { + VCFHeaderVersion version = null; + + // Normally AbstractVCFCodec parses the header and thereby sets the + // version field. It gets used later on so we need to set it. + for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { + if (VCFHeaderVersion.isFormatString(line.getKey())) { + version = VCFHeaderVersion.toHeaderVersion(line.getValue()); + break; + } + } + + codec.setHeaderAndVersion(header, version); + } + + public AbstractVCFCodec getCodec() { + return codec; + } + } + + public static class Parser extends LazyParsingGenotypesContext.Parser { + + private final List alleles; + private final String chrom; + private final int start; + private HeaderSettableVCFCodec codec = null; + + public Parser(List alleles, String chrom, int start) { + this.alleles = alleles; + this.chrom = chrom; + this.start = start; + } + + @Override + public void setHeaderDataCache(LazyParsingGenotypesContext.HeaderDataCache data) { + codec = (HeaderSettableVCFCodec) ((HeaderDataCache) data).getCodec(); + } + + @Override + public LazyGenotypesContext.LazyData parse(final Object data) { + if (codec == null || !codec.hasHeader()) { + throw new IllegalStateException("Cannot decode genotypes without a codec with a VCFHeader"); + } + + final String str; + try { + str = new String((byte[]) data, "UTF-8"); + } catch (UnsupportedEncodingException absurd) { + throw new RuntimeException( + "Can never happen on a compliant Java implementation because " + + "UTF-8 is guaranteed to be supported"); + } + return codec.createGenotypeMap(str, alleles, chrom, start); + } + } } // This is a HACK. But, the functionality is only in AbstractVCFCodec so it // can't be helped. This is preferable to copying the functionality into // parse() above. 
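// Illustrative sketch, not part of this patch: the wiring that the LazyParsingGenotypesContext
// javadoc above requires before genotypes can be decoded. The members used here
// (HeaderDataCache.setHeader, Parser.setHeaderDataCache, getParser(), and the
// LazyVCFGenotypesContext constructor) are the ones declared in these files; the helper class,
// method and variable names are hypothetical, and the sketch assumes it sits in the
// org.seqdoop.hadoop_bam package so only fully-qualified htsjdk types are needed.
class LazyGenotypesWiringSketch {

  static LazyVCFGenotypesContext attachHeader(
      htsjdk.variant.vcf.VCFHeader header,
      java.util.List<htsjdk.variant.variantcontext.Allele> alleles,
      String chrom,
      int start,
      byte[] unparsedUtf8,
      int sampleCount) {
    // One cache per header: setHeader() precomputes the codec state needed for lazy parsing.
    LazyVCFGenotypesContext.HeaderDataCache cache = new LazyVCFGenotypesContext.HeaderDataCache();
    cache.setHeader(header);

    // Every lazily-built context must be given the cache before decode()/getGenotypes() is
    // called; the RecordReader provided by VCFInputFormat performs this step for you.
    LazyVCFGenotypesContext ctx =
        new LazyVCFGenotypesContext(alleles, chrom, start, unparsedUtf8, sampleCount);
    ctx.getParser().setHeaderDataCache(cache);
    return ctx;
  }
}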
class HeaderSettableVCFCodec extends AbstractVCFCodec { - public boolean hasHeader() { return header != null; } - - public void setHeaderAndVersion(VCFHeader header, VCFHeaderVersion ver) { - this.header = header; - this.version = ver; - } - - @Override public Object readActualHeader(LineIterator reader) { - throw new UnsupportedOperationException( - "Internal error: this shouldn't be called"); - } - @Override public List parseFilters(String filterString) { - throw new UnsupportedOperationException( - "Internal error: this shouldn't be called"); - } - @Override public boolean canDecode(String s) { - return true; - } + + public boolean hasHeader() { + return header != null; + } + + public void setHeaderAndVersion(VCFHeader header, VCFHeaderVersion ver) { + this.header = header; + this.version = ver; + } + + @Override + public Object readActualHeader(LineIterator reader) { + throw new UnsupportedOperationException("Internal error: this shouldn't be called"); + } + + @Override + public List parseFilters(String filterString) { + throw new UnsupportedOperationException("Internal error: this shouldn't be called"); + } + + @Override + public boolean canDecode(String s) { + return true; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/LineReader.java b/src/main/java/org/seqdoop/hadoop_bam/LineReader.java index 1db1b85..545fc75 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/LineReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/LineReader.java @@ -24,17 +24,18 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; /** - * A class that provides a line reader from an input stream. - * This code started as the org.apache.hadoop.util.LineReader class in Hadoop 0.20.2, - * to which we added a skip(n) method. + * A class that provides a line reader from an input stream. This code started as the + * org.apache.hadoop.util.LineReader class in Hadoop 0.20.2, to which we added a skip(n) method. */ public class LineReader { + private static final int DEFAULT_BUFFER_SIZE = 64 * 1024; + private static final byte CR = '\r'; + private static final byte LF = '\n'; private int bufferSize = DEFAULT_BUFFER_SIZE; private InputStream in; private byte[] buffer; @@ -43,25 +44,20 @@ public class LineReader { // the current position in the buffer private int bufferPosn = 0; - private static final byte CR = '\r'; - private static final byte LF = '\n'; - /** - * Create a line reader that reads from the given stream using the - * default buffer-size (64k). + * Create a line reader that reads from the given stream using the default buffer-size (64k). + * * @param in The input stream - * @throws IOException */ public LineReader(InputStream in) { this(in, DEFAULT_BUFFER_SIZE); } /** - * Create a line reader that reads from the given stream using the - * given buffer-size. + * Create a line reader that reads from the given stream using the given buffer-size. + * * @param in The input stream * @param bufferSize Size of the read buffer - * @throws IOException */ public LineReader(InputStream in, int bufferSize) { this.in = in; @@ -70,46 +66,36 @@ public LineReader(InputStream in, int bufferSize) { } /** - * Create a line reader that reads from the given stream using the - * io.file.buffer.size specified in the given - * Configuration. + * Create a line reader that reads from the given stream using the io.file.buffer.size + * specified in the given Configuration. 
+ * * @param in input stream * @param conf configuration - * @throws IOException */ public LineReader(InputStream in, Configuration conf) throws IOException { this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE)); } - /** - * Close the underlying stream. - * @throws IOException - */ + /** Close the underlying stream. */ public void close() throws IOException { in.close(); } /** - * Read one line from the InputStream into the given Text. A line - * can be terminated by one of the following: '\n' (LF) , '\r' (CR), - * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated - * line. + * Read one line from the InputStream into the given Text. A line can be terminated by one of the + * following: '\n' (LF) , '\r' (CR), or '\r\n' (CR+LF). EOF also terminates an otherwise + * unterminated line. * * @param str the object to store the given line (without newline) - * @param maxLineLength the maximum number of bytes to store into str; - * the rest of the line is silently discarded. - * @param maxBytesToConsume the maximum number of bytes to consume - * in this call. This is only a hint, because if the line cross - * this threshold, we allow it to happen. It can overshoot - * potentially by as much as one buffer length. - * - * @return the number of bytes read including the (longest) newline - * found. - * + * @param maxLineLength the maximum number of bytes to store into str; the rest of the line is + * silently discarded. + * @param maxBytesToConsume the maximum number of bytes to consume in this call. This is only a + * hint, because if the line cross this threshold, we allow it to happen. It can overshoot + * potentially by as much as one buffer length. + * @return the number of bytes read including the (longest) newline found. * @throws IOException if the underlying stream throws */ - public int readLine(Text str, int maxLineLength, - int maxBytesToConsume) throws IOException { + public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy @@ -127,35 +113,38 @@ public int readLine(Text str, int maxLineLength, * follows. */ str.clear(); - int txtLength = 0; //tracks str.getLength(), as an optimization - int newlineLength = 0; //length of terminating newline - boolean prevCharCR = false; //true of prev char was CR + int txtLength = 0; // tracks str.getLength(), as an optimization + int newlineLength = 0; // length of terminating newline + boolean prevCharCR = false; // true of prev char was CR long bytesConsumed = 0; do { - int startPosn = bufferPosn; //starting from where we left off the last time + int startPosn = bufferPosn; // starting from where we left off the last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; - if (prevCharCR) - ++bytesConsumed; //account for CR from previous read + if (prevCharCR) { + ++bytesConsumed; // account for CR from previous read + } bufferLength = in.read(buffer); - if (bufferLength <= 0) + if (bufferLength <= 0) { break; // EOF + } } - for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline + for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline if (buffer[bufferPosn] == LF) { newlineLength = (prevCharCR) ? 
2 : 1; ++bufferPosn; // at next invocation proceed from following byte break; } - if (prevCharCR) { //CR + notLF, we are at notLF + if (prevCharCR) { // CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); } int readLength = bufferPosn - startPosn; - if (prevCharCR && newlineLength == 0) - --readLength; //CR at the end of the buffer + if (prevCharCR && newlineLength == 0) { + --readLength; // CR at the end of the buffer + } bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { @@ -167,13 +156,15 @@ public int readLine(Text str, int maxLineLength, } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); - if (bytesConsumed > (long)Integer.MAX_VALUE) + if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before newline: " + bytesConsumed); - return (int)bytesConsumed; + } + return (int) bytesConsumed; } /** * Read from the InputStream into the given Text. + * * @param str the object to store the given line * @param maxLineLength the maximum number of bytes to store into str. * @return the number of bytes read including the newline @@ -181,10 +172,11 @@ public int readLine(Text str, int maxLineLength, */ public int readLine(Text str, int maxLineLength) throws IOException { return readLine(str, maxLineLength, Integer.MAX_VALUE); -} + } /** * Read from the InputStream into the given Text. + * * @param str the object to store the given line * @return the number of bytes read including the newline * @throws IOException if the underlying stream throws @@ -193,39 +185,37 @@ public int readLine(Text str) throws IOException { return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE); } - /** - * Skip n bytes from the InputStream. - * @param n the number of bytes to skip. + /** + * Skip n bytes from the InputStream. + * + * @param n the number of bytes to skip. * @return the number of bytes skipped. * @throws IOException if the underlying stream throws. 
- */ - public long skip(long n) throws IOException - { - boolean end = false; - long toskip = n; - while (toskip > 0 && !end) - { - if (bufferPosn < bufferLength) - { - int skipped = (int)Math.min(bufferLength - bufferPosn, toskip); - bufferPosn += skipped; - toskip -= skipped; - } - if (bufferPosn >= bufferLength) - { - int loaded = loadBuffer(); - end = loaded == 0; - } - } - return n - toskip; - } + */ + public long skip(long n) throws IOException { + boolean end = false; + long toskip = n; + while (toskip > 0 && !end) { + if (bufferPosn < bufferLength) { + int skipped = (int) Math.min(bufferLength - bufferPosn, toskip); + bufferPosn += skipped; + toskip -= skipped; + } + if (bufferPosn >= bufferLength) { + int loaded = loadBuffer(); + end = loaded == 0; + } + } + return n - toskip; + } - protected int loadBuffer() throws IOException - { - bufferLength = in.read(buffer); - if (bufferLength < 0) // if EOF read returns -1 - bufferLength = 0; - bufferPosn = 0; - return bufferLength; - } + protected int loadBuffer() throws IOException { + bufferLength = in.read(buffer); + if (bufferLength < 0) // if EOF read returns -1 + { + bufferLength = 0; + } + bufferPosn = 0; + return bufferLength; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/QseqInputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/QseqInputFormat.java index 1432671..3b392e0 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/QseqInputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/QseqInputFormat.java @@ -25,7 +25,6 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.CharacterCodingException; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -39,405 +38,407 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; - import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding; import org.seqdoop.hadoop_bam.util.ConfHelper; /** - * Reads the Illumina qseq sequence format. - * Key: instrument, run number, lane, tile, xpos, ypos, read number, delimited by ':' characters. - * Value: a SequencedFragment object representing the entry. + * Reads the Illumina qseq sequence format. Key: instrument, run number, lane, tile, xpos, ypos, + * read number, delimited by ':' characters. Value: a SequencedFragment object representing the + * entry. */ -public class QseqInputFormat extends FileInputFormat -{ - public static final String CONF_BASE_QUALITY_ENCODING = "hbam.qseq-input.base-quality-encoding"; - public static final String CONF_FILTER_FAILED_QC = "hbam.qseq-input.filter-failed-qc"; - public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "illumina"; - - public static class QseqRecordReader extends RecordReader - { - /* - * qseq format: - * 11 tab-separated columns - * - * 1) Instrument - * 2) Run id - * 3) Lane number - * 4) Tile number - * 5) X pos - * 6) Y pos - * 7) Index sequence (0 for runs without multiplexing) - * 8) Read Number - * 9) Base Sequence - * 10) Base Quality - * 11) Filter: did the read pass filtering? 0 - No, 1 - Yes. - */ - // start: first valid data index - private long start; - // end: first index value beyond the slice, i.e. 
slice is in range [start,end) - private long end; - // pos: current position in file - private long pos; - // file: the file being read - private Path file; - - private LineReader lineReader; - private InputStream inputStream; - private Text currentKey = new Text(); - private SequencedFragment currentValue = new SequencedFragment(); - - private Text buffer = new Text(); - private static final int NUM_QSEQ_COLS = 11; - // for these, we have one per qseq field - private int[] fieldPositions = new int[NUM_QSEQ_COLS]; - private int[] fieldLengths = new int[NUM_QSEQ_COLS]; - - private BaseQualityEncoding qualityEncoding; - private boolean filterFailedQC = false; - - private static final String Delim = "\t"; - - // How long can a qseq line get? - public static final int MAX_LINE_LENGTH = 20000; - - public QseqRecordReader(Configuration conf, FileSplit split) throws IOException - { - setConf(conf); - file = split.getPath(); - start = split.getStart(); - end = start + split.getLength(); - - FileSystem fs = file.getFileSystem(conf); - FSDataInputStream fileIn = fs.open(file); - - CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); - CompressionCodec codec = codecFactory.getCodec(file); - - if (codec == null) // no codec. Uncompressed file. - { - positionAtFirstRecord(fileIn); - inputStream = fileIn; - } - else - { // compressed file - if (start != 0) - throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")"); - - inputStream = codec.createInputStream(fileIn); - end = Long.MAX_VALUE; // read until the end of the file - } - - lineReader = new LineReader(inputStream); - } - - /* - * Position the input stream at the start of the first record. - */ - private void positionAtFirstRecord(FSDataInputStream stream) throws IOException - { - if (start > 0) - { - // Advance to the start of the first line in our slice. - // We use a temporary LineReader to read a partial line and find the - // start of the first one on or after our starting position. - // In case our slice starts right at the beginning of a line, we need to back - // up by one position and then discard the first line. - start -= 1; - stream.seek(start); - LineReader reader = new LineReader(stream); - int bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); - start = start + bytesRead; - stream.seek(start); - } - // else - // if start == 0 we're starting at the beginning of a line - pos = start; - } - - protected void setConf(Configuration conf) - { - String encoding = - conf.get(QseqInputFormat.CONF_BASE_QUALITY_ENCODING, - conf.get(FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING, - CONF_BASE_QUALITY_ENCODING_DEFAULT)); - - if ("illumina".equals(encoding)) - qualityEncoding = BaseQualityEncoding.Illumina; - else if ("sanger".equals(encoding)) - qualityEncoding = BaseQualityEncoding.Sanger; - else - throw new RuntimeException("Unknown input base quality encoding value " + encoding); - - filterFailedQC = ConfHelper.parseBoolean( - conf.get(QseqInputFormat.CONF_FILTER_FAILED_QC, - conf.get(FormatConstants.CONF_INPUT_FILTER_FAILED_QC)), - false); - } - - /** - * Added to use mapreduce API. - */ - public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException - { - } - - /** - * Added to use mapreduce API. - */ - public Text getCurrentKey() - { - return currentKey; - } - - /** - * Added to use mapreduce API. 
- */ - public SequencedFragment getCurrentValue() - { - return currentValue; - } - - /** - * Added to use mapreduce API. - */ - public boolean nextKeyValue() throws IOException, InterruptedException - { - return next(currentKey, currentValue); - } - - /** - * Close this RecordReader to future operations. - */ - public void close() throws IOException - { - inputStream.close(); - } - - /** - * Create an object of the appropriate type to be used as a key. - */ - public Text createKey() - { - return new Text(); - } - - /** - * Create an object of the appropriate type to be used as a value. - */ - public SequencedFragment createValue() - { - return new SequencedFragment(); - } - - /** - * Returns the current position in the input. - */ - public long getPos() { return pos; } - - /** - * How much of the input has the RecordReader consumed i.e. - */ - public float getProgress() - { - if (start == end) - return 1.0f; - else - return Math.min(1.0f, (pos - start) / (float)(end - start)); - } - - public String makePositionMessage(long pos) - { - return file.toString() + ":" + pos; - } - - public String makePositionMessage() - { - return file.toString() + ":" + pos; - } - - /* - * Read a single record. - * - * Reads a single line of input and scans it with scanQseqLine, which - * sets key and value accordingly. The method updates this.pos. - * - * @return The number of bytes read. If no bytes were read, the EOF was reached. - */ - private int lowLevelQseqRead(Text key, SequencedFragment value) throws IOException - { - int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH); - pos += bytesRead; - if (bytesRead >= MAX_LINE_LENGTH) - { - String line; - try { - line = Text.decode(buffer.getBytes(), 0, 500); - } catch (java.nio.charset.CharacterCodingException e) { - line = "(line not convertible to printable format)"; - } - throw new RuntimeException("found abnormally large line (length " + bytesRead + ") at " + - makePositionMessage(pos - bytesRead) + ": " + line); - } - else if (bytesRead > 0) - scanQseqLine(buffer, key, value); - - return bytesRead; - } - - /** - * Reads the next key/value pair from the input for processing. - */ - public boolean next(Text key, SequencedFragment value) throws IOException - { - if (pos >= end) - return false; // past end of slice - - int bytesRead = 0; - boolean goodRecord; - do { - bytesRead = lowLevelQseqRead(key, value); // if bytesRead <= 0 EOF has been reached - goodRecord = (bytesRead > 0) && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); - } while (bytesRead > 0 && !goodRecord); - - if (goodRecord) // post process the record only if it's going to be used - { - try { - postProcessSequencedFragment(value); - } catch (FormatException e) { - throw new FormatException(e.getMessage() + " Position: " + makePositionMessage(this.pos - bytesRead) + - "; line: " + buffer); // last line read is still in the buffer - } - } - - return goodRecord; - } - - /* - * Scans the text line to find the position and the lengths of the fields - * within it. The positions and lengths are saved into the instance arrays - * 'fieldPositions' and 'fieldLengths'. - * - * @exception FormatException Line doesn't have the expected number of fields. 
- */ - private void setFieldPositionsAndLengths(Text line) - { - int pos = 0; // the byte position within the record - int fieldno = 0; // the field index within the record - while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) // iterate over each field - { - int endpos = line.find(Delim, pos); // the field's end position - if (endpos < 0) - endpos = line.getLength(); - - fieldPositions[fieldno] = pos; - fieldLengths[fieldno] = endpos - pos; - - pos = endpos + 1; // the next starting position is the current end + 1 - fieldno += 1; - } - - if (fieldno != NUM_QSEQ_COLS) - throw new FormatException("found " + fieldno + " fields instead of 11 at " + - makePositionMessage(this.pos - line.getLength()) + ". Line: " + line); - } - - private void scanQseqLine(Text line, Text key, SequencedFragment fragment) - { - setFieldPositionsAndLengths(line); - - // Build the key. We concatenate all fields from 0 to 5 (machine to y-pos) - // and then the read number, replacing the tabs with colons. - key.clear(); - // append up and including field[5] - key.append(line.getBytes(), 0, fieldPositions[5] + fieldLengths[5]); - // replace tabs with : - byte[] bytes = key.getBytes(); - int temporaryEnd = key.getLength(); - for (int i = 0; i < temporaryEnd; ++i) - if (bytes[i] == '\t') - bytes[i] = ':'; - // append the read number - key.append(line.getBytes(), fieldPositions[7] - 1, fieldLengths[7] + 1); // +/- 1 to catch the preceding tab. - // convert the tab preceding the read number into a : - key.getBytes()[temporaryEnd] = ':'; - - // now the fragment - try - { - fragment.clear(); - fragment.setInstrument( Text.decode(line.getBytes(), fieldPositions[0], fieldLengths[0]) ); - fragment.setRunNumber( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[1], fieldLengths[1])) ); - //fragment.setFlowcellId(); - fragment.setLane( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[2], fieldLengths[2])) ); - fragment.setTile( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[3], fieldLengths[3])) ); - fragment.setXpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[4], fieldLengths[4])) ); - fragment.setYpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[5], fieldLengths[5])) ); - fragment.setRead( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[7], fieldLengths[7])) ); - fragment.setFilterPassed( line.getBytes()[fieldPositions[10]] != '0' ); - //fragment.setControlNumber(); - if (fieldLengths[6] > 0 && line.getBytes()[fieldPositions[6]] == '0') // 0 is a null index sequence - fragment.setIndexSequence(null); - else - fragment.setIndexSequence(Text.decode(line.getBytes(), fieldPositions[6], fieldLengths[6]).replace('.', 'N')); - } - catch (CharacterCodingException e) { - throw new FormatException("Invalid character format at " + makePositionMessage(this.pos - line.getLength()) + "; line: " + line); - } - - fragment.getSequence().append(line.getBytes(), fieldPositions[8], fieldLengths[8]); - fragment.getQuality().append(line.getBytes(), fieldPositions[9], fieldLengths[9]); - } - - /* - * This method applies some transformations to the read and quality data. - * - *

    - *
- * <ul>
- *   <li> '.' in the read are converted to 'N'
- *   <li> the base quality encoding is converted to 'sanger', unless otherwise
- *        requested by the configuration.
- * </ul>
- * - * @exception FormatException Thrown if the record contains base quality scores - * outside the range allowed by the format. - */ - private void postProcessSequencedFragment(SequencedFragment fragment) - { - byte[] bytes = fragment.getSequence().getBytes(); - // replace . with N - for (int i = 0; i < fieldLengths[8]; ++i) - if (bytes[i] == '.') - bytes[i] = 'N'; - - if (qualityEncoding == BaseQualityEncoding.Illumina) - { - // convert illumina to sanger scale - SequencedFragment.convertQuality(fragment.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger); - } - else // sanger qualities. - { - int outOfRangeElement = SequencedFragment.verifyQuality(fragment.getQuality(), BaseQualityEncoding.Sanger); - if (outOfRangeElement >= 0) - { - throw new FormatException("qseq base quality score out of range for Sanger Phred+33 format (found " + - (fragment.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" + - "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n"); - } - } - } - } - - @Override - public boolean isSplitable(JobContext context, Path path) - { - CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(path); - return codec == null; - } - - public RecordReader createRecordReader( - InputSplit genericSplit, - TaskAttemptContext context) throws IOException, InterruptedException - { - context.setStatus(genericSplit.toString()); - return new QseqRecordReader(context.getConfiguration(), (FileSplit)genericSplit); // cast as per example in TextInputFormat - } +public class QseqInputFormat extends FileInputFormat { + + public static final String CONF_BASE_QUALITY_ENCODING = "hbam.qseq-input.base-quality-encoding"; + public static final String CONF_FILTER_FAILED_QC = "hbam.qseq-input.filter-failed-qc"; + public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "illumina"; + + @Override + public boolean isSplitable(JobContext context, Path path) { + CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(path); + return codec == null; + } + + public RecordReader createRecordReader( + InputSplit genericSplit, TaskAttemptContext context) + throws IOException, InterruptedException { + context.setStatus(genericSplit.toString()); + return new QseqRecordReader( + context.getConfiguration(), + (FileSplit) genericSplit); // cast as per example in TextInputFormat + } + + public static class QseqRecordReader extends RecordReader { + + // How long can a qseq line get? + public static final int MAX_LINE_LENGTH = 20000; + private static final int NUM_QSEQ_COLS = 11; + private static final String Delim = "\t"; + /* + * qseq format: + * 11 tab-separated columns + * + * 1) Instrument + * 2) Run id + * 3) Lane number + * 4) Tile number + * 5) X pos + * 6) Y pos + * 7) Index sequence (0 for runs without multiplexing) + * 8) Read Number + * 9) Base Sequence + * 10) Base Quality + * 11) Filter: did the read pass filtering? 0 - No, 1 - Yes. + */ + // start: first valid data index + private long start; + // end: first index value beyond the slice, i.e. 
slice is in range [start,end) + private long end; + // pos: current position in file + private long pos; + // file: the file being read + private Path file; + private LineReader lineReader; + private InputStream inputStream; + private Text currentKey = new Text(); + private SequencedFragment currentValue = new SequencedFragment(); + private Text buffer = new Text(); + // for these, we have one per qseq field + private int[] fieldPositions = new int[NUM_QSEQ_COLS]; + private int[] fieldLengths = new int[NUM_QSEQ_COLS]; + private BaseQualityEncoding qualityEncoding; + private boolean filterFailedQC = false; + + public QseqRecordReader(Configuration conf, FileSplit split) throws IOException { + setConf(conf); + file = split.getPath(); + start = split.getStart(); + end = start + split.getLength(); + + FileSystem fs = file.getFileSystem(conf); + FSDataInputStream fileIn = fs.open(file); + + CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); + CompressionCodec codec = codecFactory.getCodec(file); + + if (codec == null) // no codec. Uncompressed file. + { + positionAtFirstRecord(fileIn); + inputStream = fileIn; + } else { // compressed file + if (start != 0) { + throw new RuntimeException( + "Start position for compressed file is not 0! (found " + start + ")"); + } + + inputStream = codec.createInputStream(fileIn); + end = Long.MAX_VALUE; // read until the end of the file + } + + lineReader = new LineReader(inputStream); + } + + /* + * Position the input stream at the start of the first record. + */ + private void positionAtFirstRecord(FSDataInputStream stream) throws IOException { + if (start > 0) { + // Advance to the start of the first line in our slice. + // We use a temporary LineReader to read a partial line and find the + // start of the first one on or after our starting position. + // In case our slice starts right at the beginning of a line, we need to back + // up by one position and then discard the first line. + start -= 1; + stream.seek(start); + LineReader reader = new LineReader(stream); + int bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); + start = start + bytesRead; + stream.seek(start); + } + // else + // if start == 0 we're starting at the beginning of a line + pos = start; + } + + protected void setConf(Configuration conf) { + String encoding = + conf.get( + QseqInputFormat.CONF_BASE_QUALITY_ENCODING, + conf.get( + FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING, + CONF_BASE_QUALITY_ENCODING_DEFAULT)); + + if ("illumina".equals(encoding)) { + qualityEncoding = BaseQualityEncoding.Illumina; + } else if ("sanger".equals(encoding)) { + qualityEncoding = BaseQualityEncoding.Sanger; + } else { + throw new RuntimeException("Unknown input base quality encoding value " + encoding); + } + + filterFailedQC = + ConfHelper.parseBoolean( + conf.get( + QseqInputFormat.CONF_FILTER_FAILED_QC, + conf.get(FormatConstants.CONF_INPUT_FILTER_FAILED_QC)), + false); + } + + /** Added to use mapreduce API. */ + public void initialize(InputSplit split, TaskAttemptContext context) + throws IOException, InterruptedException {} + + /** Added to use mapreduce API. */ + public Text getCurrentKey() { + return currentKey; + } + + /** Added to use mapreduce API. */ + public SequencedFragment getCurrentValue() { + return currentValue; + } + + /** Added to use mapreduce API. */ + public boolean nextKeyValue() throws IOException, InterruptedException { + return next(currentKey, currentValue); + } + + /** Close this RecordReader to future operations. 
*/ + public void close() throws IOException { + inputStream.close(); + } + + /** Create an object of the appropriate type to be used as a key. */ + public Text createKey() { + return new Text(); + } + + /** Create an object of the appropriate type to be used as a value. */ + public SequencedFragment createValue() { + return new SequencedFragment(); + } + + /** Returns the current position in the input. */ + public long getPos() { + return pos; + } + + /** How much of the input has the RecordReader consumed i.e. */ + public float getProgress() { + if (start == end) { + return 1.0f; + } else { + return Math.min(1.0f, (pos - start) / (float) (end - start)); + } + } + + public String makePositionMessage(long pos) { + return file.toString() + ":" + pos; + } + + public String makePositionMessage() { + return file.toString() + ":" + pos; + } + + /* + * Read a single record. + * + * Reads a single line of input and scans it with scanQseqLine, which + * sets key and value accordingly. The method updates this.pos. + * + * @return The number of bytes read. If no bytes were read, the EOF was reached. + */ + private int lowLevelQseqRead(Text key, SequencedFragment value) throws IOException { + int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH); + pos += bytesRead; + if (bytesRead >= MAX_LINE_LENGTH) { + String line; + try { + line = Text.decode(buffer.getBytes(), 0, 500); + } catch (java.nio.charset.CharacterCodingException e) { + line = "(line not convertible to printable format)"; + } + throw new RuntimeException( + "found abnormally large line (length " + + bytesRead + + ") at " + + makePositionMessage(pos - bytesRead) + + ": " + + line); + } else if (bytesRead > 0) { + scanQseqLine(buffer, key, value); + } + + return bytesRead; + } + + /** Reads the next key/value pair from the input for processing. */ + public boolean next(Text key, SequencedFragment value) throws IOException { + if (pos >= end) { + return false; // past end of slice + } + + int bytesRead = 0; + boolean goodRecord; + do { + bytesRead = lowLevelQseqRead(key, value); // if bytesRead <= 0 EOF has been reached + goodRecord = + (bytesRead > 0) + && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); + } while (bytesRead > 0 && !goodRecord); + + if (goodRecord) // post process the record only if it's going to be used + { + try { + postProcessSequencedFragment(value); + } catch (FormatException e) { + throw new FormatException( + e.getMessage() + + " Position: " + + makePositionMessage(this.pos - bytesRead) + + "; line: " + + buffer); // last line read is still in the buffer + } + } + + return goodRecord; + } + + /* + * Scans the text line to find the position and the lengths of the fields + * within it. The positions and lengths are saved into the instance arrays + * 'fieldPositions' and 'fieldLengths'. + * + * @exception FormatException Line doesn't have the expected number of fields. 
+ */ + private void setFieldPositionsAndLengths(Text line) { + int pos = 0; // the byte position within the record + int fieldno = 0; // the field index within the record + while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) // iterate over each field + { + int endpos = line.find(Delim, pos); // the field's end position + if (endpos < 0) { + endpos = line.getLength(); + } + + fieldPositions[fieldno] = pos; + fieldLengths[fieldno] = endpos - pos; + + pos = endpos + 1; // the next starting position is the current end + 1 + fieldno += 1; + } + + if (fieldno != NUM_QSEQ_COLS) { + throw new FormatException( + "found " + + fieldno + + " fields instead of 11 at " + + makePositionMessage(this.pos - line.getLength()) + + ". Line: " + + line); + } + } + + private void scanQseqLine(Text line, Text key, SequencedFragment fragment) { + setFieldPositionsAndLengths(line); + + // Build the key. We concatenate all fields from 0 to 5 (machine to y-pos) + // and then the read number, replacing the tabs with colons. + key.clear(); + // append up and including field[5] + key.append(line.getBytes(), 0, fieldPositions[5] + fieldLengths[5]); + // replace tabs with : + byte[] bytes = key.getBytes(); + int temporaryEnd = key.getLength(); + for (int i = 0; i < temporaryEnd; ++i) { + if (bytes[i] == '\t') { + bytes[i] = ':'; + } + } + // append the read number + key.append( + line.getBytes(), + fieldPositions[7] - 1, + fieldLengths[7] + 1); // +/- 1 to catch the preceding tab. + // convert the tab preceding the read number into a : + key.getBytes()[temporaryEnd] = ':'; + + // now the fragment + try { + fragment.clear(); + fragment.setInstrument(Text.decode(line.getBytes(), fieldPositions[0], fieldLengths[0])); + fragment.setRunNumber( + Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[1], fieldLengths[1]))); + // fragment.setFlowcellId(); + fragment.setLane( + Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[2], fieldLengths[2]))); + fragment.setTile( + Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[3], fieldLengths[3]))); + fragment.setXpos( + Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[4], fieldLengths[4]))); + fragment.setYpos( + Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[5], fieldLengths[5]))); + fragment.setRead( + Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[7], fieldLengths[7]))); + fragment.setFilterPassed(line.getBytes()[fieldPositions[10]] != '0'); + // fragment.setControlNumber(); + if (fieldLengths[6] > 0 + && line.getBytes()[fieldPositions[6]] == '0') // 0 is a null index sequence + { + fragment.setIndexSequence(null); + } else { + fragment.setIndexSequence( + Text.decode(line.getBytes(), fieldPositions[6], fieldLengths[6]).replace('.', 'N')); + } + } catch (CharacterCodingException e) { + throw new FormatException( + "Invalid character format at " + + makePositionMessage(this.pos - line.getLength()) + + "; line: " + + line); + } + + fragment.getSequence().append(line.getBytes(), fieldPositions[8], fieldLengths[8]); + fragment.getQuality().append(line.getBytes(), fieldPositions[9], fieldLengths[9]); + } + + /* + * This method applies some transformations to the read and quality data. + * + *
    + *
+ * <ul>
+ *   <li>'.' in the read are converted to 'N'
+ *   <li>the base quality encoding is converted to 'sanger', unless otherwise requested by the
+ *       configuration.
+ * </ul>
+ * + * @exception FormatException Thrown if the record contains base quality scores + * outside the range allowed by the format. + */ + private void postProcessSequencedFragment(SequencedFragment fragment) { + byte[] bytes = fragment.getSequence().getBytes(); + // replace . with N + for (int i = 0; i < fieldLengths[8]; ++i) { + if (bytes[i] == '.') { + bytes[i] = 'N'; + } + } + + if (qualityEncoding == BaseQualityEncoding.Illumina) { + // convert illumina to sanger scale + SequencedFragment.convertQuality( + fragment.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger); + } else // sanger qualities. + { + int outOfRangeElement = + SequencedFragment.verifyQuality(fragment.getQuality(), BaseQualityEncoding.Sanger); + if (outOfRangeElement >= 0) { + throw new FormatException( + "qseq base quality score out of range for Sanger Phred+33 format (found " + + (fragment.getQuality().getBytes()[outOfRangeElement] + - FormatConstants.SANGER_OFFSET) + + ").\n" + + "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n"); + } + } + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/QseqOutputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/QseqOutputFormat.java index c54031d..1e01701 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/QseqOutputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/QseqOutputFormat.java @@ -23,10 +23,9 @@ package org.seqdoop.hadoop_bam; import java.io.DataOutputStream; -import java.io.OutputStream; import java.io.IOException; +import java.io.OutputStream; import java.nio.ByteBuffer; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; @@ -38,159 +37,159 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.ReflectionUtils; - import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding; /** - * Output format for Illumina qseq format. - * Records are lines of tab-separated fields. Each record consists of - * - Machine name - * - Run number - * - Lane number - * - Tile number - * - X coordinate of the spot. Integer (can be negative). - * - Y coordinate of the spot. Integer (can be negative). - * - Index - * - Read Number - * - Sequence - * - Quality - * - Filter + * Output format for Illumina qseq format. Records are lines of tab-separated fields. Each record + * consists of - Machine name - Run number - Lane number - Tile number - X coordinate of the spot. + * Integer (can be negative). - Y coordinate of the spot. Integer (can be negative). 
- Index - Read + * Number - Sequence - Quality - Filter */ -public class QseqOutputFormat extends TextOutputFormat -{ - public static final String CONF_BASE_QUALITY_ENCODING = "hbam.qseq-output.base-quality-encoding"; - public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "illumina"; - - public static class QseqRecordWriter extends RecordWriter - { - static final byte[] newLine; - static final String delim = "\t"; - static { - try { - newLine = "\n".getBytes("us-ascii"); - } catch (java.io.UnsupportedEncodingException e) { - throw new RuntimeException("us-ascii encoding not supported!"); - } - } - - protected StringBuilder sBuilder = new StringBuilder(800); - protected OutputStream out; - BaseQualityEncoding baseQualityFormat; - - public QseqRecordWriter(Configuration conf, OutputStream out) - { - baseQualityFormat = BaseQualityEncoding.Illumina; - this.out = out; - setConf(conf); - } - - public void setConf(Configuration conf) - { - String setting = conf.get(CONF_BASE_QUALITY_ENCODING, CONF_BASE_QUALITY_ENCODING_DEFAULT); - if ("illumina".equals(setting)) - baseQualityFormat = BaseQualityEncoding.Illumina; - else if ("sanger".equals(setting)) - baseQualityFormat = BaseQualityEncoding.Sanger; - else - throw new RuntimeException("Invalid property value '" + setting + "' for " + CONF_BASE_QUALITY_ENCODING + ". Valid values are 'illumina' or 'sanger'"); - } - - public void write(Text ignored_key, SequencedFragment seq) throws IOException - { - sBuilder.delete(0, sBuilder.length()); // clear - - sBuilder.append( seq.getInstrument() == null ? "" : seq.getInstrument() ).append(delim); - sBuilder.append( seq.getRunNumber() == null ? "" : seq.getRunNumber().toString() ).append(delim); - sBuilder.append( seq.getLane() == null ? "" : seq.getLane().toString() ).append(delim); - sBuilder.append( seq.getTile() == null ? "" : seq.getTile().toString() ).append(delim); - sBuilder.append( seq.getXpos() == null ? "" : seq.getXpos().toString() ).append(delim); - sBuilder.append( seq.getYpos() == null ? "" : seq.getYpos().toString() ).append(delim); - - String index; - if (seq.getIndexSequence() == null || seq.getIndexSequence().isEmpty()) - index = "0"; - else - index = seq.getIndexSequence().replace('N', '.'); - sBuilder.append( index ).append(delim); - - sBuilder.append( seq.getRead() == null ? "" : seq.getRead().toString() ).append(delim); - // here we also replace 'N' with '.' - sBuilder.append( seq.getSequence() == null ? "" : seq.getSequence().toString().replace('N', '.')).append(delim); - - //////// quality may have to be re-coded - if (seq.getQuality() == null) - sBuilder.append(""); - else - { - int startPos = sBuilder.length(); - sBuilder.append(seq.getQuality().toString()); - if (baseQualityFormat == BaseQualityEncoding.Sanger) - { - // do nothing - } - else if (baseQualityFormat == BaseQualityEncoding.Illumina) - { - // recode the quality in-place - for (int i = startPos; i < sBuilder.length(); ++i) - { - // cast to avoid warning about possible loss of precision for assigning a char from an int. - char newValue = (char)(sBuilder.charAt(i) + 31); // 64 - 33 = 31: difference between illumina and sanger encoding - if (newValue > 126) - throw new RuntimeException("output quality score over allowed range. Maybe you meant to write in Sanger format?"); - sBuilder.setCharAt(i, newValue); - } - } - else - throw new RuntimeException("BUG! 
Unknown base quality format value " + baseQualityFormat + " in QseqRecordWriter"); - } - sBuilder.append(delim); - ///////// - sBuilder.append((seq.getFilterPassed() == null || seq.getFilterPassed() ) ? 1 : 0); - - try { - ByteBuffer buf = Text.encode(sBuilder.toString()); - out.write(buf.array(), 0, buf.limit()); - } catch (java.nio.charset.CharacterCodingException e) { - throw new RuntimeException("Error encoding qseq record: " + seq); - } - out.write(newLine, 0, newLine.length); - } - - public void close(TaskAttemptContext context) throws IOException - { - out.close(); - } - } - - public RecordWriter getRecordWriter(TaskAttemptContext task) - throws IOException - { - Configuration conf = task.getConfiguration(); - boolean isCompressed = getCompressOutput(task); - - CompressionCodec codec = null; - String extension = ""; - - if (isCompressed) - { - Class codecClass = getOutputCompressorClass(task, GzipCodec.class); - codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); - extension = codec.getDefaultExtension(); - } - - Path file = getDefaultWorkFile(task, extension); - FileSystem fs = file.getFileSystem(conf); - - OutputStream output; - - if (isCompressed) - { - FSDataOutputStream fileOut = fs.create(file, false); - output = new DataOutputStream(codec.createOutputStream(fileOut)); - } - else - output = fs.create(file, false); - - return new QseqRecordWriter(conf, output); - } +public class QseqOutputFormat extends TextOutputFormat { + + public static final String CONF_BASE_QUALITY_ENCODING = "hbam.qseq-output.base-quality-encoding"; + public static final String CONF_BASE_QUALITY_ENCODING_DEFAULT = "illumina"; + + public RecordWriter getRecordWriter(TaskAttemptContext task) + throws IOException { + Configuration conf = task.getConfiguration(); + boolean isCompressed = getCompressOutput(task); + + CompressionCodec codec = null; + String extension = ""; + + if (isCompressed) { + Class codecClass = + getOutputCompressorClass(task, GzipCodec.class); + codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); + extension = codec.getDefaultExtension(); + } + + Path file = getDefaultWorkFile(task, extension); + FileSystem fs = file.getFileSystem(conf); + + OutputStream output; + + if (isCompressed) { + FSDataOutputStream fileOut = fs.create(file, false); + output = new DataOutputStream(codec.createOutputStream(fileOut)); + } else { + output = fs.create(file, false); + } + + return new QseqRecordWriter(conf, output); + } + + public static class QseqRecordWriter extends RecordWriter { + + static final byte[] newLine; + static final String delim = "\t"; + + static { + try { + newLine = "\n".getBytes("us-ascii"); + } catch (java.io.UnsupportedEncodingException e) { + throw new RuntimeException("us-ascii encoding not supported!"); + } + } + + protected StringBuilder sBuilder = new StringBuilder(800); + protected OutputStream out; + BaseQualityEncoding baseQualityFormat; + + public QseqRecordWriter(Configuration conf, OutputStream out) { + baseQualityFormat = BaseQualityEncoding.Illumina; + this.out = out; + setConf(conf); + } + + public void setConf(Configuration conf) { + String setting = conf.get(CONF_BASE_QUALITY_ENCODING, CONF_BASE_QUALITY_ENCODING_DEFAULT); + if ("illumina".equals(setting)) { + baseQualityFormat = BaseQualityEncoding.Illumina; + } else if ("sanger".equals(setting)) { + baseQualityFormat = BaseQualityEncoding.Sanger; + } else { + throw new RuntimeException( + "Invalid property value '" + + setting + + "' for " + + CONF_BASE_QUALITY_ENCODING + 
+ ". Valid values are 'illumina' or 'sanger'"); + } + } + + public void write(Text ignored_key, SequencedFragment seq) throws IOException { + sBuilder.delete(0, sBuilder.length()); // clear + + sBuilder.append(seq.getInstrument() == null ? "" : seq.getInstrument()).append(delim); + sBuilder + .append(seq.getRunNumber() == null ? "" : seq.getRunNumber().toString()) + .append(delim); + sBuilder.append(seq.getLane() == null ? "" : seq.getLane().toString()).append(delim); + sBuilder.append(seq.getTile() == null ? "" : seq.getTile().toString()).append(delim); + sBuilder.append(seq.getXpos() == null ? "" : seq.getXpos().toString()).append(delim); + sBuilder.append(seq.getYpos() == null ? "" : seq.getYpos().toString()).append(delim); + + String index; + if (seq.getIndexSequence() == null || seq.getIndexSequence().isEmpty()) { + index = "0"; + } else { + index = seq.getIndexSequence().replace('N', '.'); + } + sBuilder.append(index).append(delim); + + sBuilder.append(seq.getRead() == null ? "" : seq.getRead().toString()).append(delim); + // here we also replace 'N' with '.' + sBuilder + .append(seq.getSequence() == null ? "" : seq.getSequence().toString().replace('N', '.')) + .append(delim); + + //////// quality may have to be re-coded + if (seq.getQuality() == null) { + sBuilder.append(""); + } else { + int startPos = sBuilder.length(); + sBuilder.append(seq.getQuality().toString()); + if (baseQualityFormat == BaseQualityEncoding.Sanger) { + // do nothing + } else if (baseQualityFormat == BaseQualityEncoding.Illumina) { + // recode the quality in-place + for (int i = startPos; i < sBuilder.length(); ++i) { + // cast to avoid warning about possible loss of precision for assigning a char from an + // int. + char newValue = + (char) + (sBuilder.charAt(i) + + 31); // 64 - 33 = 31: difference between illumina and sanger encoding + if (newValue > 126) { + throw new RuntimeException( + "output quality score over allowed range. Maybe you meant to write in Sanger format?"); + } + sBuilder.setCharAt(i, newValue); + } + } else { + throw new RuntimeException( + "BUG! Unknown base quality format value " + + baseQualityFormat + + " in QseqRecordWriter"); + } + } + sBuilder.append(delim); + ///////// + sBuilder.append((seq.getFilterPassed() == null || seq.getFilterPassed()) ? 
1 : 0); + + try { + ByteBuffer buf = Text.encode(sBuilder.toString()); + out.write(buf.array(), 0, buf.limit()); + } catch (java.nio.charset.CharacterCodingException e) { + throw new RuntimeException("Error encoding qseq record: " + seq); + } + out.write(newLine, 0, newLine.length); + } + + public void close(TaskAttemptContext context) throws IOException { + out.close(); + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/ReferenceFragment.java b/src/main/java/org/seqdoop/hadoop_bam/ReferenceFragment.java index 5455288..ee52ec1 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/ReferenceFragment.java +++ b/src/main/java/org/seqdoop/hadoop_bam/ReferenceFragment.java @@ -22,130 +22,132 @@ package org.seqdoop.hadoop_bam; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableUtils; -import java.io.IOException; -import java.io.DataInput; -import java.io.DataOutput; - // partly based on SequencedFragment -// note: this class is supposed to represent a single line of a fasta input file, augmented by chromosome/contig name and start position - -public class ReferenceFragment implements Writable -{ - protected Text sequence = new Text(); - - protected Integer position; - protected String indexSequence; - - public void clear() - { - sequence.clear(); - indexSequence = null; - position = null; +// note: this class is supposed to represent a single line of a fasta input file, augmented by +// chromosome/contig name and start position + +public class ReferenceFragment implements Writable { + + protected Text sequence = new Text(); + + protected Integer position; + protected String indexSequence; + + public void clear() { + sequence.clear(); + indexSequence = null; + position = null; + } + + /** + * Get sequence Text object. Trade encapsulation for efficiency. Here we expose the internal Text + * object so that data may be read and written diretly from/to it. + * + *

Sequence should always be written using CAPITAL letters and 'N' for unknown bases. + */ + public Text getSequence() { + return sequence; + } + + public void setSequence(Text seq) { + if (seq == null) { + throw new IllegalArgumentException("can't have a null sequence"); } - - /** - * Get sequence Text object. - * Trade encapsulation for efficiency. Here we expose the internal Text - * object so that data may be read and written diretly from/to it. - * - * Sequence should always be written using CAPITAL letters and 'N' for unknown bases. - */ - public Text getSequence() { return sequence; } - - /** - * Get quality Text object. - * Trade encapsulation for efficiency. Here we expose the internal Text - * object so that data may be read and written diretly from/to it. - * - */ - public void setPosition(Integer pos) { - if (pos == null) - throw new IllegalArgumentException("can't have null reference position"); - position = pos; + sequence = seq; + } + + public Integer getPosition() { + return position; + } + + /** + * Get quality Text object. Trade encapsulation for efficiency. Here we expose the internal Text + * object so that data may be read and written diretly from/to it. + */ + public void setPosition(Integer pos) { + if (pos == null) { + throw new IllegalArgumentException("can't have null reference position"); } + position = pos; + } - public void setIndexSequence(String v) { - if (v == null) - throw new IllegalArgumentException("can't have null index sequence"); - indexSequence = v; - } + public String getIndexSequence() { + return indexSequence; + } - public void setSequence(Text seq) - { - if (seq == null) - throw new IllegalArgumentException("can't have a null sequence"); - sequence = seq; + public void setIndexSequence(String v) { + if (v == null) { + throw new IllegalArgumentException("can't have null index sequence"); } - - public Integer getPosition() { return position; } - public String getIndexSequence() { return indexSequence; } - - /** - * Recreates a pseudo fasta record with the fields available. - */ - public String toString() - { - String delim = "\t"; - StringBuilder builder = new StringBuilder(800); - builder.append(indexSequence).append(delim); - builder.append(position).append(delim); - builder.append(sequence); - return builder.toString(); + indexSequence = v; + } + + /** Recreates a pseudo fasta record with the fields available. 
*/ + public String toString() { + String delim = "\t"; + StringBuilder builder = new StringBuilder(800); + builder.append(indexSequence).append(delim); + builder.append(position).append(delim); + builder.append(sequence); + return builder.toString(); + } + + public boolean equals(Object other) { + if (other != null && other instanceof ReferenceFragment) { + ReferenceFragment otherFrag = (ReferenceFragment) other; + + if (position == null && otherFrag.position != null + || position != null && !position.equals(otherFrag.position)) { + return false; + } + if (indexSequence == null && otherFrag.indexSequence != null + || indexSequence != null && !indexSequence.equals(otherFrag.indexSequence)) { + return false; + } + // sequence can't be null + if (!sequence.equals(otherFrag.sequence)) { + return false; + } + + return true; + } else { + return false; } + } - public boolean equals(Object other) - { - if (other != null && other instanceof ReferenceFragment) - { - ReferenceFragment otherFrag = (ReferenceFragment)other; - - if (position == null && otherFrag.position != null || position != null && !position.equals(otherFrag.position)) - return false; - if (indexSequence == null && otherFrag.indexSequence != null || indexSequence != null && !indexSequence.equals(otherFrag.indexSequence)) - return false; - // sequence can't be null - if (!sequence.equals(otherFrag.sequence)) - return false; - - return true; - } - else - return false; - } + @Override + public int hashCode() { + int result = sequence.hashCode(); + result = 31 * result + (position != null ? position.hashCode() : 0); + result = 31 * result + (indexSequence != null ? indexSequence.hashCode() : 0); + return result; + } - @Override - public int hashCode() { - int result = sequence.hashCode(); - result = 31 * result + (position != null ? position.hashCode() : 0); - result = 31 * result + (indexSequence != null ? indexSequence.hashCode() : 0); - return result; - } + public void readFields(DataInput in) throws IOException { + // serialization order: + // 1) sequence + // 2) indexSequence (chromosome/contig name) + // 3) position of first base in this line of the fasta file - public void readFields(DataInput in) throws IOException - { - // serialization order: - // 1) sequence - // 2) indexSequence (chromosome/contig name) - // 3) position of first base in this line of the fasta file + this.clear(); - this.clear(); + sequence.readFields(in); - sequence.readFields(in); + indexSequence = WritableUtils.readString(in); + position = WritableUtils.readVInt(in); + } - indexSequence = WritableUtils.readString(in); - position = WritableUtils.readVInt(in); - } - - public void write(DataOutput out) throws IOException - { - sequence.write(out); + public void write(DataOutput out) throws IOException { + sequence.write(out); - WritableUtils.writeString(out, indexSequence); - WritableUtils.writeVInt(out, position); - - } + WritableUtils.writeString(out, indexSequence); + WritableUtils.writeVInt(out, position); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/SAMFormat.java b/src/main/java/org/seqdoop/hadoop_bam/SAMFormat.java index 60394b2..7e2fd08 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/SAMFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/SAMFormat.java @@ -22,42 +22,54 @@ package org.seqdoop.hadoop_bam; -import java.io.InputStream; import java.io.IOException; - +import java.io.InputStream; import org.apache.hadoop.fs.Path; /** Describes a SAM format. 
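As a usage illustration of the format-inference helpers defined here, a hypothetical sketch that prefers the file extension and falls back to sniffing the first byte of the data (the path and Configuration are illustrative):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.seqdoop.hadoop_bam.SAMFormat;

class FormatProbe {
  static SAMFormat probe(Path path, Configuration conf) throws IOException {
    // Try the extension first (.sam/.bam/.cram), which needs no I/O.
    SAMFormat format = SAMFormat.inferFromFilePath(path);
    if (format == null) {
      // Fall back to the data: inferFromData reads one byte and closes the stream itself.
      FileSystem fs = path.getFileSystem(conf);
      format = SAMFormat.inferFromData(fs.open(path));
    }
    return format;
  }
}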
*/ public enum SAMFormat { - SAM, BAM, CRAM; + SAM, + BAM, + CRAM; - /** Infers the SAM format by looking at the filename of the given path. - * - * @see #inferFromFilePath(String) - */ - public static SAMFormat inferFromFilePath(final Path path) { - return inferFromFilePath(path.getName()); - } + /** + * Infers the SAM format by looking at the filename of the given path. + * + * @see #inferFromFilePath(String) + */ + public static SAMFormat inferFromFilePath(final Path path) { + return inferFromFilePath(path.getName()); + } - /** Infers the SAM format by looking at the extension of the given file - * name. *.sam is recognized as {@link #SAM}, - * *.bam as {@link #BAM}, and *.bam as {@link #CRAM}. - */ - public static SAMFormat inferFromFilePath(final String name) { - if (name.endsWith(".bam")) return BAM; - if (name.endsWith(".cram")) return CRAM; - if (name.endsWith(".sam")) return SAM; - return null; - } + /** + * Infers the SAM format by looking at the extension of the given file name. *.sam is + * recognized as {@link #SAM}, *.bam as {@link #BAM}, and *.bam as + * {@link #CRAM}. + */ + public static SAMFormat inferFromFilePath(final String name) { + if (name.endsWith(".bam")) { + return BAM; + } + if (name.endsWith(".cram")) { + return CRAM; + } + if (name.endsWith(".sam")) { + return SAM; + } + return null; + } - public static SAMFormat inferFromData(final InputStream in) throws IOException { - final byte b = (byte)in.read(); - in.close(); - switch (b) { - case 0x1f: return SAMFormat.BAM; - case 0x43: return SAMFormat.CRAM; - case '@': return SAMFormat.SAM; - } - return null; - } + public static SAMFormat inferFromData(final InputStream in) throws IOException { + final byte b = (byte) in.read(); + in.close(); + switch (b) { + case 0x1f: + return SAMFormat.BAM; + case 0x43: + return SAMFormat.CRAM; + case '@': + return SAMFormat.SAM; + } + return null; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/SAMInputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/SAMInputFormat.java index 51e3958..19a393c 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/SAMInputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/SAMInputFormat.java @@ -23,34 +23,31 @@ package org.seqdoop.hadoop_bam; import java.io.IOException; - import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -/** An {@link org.apache.hadoop.mapreduce.InputFormat} for SAM files. Values - * are the individual records; see {@link BAMRecordReader} for the meaning of - * the key. +/** + * An {@link org.apache.hadoop.mapreduce.InputFormat} for SAM files. Values are the individual + * records; see {@link BAMRecordReader} for the meaning of the key. */ -public class SAMInputFormat - extends FileInputFormat -{ - /** Returns a {@link SAMRecordReader} initialized with the parameters. */ - @Override public RecordReader - createRecordReader(InputSplit split, TaskAttemptContext ctx) - throws InterruptedException, IOException - { - final RecordReader rr = - new SAMRecordReader(); - rr.initialize(split, ctx); - return rr; - } +public class SAMInputFormat extends FileInputFormat { + + /** Returns a {@link SAMRecordReader} initialized with the parameters. 
*/ + @Override + public RecordReader createRecordReader( + InputSplit split, TaskAttemptContext ctx) throws InterruptedException, IOException { + final RecordReader rr = new SAMRecordReader(); + rr.initialize(split, ctx); + return rr; + } - @Override public boolean isSplitable(JobContext job, Path path) { - return super.isSplitable(job, path); - } + @Override + public boolean isSplitable(JobContext job, Path path) { + return super.isSplitable(job, path); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java b/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java index 5de204c..d6159d4 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java @@ -22,309 +22,341 @@ package org.seqdoop.hadoop_bam; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFormatException; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SAMTextHeaderCodec; +import htsjdk.samtools.SamInputResource; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; import java.io.ByteArrayInputStream; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; import java.io.StringWriter; import java.io.UnsupportedEncodingException; - import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileSplit; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFormatException; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordIterator; -import htsjdk.samtools.SAMTextHeaderCodec; -import htsjdk.samtools.SamInputResource; -import htsjdk.samtools.SamReader; -import htsjdk.samtools.SamReaderFactory; -import htsjdk.samtools.ValidationStringency; - import org.seqdoop.hadoop_bam.util.SAMHeaderReader; /** See {@link BAMRecordReader} for the meaning of the key. */ -public class SAMRecordReader - extends RecordReader -{ - private LongWritable key = new LongWritable(); - private SAMRecordWritable record = new SAMRecordWritable(); - - private FSDataInputStream input; - private SAMRecordIterator iterator; - private long start, end; - private boolean isInitialized = false; - - private WorkaroundingStream waInput; - - @Override public void initialize(InputSplit spl, TaskAttemptContext ctx) - throws IOException - { - // This method should only be called once (see Hadoop API). However, - // there seems to be disagreement between implementations that call - // initialize() and Hadoop-BAM's own code that relies on - // {@link SAMInputFormat} to call initialize() when the reader is - // created. Therefore we add this check for the time being. 
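For orientation on how this input format is typically consumed, a hypothetical driver and mapper sketch (class names, paths, and the trivial counting logic are illustrative, not part of Hadoop-BAM):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.seqdoop.hadoop_bam.SAMInputFormat;
import org.seqdoop.hadoop_bam.SAMRecordWritable;

public class SamRecordCount {
  // Emits one count per alignment; the key is the position-based key described in BAMRecordReader.
  public static class CountMapper
      extends Mapper<LongWritable, SAMRecordWritable, LongWritable, LongWritable> {
    private static final LongWritable ONE = new LongWritable(1);

    @Override
    protected void map(LongWritable key, SAMRecordWritable value, Context ctx)
        throws java.io.IOException, InterruptedException {
      ctx.write(key, ONE);
    }
  }

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "sam-record-count");
    job.setJarByClass(SamRecordCount.class);
    job.setInputFormatClass(SAMInputFormat.class);
    job.setMapperClass(CountMapper.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}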
- if(isInitialized) - close(); - isInitialized = true; - - final FileSplit split = (FileSplit)spl; - - this.start = split.getStart(); - this.end = start + split.getLength(); - - final Configuration conf = ctx.getConfiguration(); - - final ValidationStringency stringency = - SAMHeaderReader.getValidationStringency(conf); - - final Path file = split.getPath(); - final FileSystem fs = file.getFileSystem(conf); - - input = fs.open(file); - - // SAMFileReader likes to make our life difficult, so complexity ensues. - // The basic problem is that SAMFileReader buffers its input internally, - // which causes two issues. - // - // Issue #1 is that SAMFileReader requires that its input begins with a - // SAM header. This is not fine for reading from the middle of a file. - // Because of the buffering, if we have the reader read the header from - // the beginning of the file and then seek to where we want to read - // records from, it'll have buffered some records from immediately after - // the header, which is no good. Thus we need to read the header - // separately and then use a custom stream that wraps the input stream, - // inserting the header at the beginning of it. (Note the spurious - // re-encoding of the header so that the reader can decode it.) - // - // Issue #2 is handling the boundary between two input splits. The best - // way seems to be the classic "in later splits, skip the first line, and - // in every split finish reading a partial line at the end of the split", - // but that latter part is a bit complicated here. Due to the buffering, - // we can easily overshoot: as soon as the stream moves past the end of - // the split, SAMFileReader has buffered some records past the end. The - // basic fix here is to have our custom stream count the number of bytes - // read and to stop after the split size. Unfortunately this prevents us - // from reading the last partial line, so our stream actually allows - // reading to the next newline after the actual end. - - final SAMFileHeader header = createSamReader(input, stringency).getFileHeader(); - - waInput = new WorkaroundingStream(input, header); - - final boolean firstSplit = this.start == 0; - - if (firstSplit) { - // Skip the header because we already have it, and adjust the start - // to match. - final int headerLength = waInput.getRemainingHeaderLength(); - input.seek(headerLength); - this.start += headerLength; - } else - input.seek(--this.start); - - // Creating the iterator causes reading from the stream, so make sure - // to start counting this early. - waInput.setLength(this.end - this.start); - - iterator = createSamReader(waInput, stringency).iterator(); - - if (!firstSplit) { - // Skip the first line, it'll be handled with the previous split. 
- try { - if (iterator.hasNext()) - iterator.next(); - } catch (SAMFormatException e) {} - } - } - - private SamReader createSamReader(InputStream in, ValidationStringency stringency) { - SamReaderFactory readerFactory = SamReaderFactory.makeDefault() - .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) - .setUseAsyncIo(false); - if (stringency != null) { - readerFactory.validationStringency(stringency); - } - return readerFactory.open(SamInputResource.of(in)); - } - - @Override public void close() throws IOException { iterator.close(); } - - @Override public float getProgress() throws IOException { - final long pos = input.getPos(); - if (pos >= end) - return 1; - else - return (float)(pos - start) / (end - start); - } - @Override public LongWritable getCurrentKey () { return key; } - @Override public SAMRecordWritable getCurrentValue() { return record; } - - @Override public boolean nextKeyValue() { - if (!iterator.hasNext()) - return false; - - final SAMRecord r = iterator.next(); - key.set(BAMRecordReader.getKey(r)); - record.set(r); - return true; - } +public class SAMRecordReader extends RecordReader { + + private LongWritable key = new LongWritable(); + private SAMRecordWritable record = new SAMRecordWritable(); + + private FSDataInputStream input; + private SAMRecordIterator iterator; + private long start, end; + private boolean isInitialized = false; + + private WorkaroundingStream waInput; + + @Override + public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { + // This method should only be called once (see Hadoop API). However, + // there seems to be disagreement between implementations that call + // initialize() and Hadoop-BAM's own code that relies on + // {@link SAMInputFormat} to call initialize() when the reader is + // created. Therefore we add this check for the time being. + if (isInitialized) { + close(); + } + isInitialized = true; + + final FileSplit split = (FileSplit) spl; + + this.start = split.getStart(); + this.end = start + split.getLength(); + + final Configuration conf = ctx.getConfiguration(); + + final ValidationStringency stringency = SAMHeaderReader.getValidationStringency(conf); + + final Path file = split.getPath(); + final FileSystem fs = file.getFileSystem(conf); + + input = fs.open(file); + + // SAMFileReader likes to make our life difficult, so complexity ensues. + // The basic problem is that SAMFileReader buffers its input internally, + // which causes two issues. + // + // Issue #1 is that SAMFileReader requires that its input begins with a + // SAM header. This is not fine for reading from the middle of a file. + // Because of the buffering, if we have the reader read the header from + // the beginning of the file and then seek to where we want to read + // records from, it'll have buffered some records from immediately after + // the header, which is no good. Thus we need to read the header + // separately and then use a custom stream that wraps the input stream, + // inserting the header at the beginning of it. (Note the spurious + // re-encoding of the header so that the reader can decode it.) + // + // Issue #2 is handling the boundary between two input splits. The best + // way seems to be the classic "in later splits, skip the first line, and + // in every split finish reading a partial line at the end of the split", + // but that latter part is a bit complicated here. 
Due to the buffering, + // we can easily overshoot: as soon as the stream moves past the end of + // the split, SAMFileReader has buffered some records past the end. The + // basic fix here is to have our custom stream count the number of bytes + // read and to stop after the split size. Unfortunately this prevents us + // from reading the last partial line, so our stream actually allows + // reading to the next newline after the actual end. + + final SAMFileHeader header = createSamReader(input, stringency).getFileHeader(); + + waInput = new WorkaroundingStream(input, header); + + final boolean firstSplit = this.start == 0; + + if (firstSplit) { + // Skip the header because we already have it, and adjust the start + // to match. + final int headerLength = waInput.getRemainingHeaderLength(); + input.seek(headerLength); + this.start += headerLength; + } else { + input.seek(--this.start); + } + + // Creating the iterator causes reading from the stream, so make sure + // to start counting this early. + waInput.setLength(this.end - this.start); + + iterator = createSamReader(waInput, stringency).iterator(); + + if (!firstSplit) { + // Skip the first line, it'll be handled with the previous split. + try { + if (iterator.hasNext()) { + iterator.next(); + } + } catch (SAMFormatException e) { + } + } + } + + private SamReader createSamReader(InputStream in, ValidationStringency stringency) { + SamReaderFactory readerFactory = + SamReaderFactory.makeDefault() + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setUseAsyncIo(false); + if (stringency != null) { + readerFactory.validationStringency(stringency); + } + return readerFactory.open(SamInputResource.of(in)); + } + + @Override + public void close() throws IOException { + iterator.close(); + } + + @Override + public float getProgress() throws IOException { + final long pos = input.getPos(); + if (pos >= end) { + return 1; + } else { + return (float) (pos - start) / (end - start); + } + } + + @Override + public LongWritable getCurrentKey() { + return key; + } + + @Override + public SAMRecordWritable getCurrentValue() { + return record; + } + + @Override + public boolean nextKeyValue() { + if (!iterator.hasNext()) { + return false; + } + + final SAMRecord r = iterator.next(); + key.set(BAMRecordReader.getKey(r)); + record.set(r); + return true; + } } // See the long comment in SAMRecordReader.initialize() for what this does. class WorkaroundingStream extends InputStream { - private final InputStream stream, headerStream; - private boolean headerRemaining; - private long length; - private int headerLength; - - private boolean lookingForEOL = false, - foundEOL = false, - strippingAts = false; // HACK, see read(byte[], int, int). 
- - public WorkaroundingStream(InputStream stream, SAMFileHeader header) { - this.stream = stream; - - String text = header.getTextHeader(); - if (text == null) { - StringWriter writer = new StringWriter(); - new SAMTextHeaderCodec().encode(writer, header); - text = writer.toString(); - } - byte[] b; - try { - b = text.getBytes("UTF-8"); - } catch (UnsupportedEncodingException e) { - b = null; - assert false; - } - headerRemaining = true; - headerLength = b.length; - headerStream = new ByteArrayInputStream(b); - - this.length = Long.MAX_VALUE; - } - - public void setLength(long length) { - this.length = length; - } - - public int getRemainingHeaderLength() { - return headerLength; - } - - private byte[] readBuf = new byte[1]; - @Override public int read() throws IOException { - for (;;) switch (read(readBuf)) { - case 0: continue; - case 1: return readBuf[0]; - case -1: return -1; - } - } - - @Override public int read(byte[] buf, int off, int len) throws IOException { - if (!headerRemaining) - return streamRead(buf, off, len); - - int h; - if (strippingAts) - h = 0; - else { - h = headerStream.read(buf, off, len); - if (h == -1) { - // This should only happen when there was no header at all, in - // which case Picard doesn't throw an error until trying to read - // a record, for some reason. (Perhaps an oversight.) Thus we - // need to handle that case here. - assert (headerLength == 0); - h = 0; - } else if (h < headerLength) { - headerLength -= h; - return h; - } - strippingAts = true; - headerStream.close(); - } - - final int newOff = off + h; - int s = streamRead(buf, newOff, len - h); - - if (s <= 0) - return strippingAts ? s : h; - - // HACK HACK HACK. - // - // We gave all of the header, which means that SAMFileReader is still - // trying to read more header lines. If we're in a split that isn't at - // the start of the SAM file, we could be in the middle of a line and - // thus see @ characters at the start of our data. Then SAMFileReader - // would try to understand those as header lines and the end result is - // that it throws an error, since they aren't actually header lines, - // they're just part of a SAM record. - // - // So, if we're done with the header, strip all @ characters we see. Thus - // SAMFileReader will stop reading the header there and won't throw an - // exception until we use its SAMRecordIterator, at which point we can - // catch it, because we know to expect it. - // - // headerRemaining remains true while it's possible that there are still - // @ characters coming. - - int i = newOff-1; - while (buf[++i] == '@' && --s > 0); - - if (i != newOff) - System.arraycopy(buf, i, buf, newOff, s); - - headerRemaining = s == 0; - return h + s; - } - private int streamRead(byte[] buf, int off, int len) throws IOException { - if (len > length) { - if (foundEOL) - return 0; - lookingForEOL = true; - } - int n = stream.read(buf, off, len); - if (n > 0) { - n = tryFindEOL(buf, off, n); - length -= n; - } - return n; - } - private int tryFindEOL(byte[] buf, int off, int len) { - assert !foundEOL; - - if (!lookingForEOL || len < length) - return len; - - // Find the first EOL between length and len. - - // len >= length so length fits in an int. - int i = Math.max(0, (int)length - 1); - - for (; i < len; ++i) { - if (buf[off + i] == '\n') { - foundEOL = true; - return i + 1; - } - } - return len; - } - - @Override public void close() throws IOException { - stream.close(); - } - - @Override public int available() throws IOException { - return headerRemaining ? 
headerStream.available() : stream.available(); - } + + private final InputStream stream, headerStream; + private boolean headerRemaining; + private long length; + private int headerLength; + + private boolean lookingForEOL = false, + foundEOL = false, + strippingAts = false; // HACK, see read(byte[], int, int). + private byte[] readBuf = new byte[1]; + + public WorkaroundingStream(InputStream stream, SAMFileHeader header) { + this.stream = stream; + + String text = header.getTextHeader(); + if (text == null) { + StringWriter writer = new StringWriter(); + new SAMTextHeaderCodec().encode(writer, header); + text = writer.toString(); + } + byte[] b; + try { + b = text.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + b = null; + assert false; + } + headerRemaining = true; + headerLength = b.length; + headerStream = new ByteArrayInputStream(b); + + this.length = Long.MAX_VALUE; + } + + public void setLength(long length) { + this.length = length; + } + + public int getRemainingHeaderLength() { + return headerLength; + } + + @Override + public int read() throws IOException { + for (; ; ) { + switch (read(readBuf)) { + case 0: + continue; + case 1: + return readBuf[0]; + case -1: + return -1; + } + } + } + + @Override + public int read(byte[] buf, int off, int len) throws IOException { + if (!headerRemaining) { + return streamRead(buf, off, len); + } + + int h; + if (strippingAts) { + h = 0; + } else { + h = headerStream.read(buf, off, len); + if (h == -1) { + // This should only happen when there was no header at all, in + // which case Picard doesn't throw an error until trying to read + // a record, for some reason. (Perhaps an oversight.) Thus we + // need to handle that case here. + assert (headerLength == 0); + h = 0; + } else if (h < headerLength) { + headerLength -= h; + return h; + } + strippingAts = true; + headerStream.close(); + } + + final int newOff = off + h; + int s = streamRead(buf, newOff, len - h); + + if (s <= 0) { + return strippingAts ? s : h; + } + + // HACK HACK HACK. + // + // We gave all of the header, which means that SAMFileReader is still + // trying to read more header lines. If we're in a split that isn't at + // the start of the SAM file, we could be in the middle of a line and + // thus see @ characters at the start of our data. Then SAMFileReader + // would try to understand those as header lines and the end result is + // that it throws an error, since they aren't actually header lines, + // they're just part of a SAM record. + // + // So, if we're done with the header, strip all @ characters we see. Thus + // SAMFileReader will stop reading the header there and won't throw an + // exception until we use its SAMRecordIterator, at which point we can + // catch it, because we know to expect it. + // + // headerRemaining remains true while it's possible that there are still + // @ characters coming. + + int i = newOff - 1; + while (buf[++i] == '@' && --s > 0) {; + } + + if (i != newOff) { + System.arraycopy(buf, i, buf, newOff, s); + } + + headerRemaining = s == 0; + return h + s; + } + + private int streamRead(byte[] buf, int off, int len) throws IOException { + if (len > length) { + if (foundEOL) { + return 0; + } + lookingForEOL = true; + } + int n = stream.read(buf, off, len); + if (n > 0) { + n = tryFindEOL(buf, off, n); + length -= n; + } + return n; + } + + private int tryFindEOL(byte[] buf, int off, int len) { + assert !foundEOL; + + if (!lookingForEOL || len < length) { + return len; + } + + // Find the first EOL between length and len. 
+ + // len >= length so length fits in an int. + int i = Math.max(0, (int) length - 1); + + for (; i < len; ++i) { + if (buf[off + i] == '\n') { + foundEOL = true; + return i + 1; + } + } + return len; + } + + @Override + public void close() throws IOException { + stream.close(); + } + + @Override + public int available() throws IOException { + return headerRemaining ? headerStream.available() : stream.available(); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/SAMRecordWritable.java b/src/main/java/org/seqdoop/hadoop_bam/SAMRecordWritable.java index 04db587..3bc53f6 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/SAMRecordWritable.java +++ b/src/main/java/org/seqdoop/hadoop_bam/SAMRecordWritable.java @@ -22,54 +22,59 @@ package org.seqdoop.hadoop_bam; -import java.io.DataOutput; +import htsjdk.samtools.BAMRecordCodec; +import htsjdk.samtools.SAMRecord; import java.io.DataInput; +import java.io.DataOutput; import java.io.IOException; - import org.apache.hadoop.io.Writable; - -import htsjdk.samtools.BAMRecordCodec; -import htsjdk.samtools.SAMRecord; - import org.seqdoop.hadoop_bam.util.DataInputWrapper; import org.seqdoop.hadoop_bam.util.DataOutputWrapper; -/** A {@link Writable} {@link SAMRecord}. +/** + * A {@link Writable} {@link SAMRecord}. * - *

In every mapper, the record will have a header, since BAMInputFormat
- * provides one. It is lost when transferring the SAMRecord to a reducer,
- * however. The current implementation of {@link BAMRecordCodec} does not
- * require a record for encoding nor decoding of a SAMRecord, so
- * this fortunately doesn't matter for either {@link #write} or {@link
- * #readFields}.
+ *
+ * <p>
In every mapper, the record will have a header, since BAMInputFormat provides one. It is lost + * when transferring the SAMRecord to a reducer, however. The current implementation of {@link + * BAMRecordCodec} does not require a record for encoding nor decoding of a SAMRecord, + * so this fortunately doesn't matter for either {@link #write} or {@link #readFields}. */ public class SAMRecordWritable implements Writable { - private static final BAMRecordCodec lazyCodec = - new BAMRecordCodec(null, new LazyBAMRecordFactory()); - private SAMRecord record; + private static final BAMRecordCodec lazyCodec = + new BAMRecordCodec(null, new LazyBAMRecordFactory()); + + private SAMRecord record; + + public SAMRecord get() { + return record; + } + + public void set(SAMRecord r) { + record = r; + } - public SAMRecord get() { return record; } - public void set(SAMRecord r) { record = r; } + @Override + public void write(DataOutput out) throws IOException { + // In theory, it shouldn't matter whether we give a header to + // BAMRecordCodec or not, since the representation of an alignment in BAM + // doesn't depend on the header data at all. Only its interpretation + // does, and a simple read/write codec shouldn't really have anything to + // say about that. (But in practice, it already does matter for decode(), + // which is why LazyBAMRecordFactory exists.) + final BAMRecordCodec codec = new BAMRecordCodec(record.getHeader()); + codec.setOutputStream(new DataOutputWrapper(out)); + codec.encode(record); + } - @Override public void write(DataOutput out) throws IOException { - // In theory, it shouldn't matter whether we give a header to - // BAMRecordCodec or not, since the representation of an alignment in BAM - // doesn't depend on the header data at all. Only its interpretation - // does, and a simple read/write codec shouldn't really have anything to - // say about that. (But in practice, it already does matter for decode(), - // which is why LazyBAMRecordFactory exists.) - final BAMRecordCodec codec = new BAMRecordCodec(record.getHeader()); - codec.setOutputStream(new DataOutputWrapper(out)); - codec.encode(record); - } - @Override public void readFields(DataInput in) throws IOException { - lazyCodec.setInputStream(new DataInputWrapper(in)); - record = lazyCodec.decode(); - } + @Override + public void readFields(DataInput in) throws IOException { + lazyCodec.setInputStream(new DataInputWrapper(in)); + record = lazyCodec.decode(); + } - @Override - public String toString() { - return record.getSAMString().trim(); // remove trailing newline - } + @Override + public String toString() { + return record.getSAMString().trim(); // remove trailing newline + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/SAMRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/SAMRecordWriter.java index a59c9cb..c702753 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/SAMRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/SAMRecordWriter.java @@ -22,83 +22,68 @@ package org.seqdoop.hadoop_bam; -import java.io.IOException; -import java.io.OutputStream; - import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMTextWriter; - +import java.io.IOException; +import java.io.OutputStream; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; - import org.seqdoop.hadoop_bam.util.SAMHeaderReader; -/** A base {@link RecordWriter} for SAM records. 
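A minimal sketch of the Writable round trip performed by SAMRecordWritable above, roughly what Hadoop does when a value crosses the mapper/reducer boundary (the in-memory streams are illustrative):

import htsjdk.samtools.SAMRecord;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.seqdoop.hadoop_bam.SAMRecordWritable;

class WritableRoundTrip {
  static SAMRecord roundTrip(SAMRecord rec) throws IOException {
    SAMRecordWritable out = new SAMRecordWritable();
    out.set(rec);
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    out.write(new DataOutputStream(bytes));  // BAM-encodes the record

    SAMRecordWritable in = new SAMRecordWritable();
    in.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    // Note: per the SAMRecordWritable Javadoc, the header is not carried across the round trip.
    return in.get();
  }
}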
+/**
+ * A base {@link RecordWriter} for SAM records.
  *
- * <p>Handles the output stream, writing the header if requested, and provides
- * the {@link #writeAlignment} function for subclasses.
+ *
+ * <p>
Handles the output stream, writing the header if requested, and provides the {@link + * #writeAlignment} function for subclasses. */ -public abstract class SAMRecordWriter - extends RecordWriter -{ - private SAMTextWriter writer; - private SAMFileHeader header; +public abstract class SAMRecordWriter extends RecordWriter { + + private SAMTextWriter writer; + private SAMFileHeader header; + + /** A SAMFileHeader is read from the input Path. */ + public SAMRecordWriter(Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + init( + output, SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()), writeHeader, ctx); + } + + public SAMRecordWriter( + Path output, SAMFileHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + init(output.getFileSystem(ctx.getConfiguration()).create(output), header, writeHeader); + } + + public SAMRecordWriter(OutputStream output, SAMFileHeader header, boolean writeHeader) + throws IOException { + init(output, header, writeHeader); + } - /** A SAMFileHeader is read from the input Path. */ - public SAMRecordWriter( - Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) - throws IOException - { - init( - output, - SAMHeaderReader.readSAMHeaderFrom(input, ctx.getConfiguration()), - writeHeader, ctx); - } - public SAMRecordWriter( - Path output, SAMFileHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - init( - output.getFileSystem(ctx.getConfiguration()).create(output), - header, writeHeader); - } - public SAMRecordWriter( - OutputStream output, SAMFileHeader header, boolean writeHeader) - throws IOException - { - init(output, header, writeHeader); - } + private void init(Path output, SAMFileHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + init(output.getFileSystem(ctx.getConfiguration()).create(output), header, writeHeader); + } - private void init( - Path output, SAMFileHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - init( - output.getFileSystem(ctx.getConfiguration()).create(output), - header, writeHeader); - } - private void init( - OutputStream output, SAMFileHeader header, boolean writeHeader) - throws IOException - { - this.header = header; - writer = new SAMTextWriter(output); + private void init(OutputStream output, SAMFileHeader header, boolean writeHeader) + throws IOException { + this.header = header; + writer = new SAMTextWriter(output); - writer.setSortOrder(header.getSortOrder(), false); - if (writeHeader) - writer.setHeader(header); - } + writer.setSortOrder(header.getSortOrder(), false); + if (writeHeader) { + writer.setHeader(header); + } + } - @Override public void close(TaskAttemptContext ctx) { - writer.close(); - } + @Override + public void close(TaskAttemptContext ctx) { + writer.close(); + } - protected void writeAlignment(final SAMRecord rec) { - rec.setHeader(header); - writer.writeAlignment(rec); - } + protected void writeAlignment(final SAMRecord rec) { + rec.setHeader(header); + writer.writeAlignment(rec); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/SequencedFragment.java b/src/main/java/org/seqdoop/hadoop_bam/SequencedFragment.java index dc348dc..3da0f80 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/SequencedFragment.java +++ b/src/main/java/org/seqdoop/hadoop_bam/SequencedFragment.java @@ -22,353 +22,489 @@ package org.seqdoop.hadoop_bam; +import java.io.DataInput; +import java.io.DataOutput; +import 
java.io.IOException; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableUtils; - -import java.io.IOException; -import java.io.DataInput; -import java.io.DataOutput; - import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding; -public class SequencedFragment implements Writable -{ - protected Text sequence = new Text(); - protected Text quality = new Text(); - - protected String instrument; - protected Integer runNumber; - protected String flowcellId; - protected Integer lane; - protected Integer tile; - protected Integer xpos; - protected Integer ypos; - protected Integer read; - protected Boolean filterPassed; - protected Integer controlNumber; - protected String indexSequence; - - // for serialization of nullable fiels - protected static final int Instrument_Present = 0x0001; - protected static final int RunNumber_Present = 0x0002; - protected static final int FlowcellId_Present = 0x0004; - protected static final int Lane_Present = 0x0008; - protected static final int Tile_Present = 0x0010; - protected static final int Xpos_Present = 0x0020; - protected static final int Ypos_Present = 0x0040; - protected static final int Read_Present = 0x0080; - protected static final int FilterPassed_Present = 0x0100; - protected static final int ControlNumber_Present = 0x0200; - protected static final int IndexSequence_Present = 0x0400; - - public void clear() - { - sequence.clear(); - quality.clear(); - - instrument = null; - runNumber = null; - flowcellId = null; - lane = null; - tile = null; - xpos = null; - ypos = null; - read = null; - filterPassed = null; - controlNumber = null; - indexSequence = null; - } - - /** - * Get sequence Text object. - * Trade encapsulation for efficiency. Here we expose the internal Text - * object so that data may be read and written diretly from/to it. - * - * Sequence should always be written using CAPITAL letters and 'N' for unknown bases. - */ - public Text getSequence() { return sequence; } - - /** - * Get quality Text object. - * Trade encapsulation for efficiency. Here we expose the internal Text - * object so that data may be read and written diretly from/to it. - * - * Quality should always be in ASCII-encoded Phred+33 format (sanger). - */ - public Text getQuality() { return quality; } - - public void setInstrument(String v) { instrument = v; } - public void setRunNumber(Integer v) { runNumber = v; } - public void setFlowcellId(String v) { flowcellId = v; } - public void setLane(Integer v) { lane = v; } - public void setTile(Integer v) { tile = v; } - public void setXpos(Integer v) { xpos = v; } - public void setYpos(Integer v) { ypos = v; } - public void setRead(Integer v) { read = v; } - public void setFilterPassed(Boolean v) { filterPassed = v; } - public void setControlNumber(Integer v) { controlNumber = v; } - public void setIndexSequence(String v) { indexSequence = v; } - - public void setSequence(Text seq) - { - if (seq == null) - throw new IllegalArgumentException("can't have a null sequence"); - sequence = seq; - } - - /** - * Set quality. Quality should be encoded in Sanger Phred+33 format. 
- */ - public void setQuality(Text qual) - { - if (qual == null) - throw new IllegalArgumentException("can't have a null quality"); - quality = qual; - } - - public String getInstrument() { return instrument; } - public Integer getRunNumber() { return runNumber; } - public String getFlowcellId() { return flowcellId; } - public Integer getLane() { return lane; } - public Integer getTile() { return tile; } - public Integer getXpos() { return xpos; } - public Integer getYpos() { return ypos; } - public Integer getRead() { return read; } - public Boolean getFilterPassed() { return filterPassed; } - public Integer getControlNumber() { return controlNumber; } - public String getIndexSequence() { return indexSequence; } - - /** - * Recreates a pseudo qseq record with the fields available. - */ - public String toString() - { - String delim = "\t"; - StringBuilder builder = new StringBuilder(800); - builder.append(instrument).append(delim); - builder.append(runNumber).append(delim); - builder.append(flowcellId).append(delim); - builder.append(lane).append(delim); - builder.append(tile).append(delim); - builder.append(xpos).append(delim); - builder.append(ypos).append(delim); - builder.append(indexSequence).append(delim); - builder.append(read).append(delim); - builder.append(sequence).append(delim); - builder.append(quality).append(delim); - builder.append((filterPassed == null || filterPassed) ? 1 : 0); - return builder.toString(); - } - - public boolean equals(Object other) - { - if (other != null && other instanceof SequencedFragment) - { - SequencedFragment otherFrag = (SequencedFragment)other; - - if (instrument == null && otherFrag.instrument != null || instrument != null && !instrument.equals(otherFrag.instrument)) - return false; - if (runNumber == null && otherFrag.runNumber != null || runNumber != null && !runNumber.equals(otherFrag.runNumber)) - return false; - if (flowcellId == null && otherFrag.flowcellId != null || flowcellId != null && !flowcellId.equals(otherFrag.flowcellId)) - return false; - if (lane == null && otherFrag.lane != null || lane != null && !lane.equals(otherFrag.lane)) - return false; - if (tile == null && otherFrag.tile != null || tile != null && !tile.equals(otherFrag.tile)) - return false; - if (xpos == null && otherFrag.xpos != null || xpos != null && !xpos.equals(otherFrag.xpos)) - return false; - if (ypos == null && otherFrag.ypos != null || ypos != null && !ypos.equals(otherFrag.ypos)) - return false; - if (read == null && otherFrag.read != null || read != null && !read.equals(otherFrag.read)) - return false; - if (filterPassed == null && otherFrag.filterPassed != null || filterPassed != null && !filterPassed.equals(otherFrag.filterPassed)) - return false; - if (controlNumber == null && otherFrag.controlNumber != null || controlNumber != null && !controlNumber.equals(otherFrag.controlNumber)) - return false; - if (indexSequence == null && otherFrag.indexSequence != null || indexSequence != null && !indexSequence.equals(otherFrag.indexSequence)) - return false; - // sequence and quality can't be null - if (!sequence.equals(otherFrag.sequence)) - return false; - if (!quality.equals(otherFrag.quality)) - return false; - - return true; - } - else - return false; - } - - @Override - public int hashCode() { - int result = sequence.hashCode(); - result = 31 * result + quality.hashCode(); - result = 31 * result + (instrument != null ? instrument.hashCode() : 0); - result = 31 * result + (runNumber != null ? 
runNumber.hashCode() : 0); - result = 31 * result + (flowcellId != null ? flowcellId.hashCode() : 0); - result = 31 * result + (lane != null ? lane.hashCode() : 0); - result = 31 * result + (tile != null ? tile.hashCode() : 0); - result = 31 * result + (xpos != null ? xpos.hashCode() : 0); - result = 31 * result + (ypos != null ? ypos.hashCode() : 0); - result = 31 * result + (read != null ? read.hashCode() : 0); - result = 31 * result + (filterPassed != null ? filterPassed.hashCode() : 0); - result = 31 * result + (controlNumber != null ? controlNumber.hashCode() : 0); - result = 31 * result + (indexSequence != null ? indexSequence.hashCode() : 0); - return result; - } - - /** - * Convert quality scores in-place. - * - * @throws FormatException if quality scores are out of the range - * allowed by the current encoding. - * @throws IllegalArgumentException if current and target quality encodings are the same. - */ - public static void convertQuality(Text quality, BaseQualityEncoding current, BaseQualityEncoding target) - { - if (current == target) - throw new IllegalArgumentException("current and target quality encodinds are the same (" + current + ")"); - - byte[] bytes = quality.getBytes(); - final int len = quality.getLength(); - final int illuminaSangerDistance = FormatConstants.ILLUMINA_OFFSET - FormatConstants.SANGER_OFFSET; - - if (current == BaseQualityEncoding.Illumina && target == BaseQualityEncoding.Sanger) - { - for (int i = 0; i < len; ++i) - { - if (bytes[i] < FormatConstants.ILLUMINA_OFFSET || bytes[i] > (FormatConstants.ILLUMINA_OFFSET + FormatConstants.ILLUMINA_MAX)) - { - throw new FormatException( - "base quality score out of range for Illumina Phred+64 format (found " + (bytes[i] - FormatConstants.ILLUMINA_OFFSET) + - " but acceptable range is [0," + FormatConstants.ILLUMINA_MAX + "]).\n" + - "Maybe qualities are encoded in Sanger format?\n"); - } - bytes[i] -= illuminaSangerDistance; - } - } - else if (current == BaseQualityEncoding.Sanger && target == BaseQualityEncoding.Illumina) - { - for (int i = 0; i < len; ++i) - { - if (bytes[i] < FormatConstants.SANGER_OFFSET || bytes[i] > (FormatConstants.SANGER_OFFSET + FormatConstants.SANGER_MAX)) - { - throw new FormatException( - "base quality score out of range for Sanger Phred+64 format (found " + (bytes[i] - FormatConstants.SANGER_OFFSET) + - " but acceptable range is [0," + FormatConstants.SANGER_MAX + "]).\n" + - "Maybe qualities are encoded in Illumina format?\n"); - } - bytes[i] += illuminaSangerDistance; - } - } - else - throw new IllegalArgumentException("unsupported BaseQualityEncoding transformation from " + current + " to " + target); - } - - /** - * Verify that the given quality bytes are within the range allowed for the specified encoding. - * - * In theory, the Sanger encoding uses the entire - * range of characters from ASCII 33 to 126, giving a value range of [0,93]. However, values over 60 are - * unlikely in practice, and are more likely to be caused by mistaking a file that uses Illumina encoding - * for Sanger. So, we'll enforce the same range supported by Illumina encoding ([0,62]) for Sanger. - * - * @return -1 if quality is ok. - * @return If an out-of-range value is found the index of the value is returned. 
- */ - public static int verifyQuality(Text quality, BaseQualityEncoding encoding) - { - // set allowed quality range - int max, min; - - if (encoding == BaseQualityEncoding.Illumina) - { - max = FormatConstants.ILLUMINA_OFFSET + FormatConstants.ILLUMINA_MAX; - min = FormatConstants.ILLUMINA_OFFSET; - } - else if (encoding == BaseQualityEncoding.Sanger) - { - max = FormatConstants.SANGER_OFFSET + FormatConstants.SANGER_MAX; - min = FormatConstants.SANGER_OFFSET; - } - else - throw new IllegalArgumentException("Unsupported base encoding quality " + encoding); - - // verify - final byte[] bytes = quality.getBytes(); - final int len = quality.getLength(); - - for (int i = 0; i < len; ++i) - { - if (bytes[i] < min || bytes[i] > max) - return i; - } - return -1; - } - - public void readFields(DataInput in) throws IOException - { - // TODO: reimplement with a serialization system (e.g. Avro) - - // serialization order: - // 1) sequence - // 2) quality - // 3) int with flags indicating which fields are defined (see *_Present flags) - // 4..end) the rest of the fields - - this.clear(); - - sequence.readFields(in); - quality.readFields(in); - - int presentFlags = WritableUtils.readVInt(in); - if ( (presentFlags & Instrument_Present) != 0) instrument = WritableUtils.readString(in); - if ( (presentFlags & RunNumber_Present) != 0) runNumber = WritableUtils.readVInt(in); - if ( (presentFlags & FlowcellId_Present) != 0) flowcellId = WritableUtils.readString(in); - if ( (presentFlags & Lane_Present) != 0) lane = WritableUtils.readVInt(in); - if ( (presentFlags & Tile_Present) != 0) tile = WritableUtils.readVInt(in); - if ( (presentFlags & Xpos_Present) != 0) xpos = WritableUtils.readVInt(in); - if ( (presentFlags & Ypos_Present) != 0) ypos = WritableUtils.readVInt(in); - if ( (presentFlags & Read_Present) != 0) read = WritableUtils.readVInt(in); - if ( (presentFlags & FilterPassed_Present) != 0) filterPassed = WritableUtils.readVInt(in) == 1; - if ( (presentFlags & ControlNumber_Present) != 0) controlNumber = WritableUtils.readVInt(in); - if ( (presentFlags & IndexSequence_Present) != 0) indexSequence = WritableUtils.readString(in); - } - - public void write(DataOutput out) throws IOException - { - // TODO: reimplement with a serialization system (e.g. 
Avro) - - sequence.write(out); - quality.write(out); - - int presentFlags = 0; - if (instrument != null) presentFlags |= Instrument_Present; - if (runNumber != null) presentFlags |= RunNumber_Present; - if (flowcellId != null) presentFlags |= FlowcellId_Present; - if (lane != null) presentFlags |= Lane_Present; - if (tile != null) presentFlags |= Tile_Present; - if (xpos != null) presentFlags |= Xpos_Present; - if (ypos != null) presentFlags |= Ypos_Present; - if (read != null) presentFlags |= Read_Present; - if (filterPassed != null) presentFlags |= FilterPassed_Present; - if (controlNumber != null) presentFlags |= ControlNumber_Present; - if (indexSequence != null) presentFlags |= IndexSequence_Present; - - WritableUtils.writeVInt(out, presentFlags); - - if (instrument != null) WritableUtils.writeString(out, instrument); - if (runNumber != null) WritableUtils.writeVInt(out, runNumber); - if (flowcellId != null) WritableUtils.writeString(out, flowcellId); - if (lane != null) WritableUtils.writeVInt(out, lane); - if (tile != null) WritableUtils.writeVInt(out, tile); - if (xpos != null) WritableUtils.writeVInt(out, xpos); - if (ypos != null) WritableUtils.writeVInt(out, ypos); - if (read != null) WritableUtils.writeVInt(out, read); - if (filterPassed != null) WritableUtils.writeVInt(out, filterPassed ? 1 : 0); - if (controlNumber != null) WritableUtils.writeVInt(out, controlNumber); - if (indexSequence != null) WritableUtils.writeString(out, indexSequence); - } +public class SequencedFragment implements Writable { + + // for serialization of nullable fiels + protected static final int Instrument_Present = 0x0001; + protected static final int RunNumber_Present = 0x0002; + protected static final int FlowcellId_Present = 0x0004; + protected static final int Lane_Present = 0x0008; + protected static final int Tile_Present = 0x0010; + protected static final int Xpos_Present = 0x0020; + protected static final int Ypos_Present = 0x0040; + protected static final int Read_Present = 0x0080; + protected static final int FilterPassed_Present = 0x0100; + protected static final int ControlNumber_Present = 0x0200; + protected static final int IndexSequence_Present = 0x0400; + protected Text sequence = new Text(); + protected Text quality = new Text(); + protected String instrument; + protected Integer runNumber; + protected String flowcellId; + protected Integer lane; + protected Integer tile; + protected Integer xpos; + protected Integer ypos; + protected Integer read; + protected Boolean filterPassed; + protected Integer controlNumber; + protected String indexSequence; + + /** + * Convert quality scores in-place. + * + * @throws FormatException if quality scores are out of the range allowed by the current encoding. + * @throws IllegalArgumentException if current and target quality encodings are the same. 
+ */ + public static void convertQuality( + Text quality, BaseQualityEncoding current, BaseQualityEncoding target) { + if (current == target) { + throw new IllegalArgumentException( + "current and target quality encodinds are the same (" + current + ")"); + } + + byte[] bytes = quality.getBytes(); + final int len = quality.getLength(); + final int illuminaSangerDistance = + FormatConstants.ILLUMINA_OFFSET - FormatConstants.SANGER_OFFSET; + + if (current == BaseQualityEncoding.Illumina && target == BaseQualityEncoding.Sanger) { + for (int i = 0; i < len; ++i) { + if (bytes[i] < FormatConstants.ILLUMINA_OFFSET + || bytes[i] > (FormatConstants.ILLUMINA_OFFSET + FormatConstants.ILLUMINA_MAX)) { + throw new FormatException( + "base quality score out of range for Illumina Phred+64 format (found " + + (bytes[i] - FormatConstants.ILLUMINA_OFFSET) + + " but acceptable range is [0," + + FormatConstants.ILLUMINA_MAX + + "]).\n" + + "Maybe qualities are encoded in Sanger format?\n"); + } + bytes[i] -= illuminaSangerDistance; + } + } else if (current == BaseQualityEncoding.Sanger && target == BaseQualityEncoding.Illumina) { + for (int i = 0; i < len; ++i) { + if (bytes[i] < FormatConstants.SANGER_OFFSET + || bytes[i] > (FormatConstants.SANGER_OFFSET + FormatConstants.SANGER_MAX)) { + throw new FormatException( + "base quality score out of range for Sanger Phred+64 format (found " + + (bytes[i] - FormatConstants.SANGER_OFFSET) + + " but acceptable range is [0," + + FormatConstants.SANGER_MAX + + "]).\n" + + "Maybe qualities are encoded in Illumina format?\n"); + } + bytes[i] += illuminaSangerDistance; + } + } else { + throw new IllegalArgumentException( + "unsupported BaseQualityEncoding transformation from " + current + " to " + target); + } + } + + /** + * Verify that the given quality bytes are within the range allowed for the specified encoding. + * + *
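A small, self-contained illustration of these two quality helpers (the quality string is made up; the shift follows the Phred+64 and Phred+33 offsets defined in FormatConstants):

import org.apache.hadoop.io.Text;
import org.seqdoop.hadoop_bam.FormatConstants.BaseQualityEncoding;
import org.seqdoop.hadoop_bam.SequencedFragment;

class QualityDemo {
  public static void main(String[] args) {
    // 'h' (ASCII 104) is Q40 in Illumina Phred+64; converting in place shifts each byte down
    // by 31, giving 'I' (ASCII 73), which is Q40 in Sanger Phred+33.
    Text qual = new Text("hhhh");
    SequencedFragment.convertQuality(qual, BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger);
    System.out.println(qual);  // IIII

    // verifyQuality returns -1 when every score is in range, otherwise the offending index.
    System.out.println(SequencedFragment.verifyQuality(qual, BaseQualityEncoding.Sanger));  // -1
  }
}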

In theory, the Sanger encoding uses the entire range of characters from ASCII 33 to 126, + * giving a value range of [0,93]. However, values over 60 are unlikely in practice, and are more + * likely to be caused by mistaking a file that uses Illumina encoding for Sanger. So, we'll + * enforce the same range supported by Illumina encoding ([0,62]) for Sanger. + * + * @return If an out-of-range value is found the index of the value is returned. + */ + public static int verifyQuality(Text quality, BaseQualityEncoding encoding) { + // set allowed quality range + int max, min; + + if (encoding == BaseQualityEncoding.Illumina) { + max = FormatConstants.ILLUMINA_OFFSET + FormatConstants.ILLUMINA_MAX; + min = FormatConstants.ILLUMINA_OFFSET; + } else if (encoding == BaseQualityEncoding.Sanger) { + max = FormatConstants.SANGER_OFFSET + FormatConstants.SANGER_MAX; + min = FormatConstants.SANGER_OFFSET; + } else { + throw new IllegalArgumentException("Unsupported base encoding quality " + encoding); + } + + // verify + final byte[] bytes = quality.getBytes(); + final int len = quality.getLength(); + + for (int i = 0; i < len; ++i) { + if (bytes[i] < min || bytes[i] > max) { + return i; + } + } + return -1; + } + + public void clear() { + sequence.clear(); + quality.clear(); + + instrument = null; + runNumber = null; + flowcellId = null; + lane = null; + tile = null; + xpos = null; + ypos = null; + read = null; + filterPassed = null; + controlNumber = null; + indexSequence = null; + } + + /** + * Get sequence Text object. Trade encapsulation for efficiency. Here we expose the internal Text + * object so that data may be read and written diretly from/to it. + * + *

Sequence should always be written using CAPITAL letters and 'N' for unknown bases. + */ + public Text getSequence() { + return sequence; + } + + public void setSequence(Text seq) { + if (seq == null) { + throw new IllegalArgumentException("can't have a null sequence"); + } + sequence = seq; + } + + /** + * Get quality Text object. Trade encapsulation for efficiency. Here we expose the internal Text + * object so that data may be read and written diretly from/to it. + * + *

Quality should always be in ASCII-encoded Phred+33 format (sanger). + */ + public Text getQuality() { + return quality; + } + + /** Set quality. Quality should be encoded in Sanger Phred+33 format. */ + public void setQuality(Text qual) { + if (qual == null) { + throw new IllegalArgumentException("can't have a null quality"); + } + quality = qual; + } + + public String getInstrument() { + return instrument; + } + + public void setInstrument(String v) { + instrument = v; + } + + public Integer getRunNumber() { + return runNumber; + } + + public void setRunNumber(Integer v) { + runNumber = v; + } + + public String getFlowcellId() { + return flowcellId; + } + + public void setFlowcellId(String v) { + flowcellId = v; + } + + public Integer getLane() { + return lane; + } + + public void setLane(Integer v) { + lane = v; + } + + public Integer getTile() { + return tile; + } + + public void setTile(Integer v) { + tile = v; + } + + public Integer getXpos() { + return xpos; + } + + public void setXpos(Integer v) { + xpos = v; + } + + public Integer getYpos() { + return ypos; + } + + public void setYpos(Integer v) { + ypos = v; + } + + public Integer getRead() { + return read; + } + + public void setRead(Integer v) { + read = v; + } + + public Boolean getFilterPassed() { + return filterPassed; + } + + public void setFilterPassed(Boolean v) { + filterPassed = v; + } + + public Integer getControlNumber() { + return controlNumber; + } + + public void setControlNumber(Integer v) { + controlNumber = v; + } + + public String getIndexSequence() { + return indexSequence; + } + + public void setIndexSequence(String v) { + indexSequence = v; + } + + /** Recreates a pseudo qseq record with the fields available. */ + public String toString() { + String delim = "\t"; + StringBuilder builder = new StringBuilder(800); + builder.append(instrument).append(delim); + builder.append(runNumber).append(delim); + builder.append(flowcellId).append(delim); + builder.append(lane).append(delim); + builder.append(tile).append(delim); + builder.append(xpos).append(delim); + builder.append(ypos).append(delim); + builder.append(indexSequence).append(delim); + builder.append(read).append(delim); + builder.append(sequence).append(delim); + builder.append(quality).append(delim); + builder.append((filterPassed == null || filterPassed) ? 
1 : 0); + return builder.toString(); + } + + public boolean equals(Object other) { + if (other != null && other instanceof SequencedFragment) { + SequencedFragment otherFrag = (SequencedFragment) other; + + if (instrument == null && otherFrag.instrument != null + || instrument != null && !instrument.equals(otherFrag.instrument)) { + return false; + } + if (runNumber == null && otherFrag.runNumber != null + || runNumber != null && !runNumber.equals(otherFrag.runNumber)) { + return false; + } + if (flowcellId == null && otherFrag.flowcellId != null + || flowcellId != null && !flowcellId.equals(otherFrag.flowcellId)) { + return false; + } + if (lane == null && otherFrag.lane != null || lane != null && !lane.equals(otherFrag.lane)) { + return false; + } + if (tile == null && otherFrag.tile != null || tile != null && !tile.equals(otherFrag.tile)) { + return false; + } + if (xpos == null && otherFrag.xpos != null || xpos != null && !xpos.equals(otherFrag.xpos)) { + return false; + } + if (ypos == null && otherFrag.ypos != null || ypos != null && !ypos.equals(otherFrag.ypos)) { + return false; + } + if (read == null && otherFrag.read != null || read != null && !read.equals(otherFrag.read)) { + return false; + } + if (filterPassed == null && otherFrag.filterPassed != null + || filterPassed != null && !filterPassed.equals(otherFrag.filterPassed)) { + return false; + } + if (controlNumber == null && otherFrag.controlNumber != null + || controlNumber != null && !controlNumber.equals(otherFrag.controlNumber)) { + return false; + } + if (indexSequence == null && otherFrag.indexSequence != null + || indexSequence != null && !indexSequence.equals(otherFrag.indexSequence)) { + return false; + } + // sequence and quality can't be null + if (!sequence.equals(otherFrag.sequence)) { + return false; + } + if (!quality.equals(otherFrag.quality)) { + return false; + } + + return true; + } else { + return false; + } + } + + @Override + public int hashCode() { + int result = sequence.hashCode(); + result = 31 * result + quality.hashCode(); + result = 31 * result + (instrument != null ? instrument.hashCode() : 0); + result = 31 * result + (runNumber != null ? runNumber.hashCode() : 0); + result = 31 * result + (flowcellId != null ? flowcellId.hashCode() : 0); + result = 31 * result + (lane != null ? lane.hashCode() : 0); + result = 31 * result + (tile != null ? tile.hashCode() : 0); + result = 31 * result + (xpos != null ? xpos.hashCode() : 0); + result = 31 * result + (ypos != null ? ypos.hashCode() : 0); + result = 31 * result + (read != null ? read.hashCode() : 0); + result = 31 * result + (filterPassed != null ? filterPassed.hashCode() : 0); + result = 31 * result + (controlNumber != null ? controlNumber.hashCode() : 0); + result = 31 * result + (indexSequence != null ? indexSequence.hashCode() : 0); + return result; + } + + public void readFields(DataInput in) throws IOException { + // TODO: reimplement with a serialization system (e.g. 
Avro) + + // serialization order: + // 1) sequence + // 2) quality + // 3) int with flags indicating which fields are defined (see *_Present flags) + // 4..end) the rest of the fields + + this.clear(); + + sequence.readFields(in); + quality.readFields(in); + + int presentFlags = WritableUtils.readVInt(in); + if ((presentFlags & Instrument_Present) != 0) { + instrument = WritableUtils.readString(in); + } + if ((presentFlags & RunNumber_Present) != 0) { + runNumber = WritableUtils.readVInt(in); + } + if ((presentFlags & FlowcellId_Present) != 0) { + flowcellId = WritableUtils.readString(in); + } + if ((presentFlags & Lane_Present) != 0) { + lane = WritableUtils.readVInt(in); + } + if ((presentFlags & Tile_Present) != 0) { + tile = WritableUtils.readVInt(in); + } + if ((presentFlags & Xpos_Present) != 0) { + xpos = WritableUtils.readVInt(in); + } + if ((presentFlags & Ypos_Present) != 0) { + ypos = WritableUtils.readVInt(in); + } + if ((presentFlags & Read_Present) != 0) { + read = WritableUtils.readVInt(in); + } + if ((presentFlags & FilterPassed_Present) != 0) { + filterPassed = WritableUtils.readVInt(in) == 1; + } + if ((presentFlags & ControlNumber_Present) != 0) { + controlNumber = WritableUtils.readVInt(in); + } + if ((presentFlags & IndexSequence_Present) != 0) { + indexSequence = WritableUtils.readString(in); + } + } + + public void write(DataOutput out) throws IOException { + // TODO: reimplement with a serialization system (e.g. Avro) + + sequence.write(out); + quality.write(out); + + int presentFlags = 0; + if (instrument != null) { + presentFlags |= Instrument_Present; + } + if (runNumber != null) { + presentFlags |= RunNumber_Present; + } + if (flowcellId != null) { + presentFlags |= FlowcellId_Present; + } + if (lane != null) { + presentFlags |= Lane_Present; + } + if (tile != null) { + presentFlags |= Tile_Present; + } + if (xpos != null) { + presentFlags |= Xpos_Present; + } + if (ypos != null) { + presentFlags |= Ypos_Present; + } + if (read != null) { + presentFlags |= Read_Present; + } + if (filterPassed != null) { + presentFlags |= FilterPassed_Present; + } + if (controlNumber != null) { + presentFlags |= ControlNumber_Present; + } + if (indexSequence != null) { + presentFlags |= IndexSequence_Present; + } + + WritableUtils.writeVInt(out, presentFlags); + + if (instrument != null) { + WritableUtils.writeString(out, instrument); + } + if (runNumber != null) { + WritableUtils.writeVInt(out, runNumber); + } + if (flowcellId != null) { + WritableUtils.writeString(out, flowcellId); + } + if (lane != null) { + WritableUtils.writeVInt(out, lane); + } + if (tile != null) { + WritableUtils.writeVInt(out, tile); + } + if (xpos != null) { + WritableUtils.writeVInt(out, xpos); + } + if (ypos != null) { + WritableUtils.writeVInt(out, ypos); + } + if (read != null) { + WritableUtils.writeVInt(out, read); + } + if (filterPassed != null) { + WritableUtils.writeVInt(out, filterPassed ? 
1 : 0); + } + if (controlNumber != null) { + WritableUtils.writeVInt(out, controlNumber); + } + if (indexSequence != null) { + WritableUtils.writeString(out, indexSequence); + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndex.java b/src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndex.java index af72270..ffa91af 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndex.java +++ b/src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndex.java @@ -25,131 +25,145 @@ import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.NavigableSet; import java.util.TreeSet; -/** An index into BAM files, for {@link BAMInputFormat}. Reads files that are - * created by {@link SplittingBAMIndexer}. +/** + * An index into BAM files, for {@link BAMInputFormat}. Reads files that are created by {@link + * SplittingBAMIndexer}. * - *
<p>
Indexes the positions of individual BAM records in the file.
</p>
+ *
<p>
Indexes the positions of individual BAM records in the file. */ public final class SplittingBAMIndex { - private final NavigableSet virtualOffsets = new TreeSet(); - - public SplittingBAMIndex() {} - public SplittingBAMIndex(final File path) throws IOException { - this(new BufferedInputStream(new FileInputStream(path))); - } - public SplittingBAMIndex(final InputStream in) throws IOException { - readIndex(in); - } - - public void readIndex(final InputStream in) throws IOException { - virtualOffsets.clear(); - - final ByteBuffer bb = ByteBuffer.allocate(8); - - for (long prev = -1; in.read(bb.array()) == 8;) { - final long cur = bb.getLong(0); - if (prev > cur) - throw new IOException(String.format( - "Invalid splitting BAM index; offsets not in order: %#x > %#x", - prev, cur)); - - virtualOffsets.add(prev = cur); - } - in.close(); - - if (virtualOffsets.size() < 1) - throw new IOException( - "Invalid splitting BAM index: "+ - "should contain at least the file size"); - } - - public List getVirtualOffsets() { - return new ArrayList<>(virtualOffsets); - } - - public Long prevAlignment(final long filePos) { - return virtualOffsets.floor(filePos << 16); - } - public Long nextAlignment(final long filePos) { - return virtualOffsets.higher(filePos << 16); - } - - public int size() { return virtualOffsets.size(); } - - private long first() { return virtualOffsets.first(); } - private long last() { return prevAlignment(bamSize() - 1); } - long bamSize() { return virtualOffsets.last() >>> 16; } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - SplittingBAMIndex that = (SplittingBAMIndex) o; - - return virtualOffsets != null ? virtualOffsets.equals(that.virtualOffsets) : that - .virtualOffsets == null; - - } - - @Override - public int hashCode() { - return virtualOffsets != null ? virtualOffsets.hashCode() : 0; - } - - @Override - public String toString() { - return virtualOffsets.toString(); - } - - /** Writes some statistics about each splitting BAM index file given as an - * argument. 
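As an illustrative aside (not part of the patch): the SplittingBAMIndex API shown above can be queried roughly as follows. The index file name and the byte offset are made-up values; the virtual-offset decoding mirrors the >>> 16 / & 0xffff convention used by the class's own main() method.

import java.io.File;
import java.io.IOException;
import org.seqdoop.hadoop_bam.SplittingBAMIndex;

public class SplittingBAMIndexQuerySketch {
  public static void main(String[] args) throws IOException {
    // Load an index previously written by SplittingBAMIndexer (hypothetical file name).
    SplittingBAMIndex index = new SplittingBAMIndex(new File("sample.bam.splitting-bai"));

    // Find the first indexed record at or after an arbitrary byte offset in the compressed BAM.
    long compressedFilePos = 64 * 1024 * 1024;
    Long virtualOffset = index.nextAlignment(compressedFilePos);
    if (virtualOffset == null) {
      System.out.println("no indexed record at or after that position");
    } else {
      // A BGZF virtual offset packs the compressed block address in the upper 48 bits
      // and the offset within the uncompressed block in the lower 16 bits.
      long blockAddress = virtualOffset >>> 16;
      int withinBlock = (int) (virtualOffset & 0xffff);
      System.out.printf(
          "next record: block %#x, offset %d, %d offsets in index%n",
          blockAddress, withinBlock, index.size());
    }
  }
}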
- */ - public static void main(String[] args) { - if (args.length == 0) { - System.out.println( - "Usage: SplittingBAMIndex [splitting BAM indices...]\n\n"+ - - "Writes a few statistics about each splitting BAM index."); - return; - } - - for (String arg : args) { - final File f = new File(arg); - if (f.isFile() && f.canRead()) { - try { - System.err.printf("%s:\n", f); - final SplittingBAMIndex bi = new SplittingBAMIndex(f); - if (bi.size() == 1) { - System.err.printf("\t0 alignments\n" + - "\tassociated BAM file size %d\n", bi.bamSize()); - } else { - final long first = bi.first(); - final long last = bi.last(); - System.err.printf( - "\t%d alignments\n" + - "\tfirst is at %#06x in BGZF block at %#014x\n" + - "\tlast is at %#06x in BGZF block at %#014x\n" + - "\tassociated BAM file size %d\n", - bi.size(), - first & 0xffff, first >>> 16, - last & 0xffff, last >>> 16, - bi.bamSize()); - } - } catch (IOException e) { - System.err.printf("Failed to read %s!\n", f); - e.printStackTrace(); - } - } else - System.err.printf("%s does not look like a readable file!\n", f); - } - } + + private final NavigableSet virtualOffsets = new TreeSet(); + + public SplittingBAMIndex() {} + + public SplittingBAMIndex(final File path) throws IOException { + this(new BufferedInputStream(new FileInputStream(path))); + } + + public SplittingBAMIndex(final InputStream in) throws IOException { + readIndex(in); + } + + /** Writes some statistics about each splitting BAM index file given as an argument. */ + public static void main(String[] args) { + if (args.length == 0) { + System.out.println( + "Usage: SplittingBAMIndex [splitting BAM indices...]\n\n" + + "Writes a few statistics about each splitting BAM index."); + return; + } + + for (String arg : args) { + final File f = new File(arg); + if (f.isFile() && f.canRead()) { + try { + System.err.printf("%s:\n", f); + final SplittingBAMIndex bi = new SplittingBAMIndex(f); + if (bi.size() == 1) { + System.err.printf("\t0 alignments\n" + "\tassociated BAM file size %d\n", bi.bamSize()); + } else { + final long first = bi.first(); + final long last = bi.last(); + System.err.printf( + "\t%d alignments\n" + + "\tfirst is at %#06x in BGZF block at %#014x\n" + + "\tlast is at %#06x in BGZF block at %#014x\n" + + "\tassociated BAM file size %d\n", + bi.size(), first & 0xffff, first >>> 16, last & 0xffff, last >>> 16, bi.bamSize()); + } + } catch (IOException e) { + System.err.printf("Failed to read %s!\n", f); + e.printStackTrace(); + } + } else { + System.err.printf("%s does not look like a readable file!\n", f); + } + } + } + + public void readIndex(final InputStream in) throws IOException { + virtualOffsets.clear(); + + final ByteBuffer bb = ByteBuffer.allocate(8); + + for (long prev = -1; in.read(bb.array()) == 8; ) { + final long cur = bb.getLong(0); + if (prev > cur) { + throw new IOException( + String.format( + "Invalid splitting BAM index; offsets not in order: %#x > %#x", prev, cur)); + } + + virtualOffsets.add(prev = cur); + } + in.close(); + + if (virtualOffsets.size() < 1) { + throw new IOException( + "Invalid splitting BAM index: " + "should contain at least the file size"); + } + } + + public List getVirtualOffsets() { + return new ArrayList<>(virtualOffsets); + } + + public Long prevAlignment(final long filePos) { + return virtualOffsets.floor(filePos << 16); + } + + public Long nextAlignment(final long filePos) { + return virtualOffsets.higher(filePos << 16); + } + + public int size() { + return virtualOffsets.size(); + } + + private long first() { + return 
virtualOffsets.first(); + } + + private long last() { + return prevAlignment(bamSize() - 1); + } + + long bamSize() { + return virtualOffsets.last() >>> 16; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + SplittingBAMIndex that = (SplittingBAMIndex) o; + + return virtualOffsets != null + ? virtualOffsets.equals(that.virtualOffsets) + : that.virtualOffsets == null; + } + + @Override + public int hashCode() { + return virtualOffsets != null ? virtualOffsets.hashCode() : 0; + } + + @Override + public String toString() { + return virtualOffsets.toString(); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndexer.java b/src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndexer.java index 59b27c3..939469e 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndexer.java +++ b/src/main/java/org/seqdoop/hadoop_bam/SplittingBAMIndexer.java @@ -25,12 +25,13 @@ import htsjdk.samtools.SAMFileSource; import htsjdk.samtools.SAMFileSpan; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.BlockCompressedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; @@ -38,340 +39,335 @@ import java.nio.ByteOrder; import java.nio.LongBuffer; import java.util.Arrays; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import htsjdk.samtools.util.BlockCompressedInputStream; - /** * An indexing tool and API for BAM files, making them palatable to {@link - * org.seqdoop.hadoop_bam.BAMInputFormat}. Writes splitting BAM indices as - * understood by {@link org.seqdoop.hadoop_bam.SplittingBAMIndex}. + * org.seqdoop.hadoop_bam.BAMInputFormat}. Writes splitting BAM indices as understood by {@link + * org.seqdoop.hadoop_bam.SplittingBAMIndex}. * - * There are two ways of using this class: - * 1) Building a splitting BAM index from an existing BAM file - * 2) Building a splitting BAM index while building the BAM file + *
<p>
There are two ways of using this class: 1) Building a splitting BAM index from an existing BAM + * file 2) Building a splitting BAM index while building the BAM file * - * For 1), use the static {@link #index(InputStream, OutputStream, long, int)} method, - * which takes the input BAM and output stream to write the index to. + *
<p>
For 1), use the static {@link #index(InputStream, OutputStream, long, int)} method, which + * takes the input BAM and output stream to write the index to. * - * For 2), use one of the constructors that takes an output stream, then pass {@link - * SAMRecord} objects via the {@link #processAlignment} method, and then call {@link - * #finish(long)} to complete writing the index. + *
<p>
For 2), use one of the constructors that takes an output stream, then pass {@link SAMRecord} + * objects via the {@link #processAlignment} method, and then call {@link #finish(long)} to complete + * writing the index. */ public final class SplittingBAMIndexer { - public static final String OUTPUT_FILE_EXTENSION = ".splitting-bai"; - - // Default to a granularity level of 4096. This is generally sufficient - // for very large BAM files, relative to a maximum heap size in the - // gigabyte range. - public static final int DEFAULT_GRANULARITY = 4096; - - public static void main(String[] args) { - if (args.length <= 1) { - System.out.println( - "Usage: SplittingBAMIndexer GRANULARITY [BAM files...]\n\n"+ - - "Writes, for each GRANULARITY alignments in a BAM file, its "+ - "virtual file offset\nas a big-endian 64-bit integer into "+ - "[filename].splitting-bai. The file is\nterminated by the BAM "+ - "file's length, in the same format."); - return; - } - - int granularity; - try { - granularity = Integer.parseInt(args[0]); - } catch (NumberFormatException e) { - granularity = 0; - } - if (granularity <= 0) { - System.err.printf( - "Granularity must be a positive integer, not '%s'!\n", args[0]); - return; - } - - for (final String arg : Arrays.asList(args).subList(1, args.length)) { - final File f = new File(arg); - System.out.printf("Indexing %s...", f); - try { - SplittingBAMIndexer.index( - new FileInputStream(f), - new BufferedOutputStream(new FileOutputStream(f + OUTPUT_FILE_EXTENSION)), - f.length(), granularity); - System.out.println(" done."); - } catch (IOException e) { - System.out.println(" FAILED!"); - e.printStackTrace(); - } - } - } - - /** - * Invoke a new SplittingBAMIndexer object, operating on the supplied {@link - * org.apache.hadoop.conf.Configuration} object instead of a supplied - * argument list - * - * @throws java.lang.IllegalArgumentException if the "input" property is not - * in the Configuration - */ - public static void run(final Configuration conf) throws IOException { - final String inputString = conf.get("input"); - if (inputString == null) - throw new IllegalArgumentException( - "String property \"input\" path not found in given Configuration"); - - final FileSystem fs = FileSystem.get(conf); - - final Path input = new Path(inputString); - - SplittingBAMIndexer.index( - fs.open(input), - fs.create(input.suffix(OUTPUT_FILE_EXTENSION)), - fs.getFileStatus(input).getLen(), - conf.getInt("granularity", DEFAULT_GRANULARITY)); - } - - private final OutputStream out; - private final ByteBuffer byteBuffer = ByteBuffer.allocate(8); - private final int granularity; - private final LongBuffer lb; - private long count; - private Method getFirstOffset; - - private static final int PRINT_EVERY = 500*1024*1024; - - /** - * Prepare to index a BAM file. - * @param out the stream to write the index to - */ - public SplittingBAMIndexer(final OutputStream out) { - this(out, SplittingBAMIndexer.DEFAULT_GRANULARITY); - } - - /** - * Prepare to index a BAM file. - * @param out the stream to write the index to - * @param granularity write the offset of every n-th alignment to the index - */ - public SplittingBAMIndexer(final OutputStream out, final int granularity) { - this.out = out; - this.lb = byteBuffer.order(ByteOrder.BIG_ENDIAN).asLongBuffer(); - this.granularity = granularity; - } - - /** - * Process the given record for the index. 
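As a sketch only (not part of the patch), the two usage modes described in the class javadoc above look roughly like this. The file names are placeholders; mode 2 assumes each SAMRecord carries file-source information (rec.getFileSource() non-null), which processAlignment relies on, and a List stands in for whatever loop produces the records.

import htsjdk.samtools.SAMRecord;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import org.seqdoop.hadoop_bam.SplittingBAMIndexer;

public class SplittingBAMIndexerSketch {

  /** Mode 1: index an existing BAM file in one call. */
  static void indexExistingBam(File bam) throws IOException {
    OutputStream out =
        new BufferedOutputStream(
            new FileOutputStream(bam + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION));
    // index() reads the whole BAM, writes one offset per DEFAULT_GRANULARITY records,
    // appends the file length, and closes both streams itself.
    SplittingBAMIndexer.index(
        new FileInputStream(bam), out, bam.length(), SplittingBAMIndexer.DEFAULT_GRANULARITY);
  }

  /** Mode 2: build the index while the BAM records are being processed elsewhere. */
  static void indexWhileProcessing(List<SAMRecord> records, OutputStream indexOut, long bamSize)
      throws IOException {
    SplittingBAMIndexer indexer = new SplittingBAMIndexer(indexOut);
    for (SAMRecord rec : records) {
      indexer.processAlignment(rec); // assumes rec.getFileSource() is populated
    }
    indexer.finish(bamSize); // bamSize: final size of the BAM file in bytes
  }
}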
- * @param rec the record from the file being indexed - * @throws IOException - */ - public void processAlignment(final SAMRecord rec) throws IOException { - // write an offset for the first record and for the g-th record thereafter (where - // g is the granularity), to be consistent with the index method - if (count == 0 || (count + 1) % granularity == 0) { - SAMFileSource fileSource = rec.getFileSource(); - SAMFileSpan filePointer = fileSource.getFilePointer(); - writeVirtualOffset(getPos(filePointer)); - } - count++; - } - - void processAlignment(final long virtualOffset) throws IOException { - if (count == 0 || (count + 1) % granularity == 0) { - writeVirtualOffset(virtualOffset); - } - count++; - } - - private long getPos(SAMFileSpan filePointer) { - // Use reflection since BAMFileSpan is package private in htsjdk 1.141. Note that - // Hadoop-BAM cannot use a later version of htsjdk since it requires Java 8. - if (getFirstOffset == null) { - try { - getFirstOffset = filePointer.getClass().getDeclaredMethod("getFirstOffset"); - getFirstOffset.setAccessible(true); - } catch (NoSuchMethodException e) { - throw new IllegalStateException(e); - } - } - try { - return (Long) getFirstOffset.invoke(filePointer); - } catch (IllegalAccessException e) { - throw new IllegalStateException(e); - } catch (InvocationTargetException e) { - throw new IllegalStateException(e); - } - } - - /** - * Write the given virtual offset to the index. This method is for internal use only. - * @param virtualOffset virtual file pointer - * @throws IOException - */ - public void writeVirtualOffset(long virtualOffset) throws IOException { - lb.put(0, virtualOffset); - out.write(byteBuffer.array()); - } - - /** - * Complete the index by writing the input BAM file size to the index, and closing - * the output stream. - * @param inputSize the size of the input BAM file - * @throws IOException - */ - public void finish(long inputSize) throws IOException { - writeVirtualOffset(inputSize << 16); - out.close(); - } - - /** - * Perform indexing on the given BAM file, at the granularity level specified. 
- */ - public static void index( - final InputStream rawIn, final OutputStream out, final long inputSize, - final int granularity) - throws IOException - { - final BlockCompressedInputStream in = - new BlockCompressedInputStream(rawIn); - - final ByteBuffer byteBuffer = ByteBuffer.allocate(8); // Enough to fit a long - final LongBuffer lb = - byteBuffer.order(ByteOrder.BIG_ENDIAN).asLongBuffer(); - - skipToAlignmentList(byteBuffer, in); - - // Always write the first one to make sure it's not skipped - lb.put(0, in.getFilePointer()); - out.write(byteBuffer.array()); - - long prevPrint = in.getFilePointer() >> 16; - - for (int i = 0;;) { - final PtrSkipPair pair = readAlignment(byteBuffer, in); - if (pair == null) - break; - - if (++i == granularity) { - i = 0; - lb.put(0, pair.ptr); - out.write(byteBuffer.array()); - - final long filePos = pair.ptr >> 16; - if (filePos - prevPrint >= PRINT_EVERY) { - System.out.print("-"); - prevPrint = filePos; - } - } - fullySkip(in, pair.skip); - } - lb.put(0, inputSize << 16); - out.write(byteBuffer.array()); - out.close(); - in.close(); - } - - private static void skipToAlignmentList(final ByteBuffer byteBuffer, final InputStream in) - throws IOException { - // Check magic number - if (!readExactlyBytes(byteBuffer, in, 4)) - ioError("Invalid BAM header: too short, no magic"); - - final int magic = byteBuffer.order(ByteOrder.BIG_ENDIAN).getInt(0); - if (magic != 0x42414d01) - ioError("Invalid BAM header: bad magic %#x != 0x42414d01", magic); - - // Skip the SAM header - if (!readExactlyBytes(byteBuffer, in, 4)) - ioError("Invalid BAM header: too short, no SAM header length"); - - byteBuffer.order(ByteOrder.LITTLE_ENDIAN); - - final int samLen = byteBuffer.getInt(0); - if (samLen < 0) - ioError("Invalid BAM header: negative SAM header length %d", samLen); - - fullySkip(in, samLen); - - // Get the number of reference sequences - if (!readExactlyBytes(byteBuffer, in, 4)) - ioError("Invalid BAM header: too short, no reference sequence count"); - - final int referenceSeqs = byteBuffer.getInt(0); - - // Skip over each reference sequence datum individually - for (int s = 0; s < referenceSeqs; ++s) { - if (!readExactlyBytes(byteBuffer, in, 4)) - ioError("Invalid reference list: EOF before reference %d", s+1); - - // Skip over the name + the int giving the sequence length - fullySkip(in, byteBuffer.getInt(0) + 4); - } - } - - private static final class PtrSkipPair { - public long ptr; - public int skip; - - public PtrSkipPair(long p, int s) { - ptr = p; - skip = s; - } - } - - private static PtrSkipPair readAlignment(final ByteBuffer byteBuffer, - final BlockCompressedInputStream in) throws IOException - { - final long ptr = in.getFilePointer(); - final int read = readBytes(byteBuffer, in, 4); - if (read != 4) { - if (read == 0) - return null; - ioError( - "Invalid alignment at virtual offset %#x: "+ - "less than 4 bytes long", in.getFilePointer()); - } - return new PtrSkipPair(ptr, byteBuffer.getInt(0)); - } - - private static void fullySkip(final InputStream in, final int skip) - throws IOException - { - // Skip repeatedly until we're either done skipping or can't skip any - // more, in case some kind of IO error is temporarily preventing it. That - // kind of situation might not necessarily be possible; the docs are - // rather vague about the whole thing. 
- for (int s = skip; s > 0;) { - final long skipped = in.skip(s); - if (skipped == 0) - throw new IOException("Skip failed"); - s -= skipped; - } - } - - private static int readBytes(final ByteBuffer byteBuffer, final InputStream in, - final int n) throws IOException - { - assert n <= byteBuffer.capacity(); - - int read = 0; - while (read < n) { - final int readNow = in.read(byteBuffer.array(), read, n - read); - if (readNow <= 0) - break; - read += readNow; - } - return read; - } - private static boolean readExactlyBytes(final ByteBuffer byteBuffer, - final InputStream in, final int n) throws IOException - { - return readBytes(byteBuffer, in, n) == n; - } - - private static void ioError(String s, Object... va) throws IOException { - throw new IOException(String.format(s, va)); - } + + public static final String OUTPUT_FILE_EXTENSION = ".splitting-bai"; + + // Default to a granularity level of 4096. This is generally sufficient + // for very large BAM files, relative to a maximum heap size in the + // gigabyte range. + public static final int DEFAULT_GRANULARITY = 4096; + private static final int PRINT_EVERY = 500 * 1024 * 1024; + private final OutputStream out; + private final ByteBuffer byteBuffer = ByteBuffer.allocate(8); + private final int granularity; + private final LongBuffer lb; + private long count; + private Method getFirstOffset; + /** + * Prepare to index a BAM file. + * + * @param out the stream to write the index to + */ + public SplittingBAMIndexer(final OutputStream out) { + this(out, SplittingBAMIndexer.DEFAULT_GRANULARITY); + } + + /** + * Prepare to index a BAM file. + * + * @param out the stream to write the index to + * @param granularity write the offset of every n-th alignment to the index + */ + public SplittingBAMIndexer(final OutputStream out, final int granularity) { + this.out = out; + this.lb = byteBuffer.order(ByteOrder.BIG_ENDIAN).asLongBuffer(); + this.granularity = granularity; + } + + public static void main(String[] args) { + if (args.length <= 1) { + System.out.println( + "Usage: SplittingBAMIndexer GRANULARITY [BAM files...]\n\n" + + "Writes, for each GRANULARITY alignments in a BAM file, its " + + "virtual file offset\nas a big-endian 64-bit integer into " + + "[filename].splitting-bai. 
The file is\nterminated by the BAM " + + "file's length, in the same format."); + return; + } + + int granularity; + try { + granularity = Integer.parseInt(args[0]); + } catch (NumberFormatException e) { + granularity = 0; + } + if (granularity <= 0) { + System.err.printf("Granularity must be a positive integer, not '%s'!\n", args[0]); + return; + } + + for (final String arg : Arrays.asList(args).subList(1, args.length)) { + final File f = new File(arg); + System.out.printf("Indexing %s...", f); + try { + SplittingBAMIndexer.index( + new FileInputStream(f), + new BufferedOutputStream(new FileOutputStream(f + OUTPUT_FILE_EXTENSION)), + f.length(), + granularity); + System.out.println(" done."); + } catch (IOException e) { + System.out.println(" FAILED!"); + e.printStackTrace(); + } + } + } + + /** + * Invoke a new SplittingBAMIndexer object, operating on the supplied {@link + * org.apache.hadoop.conf.Configuration} object instead of a supplied argument list + * + * @throws java.lang.IllegalArgumentException if the "input" property is not in the Configuration + */ + public static void run(final Configuration conf) throws IOException { + final String inputString = conf.get("input"); + if (inputString == null) { + throw new IllegalArgumentException( + "String property \"input\" path not found in given Configuration"); + } + + final FileSystem fs = FileSystem.get(conf); + + final Path input = new Path(inputString); + + SplittingBAMIndexer.index( + fs.open(input), + fs.create(input.suffix(OUTPUT_FILE_EXTENSION)), + fs.getFileStatus(input).getLen(), + conf.getInt("granularity", DEFAULT_GRANULARITY)); + } + + /** Perform indexing on the given BAM file, at the granularity level specified. */ + public static void index( + final InputStream rawIn, final OutputStream out, final long inputSize, final int granularity) + throws IOException { + final BlockCompressedInputStream in = new BlockCompressedInputStream(rawIn); + + final ByteBuffer byteBuffer = ByteBuffer.allocate(8); // Enough to fit a long + final LongBuffer lb = byteBuffer.order(ByteOrder.BIG_ENDIAN).asLongBuffer(); + + skipToAlignmentList(byteBuffer, in); + + // Always write the first one to make sure it's not skipped + lb.put(0, in.getFilePointer()); + out.write(byteBuffer.array()); + + long prevPrint = in.getFilePointer() >> 16; + + for (int i = 0; ; ) { + final PtrSkipPair pair = readAlignment(byteBuffer, in); + if (pair == null) { + break; + } + + if (++i == granularity) { + i = 0; + lb.put(0, pair.ptr); + out.write(byteBuffer.array()); + + final long filePos = pair.ptr >> 16; + if (filePos - prevPrint >= PRINT_EVERY) { + System.out.print("-"); + prevPrint = filePos; + } + } + fullySkip(in, pair.skip); + } + lb.put(0, inputSize << 16); + out.write(byteBuffer.array()); + out.close(); + in.close(); + } + + private static void skipToAlignmentList(final ByteBuffer byteBuffer, final InputStream in) + throws IOException { + // Check magic number + if (!readExactlyBytes(byteBuffer, in, 4)) { + ioError("Invalid BAM header: too short, no magic"); + } + + final int magic = byteBuffer.order(ByteOrder.BIG_ENDIAN).getInt(0); + if (magic != 0x42414d01) { + ioError("Invalid BAM header: bad magic %#x != 0x42414d01", magic); + } + + // Skip the SAM header + if (!readExactlyBytes(byteBuffer, in, 4)) { + ioError("Invalid BAM header: too short, no SAM header length"); + } + + byteBuffer.order(ByteOrder.LITTLE_ENDIAN); + + final int samLen = byteBuffer.getInt(0); + if (samLen < 0) { + ioError("Invalid BAM header: negative SAM header length %d", samLen); + } + 
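For illustration only (not part of the patch), the run(Configuration) entry point above can be driven along these lines; the input path is a placeholder resolved against the default Hadoop filesystem, and 4096 simply restates the default granularity.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.seqdoop.hadoop_bam.SplittingBAMIndexer;

public class RunIndexerFromConfiguration {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // "input" is required; "granularity" is optional and defaults to DEFAULT_GRANULARITY.
    conf.set("input", "/data/sample.bam"); // placeholder path on the default filesystem
    conf.setInt("granularity", 4096);
    // Writes /data/sample.bam.splitting-bai next to the input.
    SplittingBAMIndexer.run(conf);
  }
}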
+ fullySkip(in, samLen); + + // Get the number of reference sequences + if (!readExactlyBytes(byteBuffer, in, 4)) { + ioError("Invalid BAM header: too short, no reference sequence count"); + } + + final int referenceSeqs = byteBuffer.getInt(0); + + // Skip over each reference sequence datum individually + for (int s = 0; s < referenceSeqs; ++s) { + if (!readExactlyBytes(byteBuffer, in, 4)) { + ioError("Invalid reference list: EOF before reference %d", s + 1); + } + + // Skip over the name + the int giving the sequence length + fullySkip(in, byteBuffer.getInt(0) + 4); + } + } + + private static PtrSkipPair readAlignment( + final ByteBuffer byteBuffer, final BlockCompressedInputStream in) throws IOException { + final long ptr = in.getFilePointer(); + final int read = readBytes(byteBuffer, in, 4); + if (read != 4) { + if (read == 0) { + return null; + } + ioError( + "Invalid alignment at virtual offset %#x: " + "less than 4 bytes long", + in.getFilePointer()); + } + return new PtrSkipPair(ptr, byteBuffer.getInt(0)); + } + + private static void fullySkip(final InputStream in, final int skip) throws IOException { + // Skip repeatedly until we're either done skipping or can't skip any + // more, in case some kind of IO error is temporarily preventing it. That + // kind of situation might not necessarily be possible; the docs are + // rather vague about the whole thing. + for (int s = skip; s > 0; ) { + final long skipped = in.skip(s); + if (skipped == 0) { + throw new IOException("Skip failed"); + } + s -= skipped; + } + } + + private static int readBytes(final ByteBuffer byteBuffer, final InputStream in, final int n) + throws IOException { + assert n <= byteBuffer.capacity(); + + int read = 0; + while (read < n) { + final int readNow = in.read(byteBuffer.array(), read, n - read); + if (readNow <= 0) { + break; + } + read += readNow; + } + return read; + } + + private static boolean readExactlyBytes( + final ByteBuffer byteBuffer, final InputStream in, final int n) throws IOException { + return readBytes(byteBuffer, in, n) == n; + } + + private static void ioError(String s, Object... va) throws IOException { + throw new IOException(String.format(s, va)); + } + + /** + * Process the given record for the index. + * + * @param rec the record from the file being indexed + */ + public void processAlignment(final SAMRecord rec) throws IOException { + // write an offset for the first record and for the g-th record thereafter (where + // g is the granularity), to be consistent with the index method + if (count == 0 || (count + 1) % granularity == 0) { + SAMFileSource fileSource = rec.getFileSource(); + SAMFileSpan filePointer = fileSource.getFilePointer(); + writeVirtualOffset(getPos(filePointer)); + } + count++; + } + + void processAlignment(final long virtualOffset) throws IOException { + if (count == 0 || (count + 1) % granularity == 0) { + writeVirtualOffset(virtualOffset); + } + count++; + } + + private long getPos(SAMFileSpan filePointer) { + // Use reflection since BAMFileSpan is package private in htsjdk 1.141. Note that + // Hadoop-BAM cannot use a later version of htsjdk since it requires Java 8. 
+ if (getFirstOffset == null) { + try { + getFirstOffset = filePointer.getClass().getDeclaredMethod("getFirstOffset"); + getFirstOffset.setAccessible(true); + } catch (NoSuchMethodException e) { + throw new IllegalStateException(e); + } + } + try { + return (Long) getFirstOffset.invoke(filePointer); + } catch (IllegalAccessException e) { + throw new IllegalStateException(e); + } catch (InvocationTargetException e) { + throw new IllegalStateException(e); + } + } + + /** + * Write the given virtual offset to the index. This method is for internal use only. + * + * @param virtualOffset virtual file pointer + */ + public void writeVirtualOffset(long virtualOffset) throws IOException { + lb.put(0, virtualOffset); + out.write(byteBuffer.array()); + } + + /** + * Complete the index by writing the input BAM file size to the index, and closing the output + * stream. + * + * @param inputSize the size of the input BAM file + */ + public void finish(long inputSize) throws IOException { + writeVirtualOffset(inputSize << 16); + out.close(); + } + + private static final class PtrSkipPair { + + public long ptr; + public int skip; + + public PtrSkipPair(long p, int s) { + ptr = p; + skip = s; + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/VCFFormat.java b/src/main/java/org/seqdoop/hadoop_bam/VCFFormat.java index 890ce33..d5112f0 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/VCFFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/VCFFormat.java @@ -22,63 +22,69 @@ package org.seqdoop.hadoop_bam; -import htsjdk.samtools.util.BlockCompressedInputStream; import java.io.BufferedInputStream; -import java.io.InputStream; import java.io.IOException; - +import java.io.InputStream; import java.util.zip.GZIPInputStream; import org.apache.hadoop.fs.Path; /** Describes a VCF format. */ public enum VCFFormat { - VCF, BCF; - - /** Infers the VCF format by looking at the filename of the given path. - * - * @see #inferFromFilePath(String) - */ - public static VCFFormat inferFromFilePath(final Path path) { - return inferFromFilePath(path.getName()); - } + VCF, + BCF; - /** Infers the VCF format by looking at the extension of the given file - * name. *.vcf is recognized as {@link #VCF} and - * *.bcf as {@link #BCF}. - */ - public static VCFFormat inferFromFilePath(final String name) { - if (name.endsWith(".bcf")) return BCF; - if (name.endsWith(".vcf")) return VCF; - if (name.endsWith(".gz")) return VCF; - if (name.endsWith(".bgz")) return VCF; - return null; - } + /** + * Infers the VCF format by looking at the filename of the given path. + * + * @see #inferFromFilePath(String) + */ + public static VCFFormat inferFromFilePath(final Path path) { + return inferFromFilePath(path.getName()); + } - /** Infers the VCF format by looking at the first few bytes of the input. - */ - public static VCFFormat inferFromData(final InputStream in) throws IOException { - BufferedInputStream bis = new BufferedInputStream(in); // so mark/reset is supported - return inferFromUncompressedData(isGzip(bis) ? new GZIPInputStream(bis) : bis); - } + /** + * Infers the VCF format by looking at the extension of the given file name. *.vcf is + * recognized as {@link #VCF} and *.bcf as {@link #BCF}. 
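As a brief aside (not part of the patch), the two inference paths of VCFFormat shown here can be exercised like this; the file names are placeholders.

import java.io.FileInputStream;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.seqdoop.hadoop_bam.VCFFormat;

public class VCFFormatSniffSketch {
  public static void main(String[] args) throws IOException {
    // By name: .vcf, .gz and .bgz map to VCF, .bcf to BCF, anything else to null.
    VCFFormat byName = VCFFormat.inferFromFilePath(new Path("calls.vcf.gz"));
    // By content: unwraps gzip/BGZF if needed and looks at the first byte
    // ('#' for VCF, 'B' for BCF); the stream is consumed and closed.
    VCFFormat byData = VCFFormat.inferFromData(new FileInputStream("calls.unknown"));
    System.out.println(byName + " / " + byData);
  }
}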
+ */ + public static VCFFormat inferFromFilePath(final String name) { + if (name.endsWith(".bcf")) { + return BCF; + } + if (name.endsWith(".vcf")) { + return VCF; + } + if (name.endsWith(".gz")) { + return VCF; + } + if (name.endsWith(".bgz")) { + return VCF; + } + return null; + } - private static VCFFormat inferFromUncompressedData(final InputStream in) throws IOException { - final byte b = (byte)in.read(); - in.close(); - switch (b) { - case 'B': return BCF; - case '#': return VCF; - } - return null; - } + /** Infers the VCF format by looking at the first few bytes of the input. */ + public static VCFFormat inferFromData(final InputStream in) throws IOException { + BufferedInputStream bis = new BufferedInputStream(in); // so mark/reset is supported + return inferFromUncompressedData(isGzip(bis) ? new GZIPInputStream(bis) : bis); + } - /** - * @return true if the stream is compressed with gzip (or BGZF) - */ - public static boolean isGzip(final InputStream in) throws IOException { - in.mark(1); - final byte b = (byte)in.read(); - in.reset(); - return b == 0x1f; - } + private static VCFFormat inferFromUncompressedData(final InputStream in) throws IOException { + final byte b = (byte) in.read(); + in.close(); + switch (b) { + case 'B': + return BCF; + case '#': + return VCF; + } + return null; + } + /** @return true if the stream is compressed with gzip (or BGZF) */ + public static boolean isGzip(final InputStream in) throws IOException { + in.mark(1); + final byte b = (byte) in.read(); + in.reset(); + return b == 0x1f; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/VCFInputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/VCFInputFormat.java index 846ec6d..c45c85f 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/VCFInputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/VCFInputFormat.java @@ -22,6 +22,7 @@ package org.seqdoop.hadoop_bam; +import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.Locatable; @@ -39,7 +40,6 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Map; - import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; @@ -50,429 +50,431 @@ import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.SplittableCompressionCodec; -import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; - -import htsjdk.samtools.seekablestream.SeekableStream; - -import org.seqdoop.hadoop_bam.util.BGZFEnhancedGzipCodec; import org.seqdoop.hadoop_bam.util.BGZFCodec; +import org.seqdoop.hadoop_bam.util.BGZFEnhancedGzipCodec; import org.seqdoop.hadoop_bam.util.IntervalUtil; import org.seqdoop.hadoop_bam.util.WrapSeekable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** An {@link org.apache.hadoop.mapreduce.InputFormat} for VCF files. Values - * are the individual records; see {@link VCFRecordReader} for the meaning of - * the key. +/** + * An {@link org.apache.hadoop.mapreduce.InputFormat} for VCF files. 
Values are the individual + * records; see {@link VCFRecordReader} for the meaning of the key. */ -public class VCFInputFormat - extends FileInputFormat -{ - private static final Logger logger = LoggerFactory.getLogger(VCFInputFormat.class); - - /** Whether file extensions are to be trusted, defaults to true. - * - * @see VCFFormat#inferFromFilePath - */ - - public static final String TRUST_EXTS_PROPERTY = - "hadoopbam.vcf.trust-exts"; - - /** - * Filter by region, like -L in SAMtools. Takes a comma-separated - * list of intervals, e.g. chr1:1-20000,chr2:12000-20000. For - * programmatic use {@link #setIntervals(Configuration, List)} should be preferred. - */ - public static final String INTERVALS_PROPERTY = "hadoopbam.vcf.intervals"; - - public static void setIntervals(Configuration conf, - List intervals) { - StringBuilder sb = new StringBuilder(); - for (Iterator it = intervals.iterator(); it.hasNext(); ) { - Locatable l = it.next(); - sb.append(String.format("%s:%d-%d", l.getContig(), l.getStart(), l.getEnd())); - if (it.hasNext()) { - sb.append(","); - } - } - conf.set(INTERVALS_PROPERTY, sb.toString()); - } - - static List getIntervals(Configuration conf) { - return IntervalUtil.getIntervals(conf, INTERVALS_PROPERTY); - } - - private final Map formatMap; - private final boolean givenMap; - - private Configuration conf; - private boolean trustExts; - - /** Creates a new input format, which will use the - * Configuration from the first public method called. Thus this - * will behave as though constructed with a Configuration - * directly, but only after it has received it in - * createRecordReader (via the TaskAttemptContext) - * or isSplitable or getSplits (via the - * JobContext). Until then, other methods will throw an {@link - * IllegalStateException}. - * - * This constructor exists mainly as a convenience, e.g. so that - * VCFInputFormat can be used directly in - * Job.setInputFormatClass. - */ - public VCFInputFormat() { - this.formatMap = new HashMap(); - this.givenMap = false; - this.conf = null; - } - - /** Creates a new input format, reading {@link #TRUST_EXTS_PROPERTY} from - * the given Configuration. - */ - public VCFInputFormat(Configuration conf) { - this.formatMap = new HashMap(); - this.conf = conf; - this.trustExts = conf.getBoolean(TRUST_EXTS_PROPERTY, true); - this.givenMap = false; - } - - /** Creates a new input format, trusting the given Map to - * define the file-to-format associations. Neither file paths nor their - * contents are looked at, only the Map is used. - * - *
<p>
The Map is not copied, so it should not be modified while - * this input format is in use!
</p>
- * */ - public VCFInputFormat(Map formatMap) { - this.formatMap = formatMap; - this.givenMap = true; - - // Arbitrary values. - this.conf = null; - this.trustExts = false; - } - - /** Returns the {@link VCFFormat} corresponding to the given path. Returns - * null if it cannot be determined even based on the file - * contents (unless future VCF/BCF formats are very different, this means - * that the path does not refer to a VCF or BCF file). - * - *
<p>
If this input format was constructed using a given - * Map<Path,VCFFormat> and the path is not contained - * within that map, throws an {@link IllegalArgumentException}.
</p>
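To make the contract just described concrete, here is a small sketch (not part of the patch) of the map-based constructor; the paths are placeholders.

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.seqdoop.hadoop_bam.VCFFormat;
import org.seqdoop.hadoop_bam.VCFInputFormat;

public class ExplicitFormatMapSketch {
  public static void main(String[] args) {
    // With this constructor neither extensions nor file contents are consulted,
    // only the supplied map (paths are placeholders).
    Map<Path, VCFFormat> formats = new HashMap<>();
    formats.put(new Path("/data/a.data"), VCFFormat.VCF);
    formats.put(new Path("/data/b.data"), VCFFormat.BCF);

    VCFInputFormat inputFormat = new VCFInputFormat(formats);
    System.out.println(inputFormat.getFormat(new Path("/data/a.data"))); // VCF
    // Asking for a path missing from the map throws IllegalArgumentException, as noted above.
  }
}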
- */ - public VCFFormat getFormat(final Path path) { - VCFFormat fmt = formatMap.get(path); - if (fmt != null || formatMap.containsKey(path)) - return fmt; - - if (givenMap) - throw new IllegalArgumentException( - "VCF format for '"+path+"' not in given map"); - - if (this.conf == null) - throw new IllegalStateException("Don't have a Configuration yet"); - - if (trustExts) { - final VCFFormat f = VCFFormat.inferFromFilePath(path); - if (f != null) { - - formatMap.put(path, f); - return f; - } - } - - try(InputStream is = path.getFileSystem(conf).open(path)) { - fmt = VCFFormat.inferFromData(is); - } catch (IOException e) {} - - formatMap.put(path, fmt); - return fmt; - } - - @Override - protected boolean isSplitable(JobContext context, Path filename) { - Configuration conf = context.getConfiguration(); - final CompressionCodec codec = - new CompressionCodecFactory(context.getConfiguration()).getCodec(filename); - if (codec == null) { - return true; - } - if (codec instanceof BGZFCodec || codec instanceof BGZFEnhancedGzipCodec) { - boolean splittable; - try { - try (FSDataInputStream in = filename.getFileSystem(conf).open(filename)) { - splittable = BlockCompressedInputStream.isValidFile(new BufferedInputStream(in)); - } - } catch (IOException e) { - // can't determine if BGZF or GZIP, conservatively assume latter - splittable = false; - } - if (!splittable) { - logger.warn("{} is not splittable, consider using block-compressed gzip (BGZF)", filename); - } - return splittable; - } else if (codec instanceof GzipCodec) { - logger.warn("Using GzipCodec, which is not splittable, consider using block compressed gzip (BGZF) and BGZFCodec/BGZFEnhancedGzipCodec."); - } - return codec instanceof SplittableCompressionCodec; - } - - /** Returns a {@link BCFRecordReader} or {@link VCFRecordReader} as - * appropriate, initialized with the given parameters. - * - *
<p>
Throws {@link IllegalArgumentException} if the given input split is - * not a {@link FileVirtualSplit} or a {@link FileSplit}, or if the path - * referred to is not recognized as a VCF or BCF file (see {@link - * #getFormat}).
</p>
- */ - @Override public RecordReader - createRecordReader(InputSplit split, TaskAttemptContext ctx) - throws InterruptedException, IOException - { - final Path path; - if (split instanceof FileSplit) - path = ((FileSplit)split).getPath(); - else if (split instanceof FileVirtualSplit) - path = ((FileVirtualSplit)split).getPath(); - else - throw new IllegalArgumentException( - "split '"+split+"' has unknown type: cannot extract path"); - - if (this.conf == null) - this.conf = ctx.getConfiguration(); - - final VCFFormat fmt = getFormat(path); - if (fmt == null) - throw new IllegalArgumentException( - "unknown VCF format, cannot create RecordReader: "+path); - - final RecordReader rr; - - switch (fmt) { - case VCF: rr = new VCFRecordReader(); break; - case BCF: rr = new BCFRecordReader(); break; - default: assert false; return null; - } - - rr.initialize(split, ctx); - return rr; - } - - /** Defers to {@link BCFSplitGuesser} as appropriate for each individual - * path. VCF paths do not require special handling, so their splits are left - * unchanged. - */ - @Override public List getSplits(JobContext job) - throws IOException - { - if (this.conf == null) - this.conf = job.getConfiguration(); - - final List origSplits = super.getSplits(job); - - // We have to partition the splits by input format and hand the BCF ones - // over to getBCFSplits(). - - final List - bcfOrigSplits = new ArrayList(origSplits.size()); - final List - newSplits = new ArrayList(origSplits.size()); - - for (final InputSplit iSplit : origSplits) { - final FileSplit split = (FileSplit)iSplit; - - if (VCFFormat.BCF.equals(getFormat(split.getPath()))) - bcfOrigSplits.add(split); - else - newSplits.add(split); - } - fixBCFSplits(bcfOrigSplits, newSplits); - return filterByInterval(newSplits, conf); - } - - // The given FileSplits should all be for BCF files. Adds InputSplits - // aligned to record boundaries. Compressed BCF results in - // FileVirtualSplits, uncompressed in FileSplits. - private void fixBCFSplits( - List splits, List newSplits) - throws IOException - { - // addGuessedSplits() requires the given splits to be sorted by file - // path, so do so. Although FileInputFormat.getSplits() does, at the time - // of writing this, generate them in that order, we shouldn't rely on it. - Collections.sort(splits, new Comparator() { - public int compare(FileSplit a, FileSplit b) { - return a.getPath().compareTo(b.getPath()); - } - }); - - for (int i = 0; i < splits.size();) - i = addGuessedSplits(splits, i, newSplits); - } - - // Handles all the splits that share the Path of the one at index i, - // returning the next index to be used. - private int addGuessedSplits( - List splits, int i, List newSplits) - throws IOException - { - final Path path = splits.get(i).getPath(); - final SeekableStream sin = WrapSeekable.openPath(conf, path); - - final BCFSplitGuesser guesser = new BCFSplitGuesser(sin); - - final boolean isBGZF = guesser.isBGZF(); - - InputSplit prevSplit = null; - - for (; i < splits.size(); ++i) { - final FileSplit fspl = splits.get(i); - if (!fspl.getPath().equals(path)) - break; - - final String[] locs = fspl.getLocations(); - - final long beg = fspl.getStart(); - final long end = beg + fspl.getLength(); - - final long alignBeg = guesser.guessNextBCFRecordStart(beg, end); - - // As the guesser goes to the next BGZF block before looking for BCF - // records, the ending BGZF blocks have to always be traversed fully. - // Hence force the length to be 0xffff, the maximum possible. - final long alignEnd = isBGZF ? 
end << 16 | 0xffff : end; - - final long length = alignEnd - alignBeg; - - if (alignBeg == end) { - // No records detected in this split: merge it to the previous one. - // This could legitimately happen e.g. if we have a split that is - // so small that it only contains the middle part of a BGZF block. - // - // Of course, if it's the first split, then this is simply not a - // valid BCF file. - // - // FIXME: In theory, any number of splits could only contain parts - // of the BCF header before we start to see splits that contain BCF - // records. For now, we require that the split size is at least as - // big as the header and don't handle that case. - if (prevSplit == null) - throw new IOException("'" + path + "': no records in first "+ - "split: bad BCF file or tiny split size?"); - - if (isBGZF) { - ((FileVirtualSplit)prevSplit).setEndVirtualOffset(alignEnd); - continue; - } - prevSplit = new FileSplit(path, alignBeg, length, locs); - newSplits.remove(newSplits.size() - 1); - } else { - prevSplit = - isBGZF ? new FileVirtualSplit(path, alignBeg, alignEnd, locs) - : new FileSplit (path, alignBeg, length, locs); - } - newSplits.add(prevSplit); - } - - sin.close(); - return i; - } - - private List filterByInterval(List splits, Configuration conf) - throws IOException { - List intervals = getIntervals(conf); - if (intervals == null) { - return splits; - } - List blocks = new ArrayList<>(); - Set vcfFiles = new LinkedHashSet(); - for (InputSplit split : splits) { - if (split instanceof FileSplit) { - vcfFiles.add(((FileSplit) split).getPath()); - } else if (split instanceof FileVirtualSplit) { - vcfFiles.add(((FileVirtualSplit) split).getPath()); - } else { - throw new IllegalArgumentException( - "split '"+split+"' has unknown type: cannot extract path"); - } - } - for (Path vcfFile : vcfFiles) { - Path indexFile = vcfFile.suffix(TabixUtils.STANDARD_INDEX_EXTENSION); - FileSystem fs = vcfFile.getFileSystem(conf); - if (!fs.exists(indexFile)) { - logger.warn( - "No tabix index file found for {}, splits will not be filtered, which may be very inefficient", - indexFile); - return splits; - } - - try (InputStream in = new BlockCompressedInputStream(fs.open(indexFile))) { - TabixIndex index = new TabixIndex(in); - for (Locatable interval : intervals) { - String contig = interval.getContig(); - int intervalStart = interval.getStart(); - int intervalEnd = interval.getEnd(); - blocks.addAll(index.getBlocks(contig, intervalStart, intervalEnd)); - } - } - } - - // Use the blocks to filter the splits - List filteredSplits = new ArrayList(); - for (InputSplit split : splits) { - if (split instanceof FileSplit) { - FileSplit fileSplit = (FileSplit) split; - long splitStart = fileSplit.getStart() << 16; - long splitEnd = (fileSplit.getStart() + fileSplit.getLength()) << 16; - // if any block overlaps with the split, keep the split, but don't adjust its size - // as the BGZF block decompression is handled by BGZFCodec, not by the reader - // directly - for (Block block : blocks) { - long blockStart = block.getStartPosition(); - long blockEnd = block.getEndPosition(); - if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) { - filteredSplits.add(split); - break; - } - } - } else { - FileVirtualSplit virtualSplit = (FileVirtualSplit) split; - long splitStart = virtualSplit.getStartVirtualOffset(); - long splitEnd = virtualSplit.getEndVirtualOffset(); - // if any block overlaps with the split, keep the split, but adjust the start and - // end to the maximally overlapping portion for all blocks that 
overlap - long newStart = Long.MAX_VALUE; - long newEnd = Long.MIN_VALUE; - boolean overlaps = false; - for (Block block : blocks) { - long blockStart = block.getStartPosition(); - long blockEnd = block.getEndPosition(); - if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) { - long overlapStart = Math.max(splitStart, blockStart); - long overlapEnd = Math.min(splitEnd, blockEnd); - newStart = Math.min(newStart, overlapStart); - newEnd = Math.max(newEnd, overlapEnd); - overlaps = true; - } - } - if (overlaps) { - filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), newStart, newEnd, - virtualSplit.getLocations())); - } - } - } - return filteredSplits; - } - - private static boolean overlaps(long start, long end, long start2, long end2) { - return (start2 >= start && start2 <= end) || (end2 >=start && end2 <= end) || - (start >= start2 && end <= end2); - } +public class VCFInputFormat extends FileInputFormat { + + /** + * Whether file extensions are to be trusted, defaults to true. + * + * @see VCFFormat#inferFromFilePath + */ + public static final String TRUST_EXTS_PROPERTY = "hadoopbam.vcf.trust-exts"; + /** + * Filter by region, like -L in SAMtools. Takes a comma-separated list of intervals, + * e.g. chr1:1-20000,chr2:12000-20000. For programmatic use {@link + * #setIntervals(Configuration, List)} should be preferred. + */ + public static final String INTERVALS_PROPERTY = "hadoopbam.vcf.intervals"; + + private static final Logger logger = LoggerFactory.getLogger(VCFInputFormat.class); + private final Map formatMap; + private final boolean givenMap; + private Configuration conf; + private boolean trustExts; + + /** + * Creates a new input format, which will use the Configuration from the first public + * method called. Thus this will behave as though constructed with a Configuration + * directly, but only after it has received it in createRecordReader (via the + * TaskAttemptContext) or isSplitable or getSplits (via the + * JobContext). Until then, other methods will throw an {@link + * IllegalStateException}. + * + *
<p>
This constructor exists mainly as a convenience, e.g. so that VCFInputFormat + * can be used directly in Job.setInputFormatClass. + */ + public VCFInputFormat() { + this.formatMap = new HashMap(); + this.givenMap = false; + this.conf = null; + } + /** + * Creates a new input format, reading {@link #TRUST_EXTS_PROPERTY} from the given + * Configuration. + */ + public VCFInputFormat(Configuration conf) { + this.formatMap = new HashMap(); + this.conf = conf; + this.trustExts = conf.getBoolean(TRUST_EXTS_PROPERTY, true); + this.givenMap = false; + } + + /** + * Creates a new input format, trusting the given Map to define the file-to-format + * associations. Neither file paths nor their contents are looked at, only the Map is + * used. + * + *
<p>
The Map is not copied, so it should not be modified while this input format is + * in use! + */ + public VCFInputFormat(Map formatMap) { + this.formatMap = formatMap; + this.givenMap = true; + + // Arbitrary values. + this.conf = null; + this.trustExts = false; + } + + public static void setIntervals(Configuration conf, List intervals) { + StringBuilder sb = new StringBuilder(); + for (Iterator it = intervals.iterator(); it.hasNext(); ) { + Locatable l = it.next(); + sb.append(String.format("%s:%d-%d", l.getContig(), l.getStart(), l.getEnd())); + if (it.hasNext()) { + sb.append(","); + } + } + conf.set(INTERVALS_PROPERTY, sb.toString()); + } + + static List getIntervals(Configuration conf) { + return IntervalUtil.getIntervals(conf, INTERVALS_PROPERTY); + } + + private static boolean overlaps(long start, long end, long start2, long end2) { + return (start2 >= start && start2 <= end) + || (end2 >= start && end2 <= end) + || (start >= start2 && end <= end2); + } + + /** + * Returns the {@link VCFFormat} corresponding to the given path. Returns null if it + * cannot be determined even based on the file contents (unless future VCF/BCF formats are very + * different, this means that the path does not refer to a VCF or BCF file). + * + *
<p>
If this input format was constructed using a given Map<Path,VCFFormat> + * and the path is not contained within that map, throws an {@link IllegalArgumentException}. + */ + public VCFFormat getFormat(final Path path) { + VCFFormat fmt = formatMap.get(path); + if (fmt != null || formatMap.containsKey(path)) { + return fmt; + } + + if (givenMap) { + throw new IllegalArgumentException("VCF format for '" + path + "' not in given map"); + } + + if (this.conf == null) { + throw new IllegalStateException("Don't have a Configuration yet"); + } + + if (trustExts) { + final VCFFormat f = VCFFormat.inferFromFilePath(path); + if (f != null) { + + formatMap.put(path, f); + return f; + } + } + + try (InputStream is = path.getFileSystem(conf).open(path)) { + fmt = VCFFormat.inferFromData(is); + } catch (IOException e) { + } + + formatMap.put(path, fmt); + return fmt; + } + + @Override + protected boolean isSplitable(JobContext context, Path filename) { + Configuration conf = context.getConfiguration(); + final CompressionCodec codec = + new CompressionCodecFactory(context.getConfiguration()).getCodec(filename); + if (codec == null) { + return true; + } + if (codec instanceof BGZFCodec || codec instanceof BGZFEnhancedGzipCodec) { + boolean splittable; + try { + try (FSDataInputStream in = filename.getFileSystem(conf).open(filename)) { + splittable = BlockCompressedInputStream.isValidFile(new BufferedInputStream(in)); + } + } catch (IOException e) { + // can't determine if BGZF or GZIP, conservatively assume latter + splittable = false; + } + if (!splittable) { + logger.warn("{} is not splittable, consider using block-compressed gzip (BGZF)", filename); + } + return splittable; + } else if (codec instanceof GzipCodec) { + logger.warn( + "Using GzipCodec, which is not splittable, consider using block compressed gzip (BGZF) and BGZFCodec/BGZFEnhancedGzipCodec."); + } + return codec instanceof SplittableCompressionCodec; + } + + /** + * Returns a {@link BCFRecordReader} or {@link VCFRecordReader} as appropriate, initialized with + * the given parameters. + * + *
<p>
Throws {@link IllegalArgumentException} if the given input split is not a {@link + * FileVirtualSplit} or a {@link FileSplit}, or if the path referred to is not recognized as a VCF + * or BCF file (see {@link #getFormat}). + */ + @Override + public RecordReader createRecordReader( + InputSplit split, TaskAttemptContext ctx) throws InterruptedException, IOException { + final Path path; + if (split instanceof FileSplit) { + path = ((FileSplit) split).getPath(); + } else if (split instanceof FileVirtualSplit) { + path = ((FileVirtualSplit) split).getPath(); + } else { + throw new IllegalArgumentException( + "split '" + split + "' has unknown type: cannot extract path"); + } + + if (this.conf == null) { + this.conf = ctx.getConfiguration(); + } + + final VCFFormat fmt = getFormat(path); + if (fmt == null) { + throw new IllegalArgumentException("unknown VCF format, cannot create RecordReader: " + path); + } + + final RecordReader rr; + + switch (fmt) { + case VCF: + rr = new VCFRecordReader(); + break; + case BCF: + rr = new BCFRecordReader(); + break; + default: + assert false; + return null; + } + + rr.initialize(split, ctx); + return rr; + } + + /** + * Defers to {@link BCFSplitGuesser} as appropriate for each individual path. VCF paths do not + * require special handling, so their splits are left unchanged. + */ + @Override + public List getSplits(JobContext job) throws IOException { + if (this.conf == null) { + this.conf = job.getConfiguration(); + } + + final List origSplits = super.getSplits(job); + + // We have to partition the splits by input format and hand the BCF ones + // over to getBCFSplits(). + + final List bcfOrigSplits = new ArrayList(origSplits.size()); + final List newSplits = new ArrayList(origSplits.size()); + + for (final InputSplit iSplit : origSplits) { + final FileSplit split = (FileSplit) iSplit; + + if (VCFFormat.BCF.equals(getFormat(split.getPath()))) { + bcfOrigSplits.add(split); + } else { + newSplits.add(split); + } + } + fixBCFSplits(bcfOrigSplits, newSplits); + return filterByInterval(newSplits, conf); + } + + // The given FileSplits should all be for BCF files. Adds InputSplits + // aligned to record boundaries. Compressed BCF results in + // FileVirtualSplits, uncompressed in FileSplits. + private void fixBCFSplits(List splits, List newSplits) throws IOException { + // addGuessedSplits() requires the given splits to be sorted by file + // path, so do so. Although FileInputFormat.getSplits() does, at the time + // of writing this, generate them in that order, we shouldn't rely on it. + Collections.sort( + splits, + new Comparator() { + public int compare(FileSplit a, FileSplit b) { + return a.getPath().compareTo(b.getPath()); + } + }); + + for (int i = 0; i < splits.size(); ) { + i = addGuessedSplits(splits, i, newSplits); + } + } + + // Handles all the splits that share the Path of the one at index i, + // returning the next index to be used. 
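To tie the public knobs of VCFInputFormat together, a hypothetical job-setup sketch (not part of the patch). The paths and the contig name are placeholders, the exact generic signature of setIntervals is elided in the reformatted text above (the sketch assumes any htsjdk Locatable such as Interval is accepted), and how the BGZF codecs get registered may differ in your deployment.

import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Locatable;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.seqdoop.hadoop_bam.VCFInputFormat;
import org.seqdoop.hadoop_bam.util.BGZFCodec;
import org.seqdoop.hadoop_bam.util.BGZFEnhancedGzipCodec;

public class VCFJobSetupSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();

    // Trust file extensions when deciding between VCF and BCF (the default behaviour).
    conf.setBoolean(VCFInputFormat.TRUST_EXTS_PROPERTY, true);

    // Restrict reading to the given regions; with a tabix index present this also
    // prunes splits, as getSplits/filterByInterval above shows.
    List<Locatable> intervals = Arrays.<Locatable>asList(new Interval("20", 1, 100000));
    VCFInputFormat.setIntervals(conf, intervals);

    // Registering the BGZF codecs is how block-compressed .gz/.bgz inputs become
    // splittable (see isSplitable above); exact codec configuration may vary per cluster.
    conf.set(
        "io.compression.codecs",
        BGZFCodec.class.getCanonicalName() + "," + BGZFEnhancedGzipCodec.class.getCanonicalName());

    Job job = Job.getInstance(conf, "read-vcf");
    job.setInputFormatClass(VCFInputFormat.class);
    FileInputFormat.addInputPath(job, new Path("/data/calls.vcf.gz")); // hypothetical input
    // ... set mapper/reducer and output as usual, then submit the job.
  }
}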
+ private int addGuessedSplits(List splits, int i, List newSplits) + throws IOException { + final Path path = splits.get(i).getPath(); + final SeekableStream sin = WrapSeekable.openPath(conf, path); + + final BCFSplitGuesser guesser = new BCFSplitGuesser(sin); + + final boolean isBGZF = guesser.isBGZF(); + + InputSplit prevSplit = null; + + for (; i < splits.size(); ++i) { + final FileSplit fspl = splits.get(i); + if (!fspl.getPath().equals(path)) { + break; + } + + final String[] locs = fspl.getLocations(); + + final long beg = fspl.getStart(); + final long end = beg + fspl.getLength(); + + final long alignBeg = guesser.guessNextBCFRecordStart(beg, end); + + // As the guesser goes to the next BGZF block before looking for BCF + // records, the ending BGZF blocks have to always be traversed fully. + // Hence force the length to be 0xffff, the maximum possible. + final long alignEnd = isBGZF ? end << 16 | 0xffff : end; + + final long length = alignEnd - alignBeg; + + if (alignBeg == end) { + // No records detected in this split: merge it to the previous one. + // This could legitimately happen e.g. if we have a split that is + // so small that it only contains the middle part of a BGZF block. + // + // Of course, if it's the first split, then this is simply not a + // valid BCF file. + // + // FIXME: In theory, any number of splits could only contain parts + // of the BCF header before we start to see splits that contain BCF + // records. For now, we require that the split size is at least as + // big as the header and don't handle that case. + if (prevSplit == null) { + throw new IOException( + "'" + path + "': no records in first " + "split: bad BCF file or tiny split size?"); + } + + if (isBGZF) { + ((FileVirtualSplit) prevSplit).setEndVirtualOffset(alignEnd); + continue; + } + prevSplit = new FileSplit(path, alignBeg, length, locs); + newSplits.remove(newSplits.size() - 1); + } else { + prevSplit = + isBGZF + ? 
new FileVirtualSplit(path, alignBeg, alignEnd, locs) + : new FileSplit(path, alignBeg, length, locs); + } + newSplits.add(prevSplit); + } + + sin.close(); + return i; + } + + private List filterByInterval(List splits, Configuration conf) + throws IOException { + List intervals = getIntervals(conf); + if (intervals == null) { + return splits; + } + List blocks = new ArrayList<>(); + Set vcfFiles = new LinkedHashSet(); + for (InputSplit split : splits) { + if (split instanceof FileSplit) { + vcfFiles.add(((FileSplit) split).getPath()); + } else if (split instanceof FileVirtualSplit) { + vcfFiles.add(((FileVirtualSplit) split).getPath()); + } else { + throw new IllegalArgumentException( + "split '" + split + "' has unknown type: cannot extract path"); + } + } + for (Path vcfFile : vcfFiles) { + Path indexFile = vcfFile.suffix(TabixUtils.STANDARD_INDEX_EXTENSION); + FileSystem fs = vcfFile.getFileSystem(conf); + if (!fs.exists(indexFile)) { + logger.warn( + "No tabix index file found for {}, splits will not be filtered, which may be very inefficient", + indexFile); + return splits; + } + + try (InputStream in = new BlockCompressedInputStream(fs.open(indexFile))) { + TabixIndex index = new TabixIndex(in); + for (Locatable interval : intervals) { + String contig = interval.getContig(); + int intervalStart = interval.getStart(); + int intervalEnd = interval.getEnd(); + blocks.addAll(index.getBlocks(contig, intervalStart, intervalEnd)); + } + } + } + + // Use the blocks to filter the splits + List filteredSplits = new ArrayList(); + for (InputSplit split : splits) { + if (split instanceof FileSplit) { + FileSplit fileSplit = (FileSplit) split; + long splitStart = fileSplit.getStart() << 16; + long splitEnd = (fileSplit.getStart() + fileSplit.getLength()) << 16; + // if any block overlaps with the split, keep the split, but don't adjust its size + // as the BGZF block decompression is handled by BGZFCodec, not by the reader + // directly + for (Block block : blocks) { + long blockStart = block.getStartPosition(); + long blockEnd = block.getEndPosition(); + if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) { + filteredSplits.add(split); + break; + } + } + } else { + FileVirtualSplit virtualSplit = (FileVirtualSplit) split; + long splitStart = virtualSplit.getStartVirtualOffset(); + long splitEnd = virtualSplit.getEndVirtualOffset(); + // if any block overlaps with the split, keep the split, but adjust the start and + // end to the maximally overlapping portion for all blocks that overlap + long newStart = Long.MAX_VALUE; + long newEnd = Long.MIN_VALUE; + boolean overlaps = false; + for (Block block : blocks) { + long blockStart = block.getStartPosition(); + long blockEnd = block.getEndPosition(); + if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) { + long overlapStart = Math.max(splitStart, blockStart); + long overlapEnd = Math.min(splitEnd, blockEnd); + newStart = Math.min(newStart, overlapStart); + newEnd = Math.max(newEnd, overlapEnd); + overlaps = true; + } + } + if (overlaps) { + filteredSplits.add( + new FileVirtualSplit( + virtualSplit.getPath(), newStart, newEnd, virtualSplit.getLocations())); + } + } + } + return filteredSplits; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/VCFOutputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/VCFOutputFormat.java index 36768f9..ef407e8 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/VCFOutputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/VCFOutputFormat.java @@ -25,34 +25,35 @@ import 
org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -/** An abstract {@link org.apache.hadoop.mapreduce.OutputFormat} for VCF and - * BCF files. Only locks down the value type and stores the output format - * requested. +/** + * An abstract {@link org.apache.hadoop.mapreduce.OutputFormat} for VCF and BCF files. Only locks + * down the value type and stores the output format requested. */ -public abstract class VCFOutputFormat - extends FileOutputFormat -{ - /** A string property defining the output format to use. The value is read - * directly by {@link VCFFormat#valueOf}. - */ - public static final String OUTPUT_VCF_FORMAT_PROPERTY = - "hadoopbam.vcf.output-format"; - - protected VCFFormat format; - - /** Creates a new output format, reading {@link #OUTPUT_VCF_FORMAT_PROPERTY} - * from the given Configuration. - */ - protected VCFOutputFormat(Configuration conf) { - final String fmtStr = conf.get(OUTPUT_VCF_FORMAT_PROPERTY); - - format = fmtStr == null ? null : VCFFormat.valueOf(fmtStr); - } - - /** Creates a new output format for the given VCF format. */ - protected VCFOutputFormat(VCFFormat fmt) { - if (fmt == null) - throw new IllegalArgumentException("null VCFFormat"); - format = fmt; - } +public abstract class VCFOutputFormat extends FileOutputFormat { + + /** + * A string property defining the output format to use. The value is read directly by {@link + * VCFFormat#valueOf}. + */ + public static final String OUTPUT_VCF_FORMAT_PROPERTY = "hadoopbam.vcf.output-format"; + + protected VCFFormat format; + + /** + * Creates a new output format, reading {@link #OUTPUT_VCF_FORMAT_PROPERTY} from the given + * Configuration. + */ + protected VCFOutputFormat(Configuration conf) { + final String fmtStr = conf.get(OUTPUT_VCF_FORMAT_PROPERTY); + + format = fmtStr == null ? null : VCFFormat.valueOf(fmtStr); + } + + /** Creates a new output format for the given VCF format. */ + protected VCFOutputFormat(VCFFormat fmt) { + if (fmt == null) { + throw new IllegalArgumentException("null VCFFormat"); + } + format = fmt; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/VCFRecordReader.java b/src/main/java/org/seqdoop/hadoop_bam/VCFRecordReader.java index d21d70f..cacba28 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/VCFRecordReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/VCFRecordReader.java @@ -34,7 +34,6 @@ import htsjdk.variant.vcf.VCFContigHeaderLine; import htsjdk.variant.vcf.VCFHeader; import java.io.IOException; -import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -57,162 +56,161 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** The key is the bitwise OR of the chromosome index in the upper 32 bits - * and the 0-based leftmost coordinate in the lower. +/** + * The key is the bitwise OR of the chromosome index in the upper 32 bits and the 0-based leftmost + * coordinate in the lower. * - * The chromosome index is based on the ordering of the contig lines in the VCF - * header. If a chromosome name that cannot be found in the contig lines is - * used, that name is instead hashed to form the upper part of the key. + *

The chromosome index is based on the ordering of the contig lines in the VCF header. If a + * chromosome name that cannot be found in the contig lines is used, that name is instead hashed to + * form the upper part of the key. */ -public class VCFRecordReader - extends RecordReader -{ - - private static final Logger logger = LoggerFactory.getLogger(VCFRecordReader.class); - - /** A String property corresponding to a ValidationStringency - * value. If set, the given stringency is used when any part of the - * Hadoop-BAM library reads VCF. - */ - public static final String VALIDATION_STRINGENCY_PROPERTY = - "hadoopbam.vcfrecordreader.validation-stringency"; - - static ValidationStringency getValidationStringency( - final Configuration conf) - { - final String p = conf.get(VALIDATION_STRINGENCY_PROPERTY); - return p == null ? ValidationStringency.STRICT : ValidationStringency.valueOf(p); - } - - public static void setValidationStringency( - final Configuration conf, - final ValidationStringency stringency) - { - conf.set(VALIDATION_STRINGENCY_PROPERTY, stringency.toString()); - } - - - private final LongWritable key = new LongWritable(); - private final VariantContextWritable vc = new VariantContextWritable(); - - private VCFCodec codec = new VCFCodec(); - private LineRecordReader lineRecordReader = new LineRecordReader(); - - private VCFHeader header; - - private final Map contigDict = - new HashMap(); - - private List intervals; - private OverlapDetector overlapDetector; - - private ValidationStringency stringency; - - @Override public void initialize(InputSplit spl, TaskAttemptContext ctx) - throws IOException - { - final FileSplit split = (FileSplit)spl; - - final Path file = split.getPath(); - final FileSystem fs = file.getFileSystem(ctx.getConfiguration()); - - final FSDataInputStream ins = fs.open(file); - - CompressionCodec compressionCodec = - new CompressionCodecFactory(ctx.getConfiguration()).getCodec(file); - AsciiLineReader reader; - if (compressionCodec == null) { - reader = new AsciiLineReader(ins); - } else { - Decompressor decompressor = CodecPool.getDecompressor(compressionCodec); - CompressionInputStream in = compressionCodec.createInputStream(ins, - decompressor); - reader = new AsciiLineReader(in); - } - - AsciiLineReaderIterator it = new AsciiLineReaderIterator(reader); - - final FeatureCodecHeader h = codec.readHeader(it); - if (h == null || !(h.getHeaderValue() instanceof VCFHeader)) - throw new IOException("No VCF header found in "+ file); - - header = (VCFHeader) h.getHeaderValue(); - - contigDict.clear(); - int i = 0; - for (final VCFContigHeaderLine contig : header.getContigLines()) - contigDict.put(contig.getID(), i++); - - lineRecordReader.initialize(spl, ctx); - - intervals = VCFInputFormat.getIntervals(ctx.getConfiguration()); - if (intervals != null) { - overlapDetector = OverlapDetector.create(intervals); - } - - stringency = VCFRecordReader.getValidationStringency(ctx.getConfiguration()); - } - @Override public void close() throws IOException { lineRecordReader.close(); } - - @Override public float getProgress() throws IOException { - return lineRecordReader.getProgress(); - } - - @Override public LongWritable getCurrentKey () { return key; } - @Override public VariantContextWritable getCurrentValue() { return vc; } - - @Override public boolean nextKeyValue() throws IOException { - while (true) { - String line; - while (true) { - if (!lineRecordReader.nextKeyValue()) { - return false; - } - line = lineRecordReader.getCurrentValue().toString(); - if 
(!line.startsWith("#")) { - break; - } - } - - final VariantContext v; - try { - v = codec.decode(line); - } catch (TribbleException e) { - if (stringency == ValidationStringency.STRICT) { - if (logger.isErrorEnabled()) { - logger.error("Parsing line {} failed with {}.", line, e); - } - throw e; - } else { - if (stringency == ValidationStringency.LENIENT && - logger.isWarnEnabled()) { - logger.warn("Parsing line {} failed with {}. Skipping...", - line, e); - } - continue; - } - } - - if (!overlaps(v)) { - continue; - } - - Integer chromIdx = contigDict.get(v.getContig()); - if (chromIdx == null) - chromIdx = (int) MurmurHash3.murmurhash3(v.getContig(), 0); - - key.set((long) chromIdx << 32 | (long) (v.getStart() - 1)); - vc.set(v, header); - - return true; - } - } - - private boolean overlaps(VariantContext v) { - if (intervals == null) { - return true; - } - final Interval interval = new Interval(v.getContig(), v.getStart(), v.getEnd()); - return overlapDetector.overlapsAny(interval); - } +public class VCFRecordReader extends RecordReader { + + /** + * A String property corresponding to a ValidationStringency value. If set, the given stringency + * is used when any part of the Hadoop-BAM library reads VCF. + */ + public static final String VALIDATION_STRINGENCY_PROPERTY = + "hadoopbam.vcfrecordreader.validation-stringency"; + + private static final Logger logger = LoggerFactory.getLogger(VCFRecordReader.class); + private final LongWritable key = new LongWritable(); + private final VariantContextWritable vc = new VariantContextWritable(); + private final Map contigDict = new HashMap(); + private VCFCodec codec = new VCFCodec(); + private LineRecordReader lineRecordReader = new LineRecordReader(); + private VCFHeader header; + private List intervals; + private OverlapDetector overlapDetector; + private ValidationStringency stringency; + + static ValidationStringency getValidationStringency(final Configuration conf) { + final String p = conf.get(VALIDATION_STRINGENCY_PROPERTY); + return p == null ? 
ValidationStringency.STRICT : ValidationStringency.valueOf(p); + } + + public static void setValidationStringency( + final Configuration conf, final ValidationStringency stringency) { + conf.set(VALIDATION_STRINGENCY_PROPERTY, stringency.toString()); + } + + @Override + public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { + final FileSplit split = (FileSplit) spl; + + final Path file = split.getPath(); + final FileSystem fs = file.getFileSystem(ctx.getConfiguration()); + + final FSDataInputStream ins = fs.open(file); + + CompressionCodec compressionCodec = + new CompressionCodecFactory(ctx.getConfiguration()).getCodec(file); + AsciiLineReader reader; + if (compressionCodec == null) { + reader = new AsciiLineReader(ins); + } else { + Decompressor decompressor = CodecPool.getDecompressor(compressionCodec); + CompressionInputStream in = compressionCodec.createInputStream(ins, decompressor); + reader = new AsciiLineReader(in); + } + + AsciiLineReaderIterator it = new AsciiLineReaderIterator(reader); + + final FeatureCodecHeader h = codec.readHeader(it); + if (h == null || !(h.getHeaderValue() instanceof VCFHeader)) { + throw new IOException("No VCF header found in " + file); + } + + header = (VCFHeader) h.getHeaderValue(); + + contigDict.clear(); + int i = 0; + for (final VCFContigHeaderLine contig : header.getContigLines()) { + contigDict.put(contig.getID(), i++); + } + + lineRecordReader.initialize(spl, ctx); + + intervals = VCFInputFormat.getIntervals(ctx.getConfiguration()); + if (intervals != null) { + overlapDetector = OverlapDetector.create(intervals); + } + + stringency = VCFRecordReader.getValidationStringency(ctx.getConfiguration()); + } + + @Override + public void close() throws IOException { + lineRecordReader.close(); + } + + @Override + public float getProgress() throws IOException { + return lineRecordReader.getProgress(); + } + + @Override + public LongWritable getCurrentKey() { + return key; + } + + @Override + public VariantContextWritable getCurrentValue() { + return vc; + } + + @Override + public boolean nextKeyValue() throws IOException { + while (true) { + String line; + while (true) { + if (!lineRecordReader.nextKeyValue()) { + return false; + } + line = lineRecordReader.getCurrentValue().toString(); + if (!line.startsWith("#")) { + break; + } + } + + final VariantContext v; + try { + v = codec.decode(line); + } catch (TribbleException e) { + if (stringency == ValidationStringency.STRICT) { + if (logger.isErrorEnabled()) { + logger.error("Parsing line {} failed with {}.", line, e); + } + throw e; + } else { + if (stringency == ValidationStringency.LENIENT && logger.isWarnEnabled()) { + logger.warn("Parsing line {} failed with {}. 
Skipping...", line, e); + } + continue; + } + } + + if (!overlaps(v)) { + continue; + } + + Integer chromIdx = contigDict.get(v.getContig()); + if (chromIdx == null) { + chromIdx = (int) MurmurHash3.murmurhash3(v.getContig(), 0); + } + + key.set((long) chromIdx << 32 | (long) (v.getStart() - 1)); + vc.set(v, header); + + return true; + } + } + + private boolean overlaps(VariantContext v) { + if (intervals == null) { + return true; + } + final Interval interval = new Interval(v.getContig(), v.getStart(), v.getEnd()); + return overlapDetector.overlapsAny(interval); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/VCFRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/VCFRecordWriter.java index 0155096..0c12d43 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/VCFRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/VCFRecordWriter.java @@ -23,130 +23,114 @@ package org.seqdoop.hadoop_bam; import htsjdk.tribble.FeatureCodecHeader; +import htsjdk.tribble.readers.AsciiLineReader; +import htsjdk.tribble.readers.AsciiLineReaderIterator; +import htsjdk.variant.variantcontext.GenotypesContext; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; +import htsjdk.variant.vcf.VCFCodec; +import htsjdk.variant.vcf.VCFHeader; import java.io.FilterOutputStream; import java.io.IOException; import java.io.OutputStream; -import java.io.StringWriter; -import java.io.Writer; - import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; -import htsjdk.tribble.readers.AsciiLineReader; -import htsjdk.tribble.readers.AsciiLineReaderIterator; -import htsjdk.variant.vcf.VCFCodec; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.variantcontext.GenotypesContext; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -/** A base {@link RecordWriter} for VCF. +/** + * A base {@link RecordWriter} for VCF. * - *

<p>Handles the output stream, writing the header if requested, and provides
- * the {@link #writeRecord} function for subclasses.</p>
+ * <p>
Handles the output stream, writing the header if requested, and provides the {@link + * #writeRecord} function for subclasses. */ -public abstract class VCFRecordWriter - extends RecordWriter -{ - private VCFCodec codec = new VCFCodec(); - private VariantContextWriter writer; - - private LazyVCFGenotypesContext.HeaderDataCache vcfHeaderDataCache = - new LazyVCFGenotypesContext.HeaderDataCache(); - private LazyBCFGenotypesContext.HeaderDataCache bcfHeaderDataCache = - new LazyBCFGenotypesContext.HeaderDataCache(); - - /** A VCFHeader is read from the input Path. */ - public VCFRecordWriter( - Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) - throws IOException - { - final AsciiLineReader r = new AsciiLineReader( - input.getFileSystem(ctx.getConfiguration()).open(input)); - - final FeatureCodecHeader h = codec.readHeader(new AsciiLineReaderIterator(r)); - if (h == null || !(h.getHeaderValue() instanceof VCFHeader)) - throw new IOException("No VCF header found in "+ input); - - r.close(); - - init(output, (VCFHeader) h.getHeaderValue(), writeHeader, ctx); - } - public VCFRecordWriter( - Path output, VCFHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - init( - output.getFileSystem(ctx.getConfiguration()).create(output), - header, writeHeader, ctx); - } - public VCFRecordWriter( - OutputStream output, VCFHeader header, boolean writeHeader) - throws IOException - { - init(output, header, writeHeader, null); - } - - // Working around not being able to call a constructor other than as the - // first statement... - private void init( - Path output, VCFHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - init( - output.getFileSystem(ctx.getConfiguration()).create(output), - header, writeHeader, ctx); - } - private void init( - OutputStream output, VCFHeader header, boolean writeHeader, - TaskAttemptContext ctx) - throws IOException - { - final StoppableOutputStream stopOut = - new StoppableOutputStream(!writeHeader, output); - - writer = createVariantContextWriter(ctx == null ? null : ctx.getConfiguration(), - stopOut); - - writer.writeHeader(header); - stopOut.stopped = false; - - setInputHeader(header); - } - - protected VariantContextWriter createVariantContextWriter(Configuration conf, - OutputStream out) { - return new VariantContextWriterBuilder().clearOptions() - .setOutputStream(out).build(); - } - - @Override public void close(TaskAttemptContext ctx) throws IOException { - writer.close(); - } - - /** Used for lazy decoding of genotype data. Of course, each input record - * may have a different header, but we currently only support one header - * here... This is in part due to the fact that it's not clear what the best - * solution is. */ - public void setInputHeader(VCFHeader header) { - vcfHeaderDataCache.setHeader(header); - bcfHeaderDataCache.setHeader(header); - } - - protected void writeRecord(VariantContext vc) { - final GenotypesContext gc = vc.getGenotypes(); - if (gc instanceof LazyParsingGenotypesContext) - ((LazyParsingGenotypesContext)gc).getParser().setHeaderDataCache( - gc instanceof LazyVCFGenotypesContext ? 
vcfHeaderDataCache - : bcfHeaderDataCache); - - writer.add(vc); - } +public abstract class VCFRecordWriter extends RecordWriter { + + private VCFCodec codec = new VCFCodec(); + private VariantContextWriter writer; + + private LazyVCFGenotypesContext.HeaderDataCache vcfHeaderDataCache = + new LazyVCFGenotypesContext.HeaderDataCache(); + private LazyBCFGenotypesContext.HeaderDataCache bcfHeaderDataCache = + new LazyBCFGenotypesContext.HeaderDataCache(); + + /** A VCFHeader is read from the input Path. */ + public VCFRecordWriter(Path output, Path input, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + final AsciiLineReader r = + new AsciiLineReader(input.getFileSystem(ctx.getConfiguration()).open(input)); + + final FeatureCodecHeader h = codec.readHeader(new AsciiLineReaderIterator(r)); + if (h == null || !(h.getHeaderValue() instanceof VCFHeader)) { + throw new IOException("No VCF header found in " + input); + } + + r.close(); + + init(output, (VCFHeader) h.getHeaderValue(), writeHeader, ctx); + } + + public VCFRecordWriter(Path output, VCFHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + init(output.getFileSystem(ctx.getConfiguration()).create(output), header, writeHeader, ctx); + } + + public VCFRecordWriter(OutputStream output, VCFHeader header, boolean writeHeader) + throws IOException { + init(output, header, writeHeader, null); + } + + // Working around not being able to call a constructor other than as the + // first statement... + private void init(Path output, VCFHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + init(output.getFileSystem(ctx.getConfiguration()).create(output), header, writeHeader, ctx); + } + + private void init( + OutputStream output, VCFHeader header, boolean writeHeader, TaskAttemptContext ctx) + throws IOException { + final StoppableOutputStream stopOut = new StoppableOutputStream(!writeHeader, output); + + writer = createVariantContextWriter(ctx == null ? null : ctx.getConfiguration(), stopOut); + + writer.writeHeader(header); + stopOut.stopped = false; + + setInputHeader(header); + } + + protected VariantContextWriter createVariantContextWriter(Configuration conf, OutputStream out) { + return new VariantContextWriterBuilder().clearOptions().setOutputStream(out).build(); + } + + @Override + public void close(TaskAttemptContext ctx) throws IOException { + writer.close(); + } + + /** + * Used for lazy decoding of genotype data. Of course, each input record may have a different + * header, but we currently only support one header here... This is in part due to the fact that + * it's not clear what the best solution is. + */ + public void setInputHeader(VCFHeader header) { + vcfHeaderDataCache.setHeader(header); + bcfHeaderDataCache.setHeader(header); + } + + protected void writeRecord(VariantContext vc) { + final GenotypesContext gc = vc.getGenotypes(); + if (gc instanceof LazyParsingGenotypesContext) { + ((LazyParsingGenotypesContext) gc) + .getParser() + .setHeaderDataCache( + gc instanceof LazyVCFGenotypesContext ? vcfHeaderDataCache : bcfHeaderDataCache); + } + + writer.add(vc); + } } // We must always call writer.writeHeader() because the writer requires @@ -158,20 +142,32 @@ protected void writeRecord(VariantContext vc) { // does any buffering of its own and doesn't flush after writing the // header, this isn't as easy as this. 
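// A minimal usage sketch (hypothetical helper, mirroring init() above) of the pattern that
// StoppableOutputStream makes possible: the writer insists on writeHeader() being called, so the
// header is written while the stream is "stopped" and only record bytes reach the real output.
//
//   static VariantContextWriter headerSuppressingWriter(OutputStream raw, VCFHeader header) {
//     final StoppableOutputStream out = new StoppableOutputStream(true, raw); // discard header
//     final VariantContextWriter w =
//         new VariantContextWriterBuilder().clearOptions().setOutputStream(out).build();
//     w.writeHeader(header); // swallowed while out.stopped == true
//     out.stopped = false;   // subsequent records written through w reach 'raw'
//     return w;
//   }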
final class StoppableOutputStream extends FilterOutputStream { - public boolean stopped; - - public StoppableOutputStream(boolean startStopped, OutputStream out) { - super(out); - stopped = startStopped; - } - - @Override public void write(int b) throws IOException { - if (!stopped) super.write(b); - } - @Override public void write(byte[] b) throws IOException { - if (!stopped) super.write(b); - } - @Override public void write(byte[] b, int off, int len) throws IOException { - if (!stopped) super.write(b, off, len); - } + + public boolean stopped; + + public StoppableOutputStream(boolean startStopped, OutputStream out) { + super(out); + stopped = startStopped; + } + + @Override + public void write(int b) throws IOException { + if (!stopped) { + super.write(b); + } + } + + @Override + public void write(byte[] b) throws IOException { + if (!stopped) { + super.write(b); + } + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + if (!stopped) { + super.write(b, off, len); + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/VariantContextCodec.java b/src/main/java/org/seqdoop/hadoop_bam/VariantContextCodec.java index 5e71315..e1299ed 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/VariantContextCodec.java +++ b/src/main/java/org/seqdoop/hadoop_bam/VariantContextCodec.java @@ -22,315 +22,342 @@ package org.seqdoop.hadoop_bam; -import java.io.DataOutput; +import htsjdk.tribble.util.ParsingUtils; +import htsjdk.variant.bcf2.BCF2Codec; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFEncoder; +import htsjdk.variant.vcf.VCFHeader; import java.io.DataInput; +import java.io.DataOutput; import java.io.IOException; +import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; -import java.lang.reflect.Array; - -import htsjdk.tribble.util.ParsingUtils; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.bcf2.BCF2Codec; -import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.vcf.VCFEncoder; -import htsjdk.variant.vcf.VCFHeader; // See the comment in VariantContextWritable explaining what this is used for. public final class VariantContextCodec { - public static void write(final DataOutput out, final VariantContext vc) - throws IOException - { - Object genotypesData; - int numGenotypes; - if (vc.getGenotypes().isLazyWithData()) { - final htsjdk.variant.variantcontext.LazyGenotypesContext gc = - (htsjdk.variant.variantcontext.LazyGenotypesContext) - vc.getGenotypes(); - - genotypesData = gc.getUnparsedGenotypeData(); - numGenotypes = gc.size(); - } - else if (vc instanceof VariantContextWithHeader) { - final VCFHeader header = ((VariantContextWithHeader)vc).getHeader(); - - if (header == null) { - throw new IllegalStateException( "Header not set inside VariantContextWithHeader" ); - } - - final List genotypeAttributeKeys = vc.calcVCFGenotypeKeys(header); - final StringBuilder builder = new StringBuilder(); - if ( ! genotypeAttributeKeys.isEmpty()) { - // TODO: the VCFEncoder equivalent of this code checks for missing header fields here. do we care? 
- - final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys); - - builder.append(VCFConstants.FIELD_SEPARATOR); - builder.append(genotypeFormatString); - - final VCFEncoder encoder = new VCFEncoder(header, true, false); - final Map alleleStrings = encoder.buildAlleleStrings(vc); - encoder.addGenotypeData(vc, alleleStrings, genotypeAttributeKeys, builder); - } - genotypesData = builder.toString(); - numGenotypes = vc.getGenotypes().size(); + public static void write(final DataOutput out, final VariantContext vc) throws IOException { + Object genotypesData; + int numGenotypes; + if (vc.getGenotypes().isLazyWithData()) { + final htsjdk.variant.variantcontext.LazyGenotypesContext gc = + (htsjdk.variant.variantcontext.LazyGenotypesContext) vc.getGenotypes(); + + genotypesData = gc.getUnparsedGenotypeData(); + numGenotypes = gc.size(); + } else if (vc instanceof VariantContextWithHeader) { + + final VCFHeader header = ((VariantContextWithHeader) vc).getHeader(); + + if (header == null) { + throw new IllegalStateException("Header not set inside VariantContextWithHeader"); + } + + final List genotypeAttributeKeys = vc.calcVCFGenotypeKeys(header); + final StringBuilder builder = new StringBuilder(); + if (!genotypeAttributeKeys.isEmpty()) { + // TODO: the VCFEncoder equivalent of this code checks for missing header fields here. do + // we care? + + final String genotypeFormatString = + ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys); + + builder.append(VCFConstants.FIELD_SEPARATOR); + builder.append(genotypeFormatString); + + final VCFEncoder encoder = new VCFEncoder(header, true, false); + final Map alleleStrings = encoder.buildAlleleStrings(vc); + encoder.addGenotypeData(vc, alleleStrings, genotypeAttributeKeys, builder); + } + genotypesData = builder.toString(); + numGenotypes = vc.getGenotypes().size(); + } else { + throw new IllegalStateException( + "Cannot write fully decoded VariantContext: need lazy genotypes or VCF Header"); + } + + if (!(genotypesData instanceof String || genotypesData instanceof BCF2Codec.LazyData)) { + throw new IllegalStateException( + "Unrecognized unparsed genotype data, expected String or " + + "BCF2Codec.LazyData: " + + genotypesData.getClass()); + } + + final byte[] chrom = vc.getContig().getBytes("UTF-8"); + out.writeInt(chrom.length); + out.write(chrom); + + out.writeInt(vc.getStart()); + out.writeInt(vc.getEnd()); + + final byte[] id = vc.getID().getBytes("UTF-8"); + out.writeInt(id.length); + out.write(id); + + final List alleles = vc.getAlleles(); + out.writeInt(alleles.size()); + for (final Allele allele : alleles) { + final byte[] b = allele.getDisplayBases(); + out.writeInt(b.length); + out.write(b); + } + + if (vc.hasLog10PError()) { + out.writeFloat((float) vc.getLog10PError()); + } else { + // The "missing value" used in BCF2, a signaling NaN. + out.writeInt(0x7f800001); + } + + if (vc.isFiltered()) { + final Set filters = vc.getFilters(); + out.writeInt(filters.size()); + for (final String s : filters) { + final byte[] b = s.getBytes("UTF-8"); + out.writeInt(b.length); + out.write(b); + } + } else { + out.writeInt(vc.filtersWereApplied() ? 
-1 : -2); + } + + final Map attrs = vc.getAttributes(); + out.writeInt(attrs.size()); + for (final Map.Entry ent : attrs.entrySet()) { + final byte[] k = ent.getKey().getBytes("UTF-8"); + out.writeInt(k.length); + out.write(k); + + encodeAttrVal(out, ent.getValue()); + } + + out.writeInt(numGenotypes); + + if (genotypesData instanceof String) { + out.writeByte(0); + final byte[] genob = ((String) genotypesData).getBytes("UTF-8"); + out.writeInt(genob.length); + out.write(genob); + } else { + assert genotypesData instanceof BCF2Codec.LazyData; + final BCF2Codec.LazyData data = (BCF2Codec.LazyData) genotypesData; + out.writeByte(1); + out.writeInt(data.bytes.length); + out.write(data.bytes); + out.writeInt(data.nGenotypeFields); + } + } + + public static VariantContext read(final DataInput in) throws IOException { + final VariantContextBuilder builder = new VariantContextBuilder(); + + int count, len; + byte[] b; + + len = in.readInt(); + b = new byte[len]; + in.readFully(b); + final String chrom = new String(b, "UTF-8"); + builder.chr(chrom); + + final int start = in.readInt(); + builder.start(start); + builder.stop(in.readInt()); + + len = in.readInt(); + if (len == 0) { + builder.noID(); + } else { + if (len > b.length) { + b = new byte[len]; + } + in.readFully(b, 0, len); + builder.id(new String(b, 0, len, "UTF-8")); + } + + count = in.readInt(); + final List alleles = new ArrayList(count); + for (int i = 0; i < count; ++i) { + len = in.readInt(); + if (len > b.length) { + b = new byte[len]; + } + in.readFully(b, 0, len); + alleles.add(Allele.create(Arrays.copyOf(b, len), i == 0)); + } + builder.alleles(alleles); + + final int qualInt = in.readInt(); + builder.log10PError( + qualInt == 0x7f800001 ? VariantContext.NO_LOG10_PERROR : Float.intBitsToFloat(qualInt)); + + count = in.readInt(); + switch (count) { + case -2: + builder.unfiltered(); + break; + case -1: + builder.passFilters(); + break; + default: + while (count-- > 0) { + len = in.readInt(); + if (len > b.length) { + b = new byte[len]; + } + in.readFully(b, 0, len); + builder.filter(new String(b, 0, len, "UTF-8")); } - else { - throw new IllegalStateException( "Cannot write fully decoded VariantContext: need lazy genotypes or VCF Header" ); + break; + } + + count = in.readInt(); + final Map attrs = new HashMap(count, 1); + while (count-- > 0) { + len = in.readInt(); + if (len > b.length) { + b = new byte[len]; + } + in.readFully(b, 0, len); + attrs.put(new String(b, 0, len, "UTF-8"), decodeAttrVal(in)); + } + builder.attributes(attrs); + + count = in.readInt(); + final byte genoType = in.readByte(); + len = in.readInt(); + + // Resize b even if it's already big enough, minimizing the amount of + // memory LazyGenotypesContext hangs on to. 
+ b = new byte[len]; + in.readFully(b); + + switch (genoType) { + case 0: + builder.genotypesNoValidation(new LazyVCFGenotypesContext(alleles, chrom, start, b, count)); + break; + + case 1: + builder.genotypesNoValidation(new LazyBCFGenotypesContext(alleles, in.readInt(), b, count)); + break; + + default: + throw new IOException("Invalid genotypes type identifier: cannot decode"); + } + + return builder.make(); + } + + private static void encodeAttrVal(final DataOutput out, final Object v) throws IOException { + if (v instanceof Integer) { + out.writeByte(AttrType.INT.toByte()); + out.writeInt((Integer) v); + } else if (v instanceof Float) { + out.writeByte(AttrType.FLOAT.toByte()); + out.writeFloat((Float) v); + } else if (v instanceof Double) { + out.writeByte(AttrType.DOUBLE.toByte()); + out.writeDouble((Double) v); + } else if (v instanceof Boolean) { + out.writeByte(AttrType.BOOL.toByte()); + out.writeBoolean((Boolean) v); + } else if (v instanceof Character) { + out.writeByte(AttrType.CHAR.toByte()); + out.writeChar((Character) v); + + } else if (v instanceof List) { + encodeAttrVal(out, ((List) v).toArray()); + + } else if (v != null && v.getClass().isArray()) { + out.writeByte(AttrType.ARRAY.toByte()); + final int length = Array.getLength(v); + out.writeInt(length); + for (int i = 0; i < length; ++i) { + encodeAttrVal(out, Array.get(v, i)); + } + + } else { + out.writeByte(AttrType.STRING.toByte()); + if (v == null) { + out.writeInt(0); + } else { + final byte[] b = v.toString().getBytes("UTF-8"); + out.writeInt(b.length); + out.write(b); + } + } + } + + private static Object decodeAttrVal(final DataInput in) throws IOException { + switch (AttrType.fromByte(in.readByte())) { + case INT: + return in.readInt(); + case FLOAT: + return in.readFloat(); + case DOUBLE: + return in.readDouble(); + case BOOL: + return in.readBoolean(); + case CHAR: + return in.readChar(); + case ARRAY: + { + // VariantContext.fullyDecodeAttributes() checks for "instanceof + // List" so we have to return a List, not an array, here. + int len = in.readInt(); + final List os = new ArrayList(len); + while (len-- > 0) { + os.add(decodeAttrVal(in)); + } + return os; } - - if (!(genotypesData instanceof String || genotypesData instanceof BCF2Codec.LazyData)) - throw new IllegalStateException( - "Unrecognized unparsed genotype data, expected String or "+ - "BCF2Codec.LazyData: "+ genotypesData.getClass()); - - final byte[] chrom = vc.getContig().getBytes("UTF-8"); - out.writeInt(chrom.length); - out.write (chrom); - - out.writeInt(vc.getStart()); - out.writeInt(vc.getEnd()); - - final byte[] id = vc.getID().getBytes("UTF-8"); - out.writeInt(id.length); - out.write (id); - - final List alleles = vc.getAlleles(); - out.writeInt(alleles.size()); - for (final Allele allele : alleles) { - final byte[] b = allele.getDisplayBases(); - out.writeInt(b.length); - out.write (b); - } - - if (vc.hasLog10PError()) - out.writeFloat((float)vc.getLog10PError()); - else { - // The "missing value" used in BCF2, a signaling NaN. - out.writeInt(0x7f800001); - } - - if (vc.isFiltered()) { - final Set filters = vc.getFilters(); - out.writeInt(filters.size()); - for (final String s : filters) { - final byte[] b = s.getBytes("UTF-8"); - out.writeInt(b.length); - out.write (b); - } - } else - out.writeInt(vc.filtersWereApplied() ? 
-1 : -2); - - final Map attrs = vc.getAttributes(); - out.writeInt(attrs.size()); - for (final Map.Entry ent : attrs.entrySet()) { - final byte[] k = ent.getKey().getBytes("UTF-8"); - out.writeInt(k.length); - out.write (k); - - encodeAttrVal(out, ent.getValue()); - } - - out.writeInt(numGenotypes); - - if (genotypesData instanceof String) { - out.writeByte(0); - final byte[] genob = ((String)genotypesData).getBytes("UTF-8"); - out.writeInt(genob.length); - out.write (genob); - } else { - assert genotypesData instanceof BCF2Codec.LazyData; - final BCF2Codec.LazyData data = (BCF2Codec.LazyData)genotypesData; - out.writeByte(1); - out.writeInt(data.bytes.length); - out.write (data.bytes); - out.writeInt(data.nGenotypeFields); - } - } - - public static VariantContext read(final DataInput in) throws IOException { - final VariantContextBuilder builder = new VariantContextBuilder(); - - int count, len; - byte[] b; - - len = in.readInt(); - b = new byte[len]; - in.readFully(b); - final String chrom = new String(b, "UTF-8"); - builder.chr(chrom); - - final int start = in.readInt(); - builder.start(start); - builder.stop (in.readInt()); - - len = in.readInt(); - if (len == 0) - builder.noID(); - else { - if (len > b.length) b = new byte[len]; - in.readFully(b, 0, len); - builder.id(new String(b, 0, len, "UTF-8")); - } - - count = in.readInt(); - final List alleles = new ArrayList(count); - for (int i = 0; i < count; ++i) { - len = in.readInt(); - if (len > b.length) b = new byte[len]; - in.readFully(b, 0, len); - alleles.add(Allele.create(Arrays.copyOf(b, len), i == 0)); - } - builder.alleles(alleles); - - final int qualInt = in.readInt(); - builder.log10PError( - qualInt == 0x7f800001 - ? VariantContext.NO_LOG10_PERROR - : Float.intBitsToFloat(qualInt)); - - count = in.readInt(); - switch (count) { - case -2: builder.unfiltered(); break; - case -1: builder.passFilters(); break; - default: - while (count-- > 0) { - len = in.readInt(); - if (len > b.length) b = new byte[len]; - in.readFully(b, 0, len); - builder.filter(new String(b, 0, len, "UTF-8")); - } - break; - } - - count = in.readInt(); - final Map attrs = new HashMap(count, 1); - while (count-- > 0) { - len = in.readInt(); - if (len > b.length) b = new byte[len]; - in.readFully(b, 0, len); - attrs.put(new String(b, 0, len, "UTF-8"), decodeAttrVal(in)); - } - builder.attributes(attrs); - - count = in.readInt(); - final byte genoType = in.readByte(); - len = in.readInt(); - - // Resize b even if it's already big enough, minimizing the amount of - // memory LazyGenotypesContext hangs on to. - b = new byte[len]; - in.readFully(b); - - switch (genoType) { - case 0: - builder.genotypesNoValidation( - new LazyVCFGenotypesContext(alleles, chrom, start, b, count)); - break; - - case 1: - builder.genotypesNoValidation( - new LazyBCFGenotypesContext(alleles, in.readInt(), b, count)); - break; - - default: - throw new IOException( - "Invalid genotypes type identifier: cannot decode"); - } - - return builder.make(); - } - - // The VCF 4.1 spec says: "Integer, Float, Flag, Character, and String". But - // there can be many, so we also have ARRAY. - // - // In addition, VariantContext seems to represent some/all floats as doubles - // at least when reading from BCF, and at least BCF2FieldEncoder assumes - // them to be of class Double so we have to preserve doubles and thus must - // have DOUBLE. 
- private enum AttrType { - INT, FLOAT, BOOL, CHAR, STRING, ARRAY, DOUBLE; - - public byte toByte() { return (byte)ordinal(); } - - private static final AttrType[] values = values(); - public static AttrType fromByte(byte b) { return values[b]; } - } - - private static void encodeAttrVal(final DataOutput out, final Object v) - throws IOException - { - if (v instanceof Integer) { - out.writeByte(AttrType.INT.toByte()); - out.writeInt ((Integer)v); - } else if (v instanceof Float) { - out.writeByte (AttrType.FLOAT.toByte()); - out.writeFloat((Float)v); - } else if (v instanceof Double) { - out.writeByte (AttrType.DOUBLE.toByte()); - out.writeDouble((Double)v); - } else if (v instanceof Boolean) { - out.writeByte (AttrType.BOOL.toByte()); - out.writeBoolean((Boolean)v); - } else if (v instanceof Character) { - out.writeByte(AttrType.CHAR.toByte()); - out.writeChar((Character)v); - - } else if (v instanceof List) { - encodeAttrVal(out, ((List)v).toArray()); - - } else if (v != null && v.getClass().isArray()) { - out.writeByte(AttrType.ARRAY.toByte()); - final int length = Array.getLength(v); - out.writeInt(length); - for (int i = 0; i < length; ++i) - encodeAttrVal(out, Array.get(v, i)); - - } else { - out.writeByte(AttrType.STRING.toByte()); - if (v == null) - out.writeInt(0); - else { - final byte[] b = v.toString().getBytes("UTF-8"); - out.writeInt(b.length); - out.write (b); - } - } - } - - private static Object decodeAttrVal(final DataInput in) throws IOException { - switch (AttrType.fromByte(in.readByte())) { - case INT: return in.readInt(); - case FLOAT: return in.readFloat(); - case DOUBLE: return in.readDouble(); - case BOOL: return in.readBoolean(); - case CHAR: return in.readChar(); - case ARRAY: { - // VariantContext.fullyDecodeAttributes() checks for "instanceof - // List" so we have to return a List, not an array, here. - int len = in.readInt(); - final List os = new ArrayList(len); - while (len-- > 0) - os.add(decodeAttrVal(in)); - return os; - } - case STRING: { - final int len = in.readInt(); - if (len == 0) - return null; - final byte[] b = new byte[len]; - in.readFully(b); - return new String(b, "UTF-8"); - } - } - assert (false); - throw new IOException("Invalid type identifier: cannot decode"); - } + case STRING: + { + final int len = in.readInt(); + if (len == 0) { + return null; + } + final byte[] b = new byte[len]; + in.readFully(b); + return new String(b, "UTF-8"); + } + } + assert (false); + throw new IOException("Invalid type identifier: cannot decode"); + } + + // The VCF 4.1 spec says: "Integer, Float, Flag, Character, and String". But + // there can be many, so we also have ARRAY. + // + // In addition, VariantContext seems to represent some/all floats as doubles + // at least when reading from BCF, and at least BCF2FieldEncoder assumes + // them to be of class Double so we have to preserve doubles and thus must + // have DOUBLE. 
+ private enum AttrType { + INT, + FLOAT, + BOOL, + CHAR, + STRING, + ARRAY, + DOUBLE; + + private static final AttrType[] values = values(); + + public static AttrType fromByte(byte b) { + return values[b]; + } + + public byte toByte() { + return (byte) ordinal(); + } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/VariantContextWithHeader.java b/src/main/java/org/seqdoop/hadoop_bam/VariantContextWithHeader.java index 461585b..f67a143 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/VariantContextWithHeader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/VariantContextWithHeader.java @@ -24,14 +24,15 @@ import htsjdk.variant.vcf.VCFHeader; public class VariantContextWithHeader extends VariantContext { - private final VCFHeader header; - public VariantContextWithHeader(VariantContext context, VCFHeader header) { - super(context); - this.header = header; - } + private final VCFHeader header; - public VCFHeader getHeader() { - return header; - } + public VariantContextWithHeader(VariantContext context, VCFHeader header) { + super(context); + this.header = header; + } + + public VCFHeader getHeader() { + return header; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/VariantContextWritable.java b/src/main/java/org/seqdoop/hadoop_bam/VariantContextWritable.java index 02c921d..e48ebf6 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/VariantContextWritable.java +++ b/src/main/java/org/seqdoop/hadoop_bam/VariantContextWritable.java @@ -22,39 +22,50 @@ package org.seqdoop.hadoop_bam; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFHeader; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; - import org.apache.hadoop.io.Writable; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFHeader; -/** VariantContexts read here have LazyGenotypesContexts, which need to have a - * header set before the genotype data in the VariantContexts can be decoded. - * See the LazyGenotypesContext class. +/** + * VariantContexts read here have LazyGenotypesContexts, which need to have a header set before the + * genotype data in the VariantContexts can be decoded. See the LazyGenotypesContext class. */ public class VariantContextWritable implements Writable { - private VariantContext vc; - - public VariantContext get() { return vc; } - public void set(VariantContext vc) { this.vc = vc; } - public void set(VariantContext vc, VCFHeader header) { this.vc = new VariantContextWithHeader(vc, header); } - - // XXX: Unfortunately there's no simple way to just pass a BCF record - // through. Contrasting to BAM, there's no equivalent of the BAMRecord - // subclass of SAMRecord that saves the original BAM fields --- a - // VariantContext only saves the decoded info, so it's impossible to encode - // one to BCF without the header. - // - // VCF is also unusable because VCFWriter defensively refuses to write - // anything without a header, throwing IllegalStateException if attempted. - // - // Thus, we have a custom encoding. 
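  // A rough round-trip sketch (hypothetical helper and in-memory streams, not part of this
  // change) of the custom encoding referred to above: VariantContextCodec lets a record plus its
  // header survive Writable serialization even though the serialized bytes carry no VCF header.
  static VariantContext roundTripExample(VariantContext vc, VCFHeader header) throws IOException {
    final java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
    final VariantContextWritable written = new VariantContextWritable();
    written.set(vc, header); // the header is kept so lazy genotypes can be decoded later
    written.write(new java.io.DataOutputStream(buffer));

    final VariantContextWritable restored = new VariantContextWritable();
    restored.readFields(
        new java.io.DataInputStream(new java.io.ByteArrayInputStream(buffer.toByteArray())));
    return restored.get();
  }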
- @Override public void write(final DataOutput out) throws IOException { - VariantContextCodec.write(out, vc); - } - @Override public void readFields(final DataInput in) throws IOException { - vc = VariantContextCodec.read(in); - } + + private VariantContext vc; + + public VariantContext get() { + return vc; + } + + public void set(VariantContext vc) { + this.vc = vc; + } + + public void set(VariantContext vc, VCFHeader header) { + this.vc = new VariantContextWithHeader(vc, header); + } + + // XXX: Unfortunately there's no simple way to just pass a BCF record + // through. Contrasting to BAM, there's no equivalent of the BAMRecord + // subclass of SAMRecord that saves the original BAM fields --- a + // VariantContext only saves the decoded info, so it's impossible to encode + // one to BCF without the header. + // + // VCF is also unusable because VCFWriter defensively refuses to write + // anything without a header, throwing IllegalStateException if attempted. + // + // Thus, we have a custom encoding. + @Override + public void write(final DataOutput out) throws IOException { + VariantContextCodec.write(out, vc); + } + + @Override + public void readFields(final DataInput in) throws IOException { + vc = VariantContextCodec.read(in); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndex.java b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndex.java index 164965a..650dae7 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndex.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndex.java @@ -25,97 +25,109 @@ import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; import java.nio.ByteBuffer; import java.util.NavigableSet; import java.util.TreeSet; -/** An index into BGZF-compressed files, for {@link BGZFSplitFileInputFormat}. - * Reads files that are created by {@link BGZFBlockIndexer}. +/** + * An index into BGZF-compressed files, for {@link BGZFSplitFileInputFormat}. Reads files that are + * created by {@link BGZFBlockIndexer}. * - *

<p>Indexes the positions of individual gzip blocks in the file.</p>
+ * <p>
Indexes the positions of individual gzip blocks in the file. */ public final class BGZFBlockIndex { - private final NavigableSet offsets = new TreeSet(); - - public BGZFBlockIndex() {} - public BGZFBlockIndex(final File path) throws IOException { - this(new BufferedInputStream(new FileInputStream(path))); - } - public BGZFBlockIndex(final InputStream in) throws IOException { - readIndex(in); - } - - public void readIndex(final InputStream in) throws IOException { - offsets.clear(); - - final ByteBuffer bb = ByteBuffer.allocate(8); - - for (long prev = -1; in.read(bb.array(), 2, 6) == 6;) { - final long cur = bb.getLong(0); - if (prev > cur) - throw new IOException(String.format( - "Invalid BGZF block index; offsets not in order: %#x > %#x", - prev, cur)); - - offsets.add(prev = cur); - } - in.close(); - - if (offsets.size() < 1) - throw new IOException( - "Invalid BGZF block index: should contain at least the file size"); - - offsets.add(0L); - } - - public Long prevBlock(final long filePos) { - return offsets.floor(filePos); - } - public Long nextBlock(final long filePos) { - return offsets.higher(filePos); - } - - public int size() { return offsets.size(); } - - private long secondBlock() { return nextBlock(0); } - private long lastBlock() { return prevBlock(fileSize() - 1); } - private long fileSize() { return offsets.last(); } - - /** Writes some statistics about each BGZF block index file given as an - * argument. - */ - public static void main(String[] args) { - if (args.length == 0) { - System.out.println( - "Usage: BGZFBlockIndex [BGZF block indices...]\n\n"+ - - "Writes a few statistics about each BGZF block index."); - return; - } - - for (String arg : args) { - final File f = new File(arg); - if (f.isFile() && f.canRead()) { - try { - System.err.printf("%s:\n", f); - final BGZFBlockIndex bi = new BGZFBlockIndex(f); - final long second = bi.secondBlock(); - final long last = bi.lastBlock(); - System.err.printf( - "\t%d blocks\n" + - "\tfirst after 0 is at %#014x\n" + - "\tlast is at %#014x\n" + - "\tassociated BGZF file size %d\n", - bi.size()-1, - bi.secondBlock(), bi.lastBlock(), bi.fileSize()); - } catch (IOException e) { - System.err.printf("Failed to read %s!\n", f); - e.printStackTrace(); - } - } else - System.err.printf("%s does not look like a readable file!\n", f); - } - } + + private final NavigableSet offsets = new TreeSet(); + + public BGZFBlockIndex() {} + + public BGZFBlockIndex(final File path) throws IOException { + this(new BufferedInputStream(new FileInputStream(path))); + } + + public BGZFBlockIndex(final InputStream in) throws IOException { + readIndex(in); + } + + /** Writes some statistics about each BGZF block index file given as an argument. 
*/ + public static void main(String[] args) { + if (args.length == 0) { + System.out.println( + "Usage: BGZFBlockIndex [BGZF block indices...]\n\n" + + "Writes a few statistics about each BGZF block index."); + return; + } + + for (String arg : args) { + final File f = new File(arg); + if (f.isFile() && f.canRead()) { + try { + System.err.printf("%s:\n", f); + final BGZFBlockIndex bi = new BGZFBlockIndex(f); + final long second = bi.secondBlock(); + final long last = bi.lastBlock(); + System.err.printf( + "\t%d blocks\n" + + "\tfirst after 0 is at %#014x\n" + + "\tlast is at %#014x\n" + + "\tassociated BGZF file size %d\n", + bi.size() - 1, bi.secondBlock(), bi.lastBlock(), bi.fileSize()); + } catch (IOException e) { + System.err.printf("Failed to read %s!\n", f); + e.printStackTrace(); + } + } else { + System.err.printf("%s does not look like a readable file!\n", f); + } + } + } + + public void readIndex(final InputStream in) throws IOException { + offsets.clear(); + + final ByteBuffer bb = ByteBuffer.allocate(8); + + for (long prev = -1; in.read(bb.array(), 2, 6) == 6; ) { + final long cur = bb.getLong(0); + if (prev > cur) { + throw new IOException( + String.format("Invalid BGZF block index; offsets not in order: %#x > %#x", prev, cur)); + } + + offsets.add(prev = cur); + } + in.close(); + + if (offsets.size() < 1) { + throw new IOException("Invalid BGZF block index: should contain at least the file size"); + } + + offsets.add(0L); + } + + public Long prevBlock(final long filePos) { + return offsets.floor(filePos); + } + + public Long nextBlock(final long filePos) { + return offsets.higher(filePos); + } + + public int size() { + return offsets.size(); + } + + private long secondBlock() { + return nextBlock(0); + } + + private long lastBlock() { + return prevBlock(fileSize() - 1); + } + + private long fileSize() { + return offsets.last(); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndexer.java b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndexer.java index b7fe928..e59fb7b 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndexer.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFBlockIndexer.java @@ -26,200 +26,199 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.LongBuffer; import java.util.Arrays; -/** An indexing tool for BGZF-compressed files, making them palatable to {@link - * BGZFSplitFileInputFormat}. Writes BGZF block indices as understood by {@link - * BGZFBlockIndex}. +/** + * An indexing tool for BGZF-compressed files, making them palatable to {@link + * BGZFSplitFileInputFormat}. Writes BGZF block indices as understood by {@link BGZFBlockIndex}. */ public final class BGZFBlockIndexer { - public static void main(String[] args) { - if (args.length <= 0) { - System.out.println( - "Usage: BGZFBlockIndexer GRANULARITY [BGZF files...]\n\n"+ - - "Writes, for each GRANULARITY gzip blocks in a BGZF file, its "+ - "virtual file offset\nas a big-endian 48-bit integer into "+ - "[filename].bgzfi. 
The file is terminated by\nthe BGZF file's "+ - "length, in the same format."); - return; - } - - int granularity; - try { - granularity = Integer.parseInt(args[0]); - } catch (NumberFormatException e) { - granularity = 0; - } - if (granularity <= 0) { - System.err.printf( - "Granularity must be a positive integer, not '%s'!\n", args[0]); - return; - } - - final BGZFBlockIndexer indexer = new BGZFBlockIndexer(granularity); - - for (final String arg : Arrays.asList(args).subList(1, args.length)) { - final File f = new File(arg); - if (f.isFile() && f.canRead()) { - System.out.printf("Indexing %s...", f); - try { - indexer.index(f); - System.out.println(" done."); - } catch (IOException e) { - System.out.println(" FAILED!"); - e.printStackTrace(); - } - } else - System.err.printf( - "%s does not look like a file, won't index!\n", f); - } - } - - private final ByteBuffer byteBuffer; - private final int granularity; - - private int pos = 0; - - private static final int PRINT_EVERY = 500*1024*1024; - - public BGZFBlockIndexer(int g) { - granularity = g; - byteBuffer = ByteBuffer.allocate(8); // Enough to fit a long - } - - private void index(final File file) throws IOException { - final InputStream in = new FileInputStream(file); - - final OutputStream out = new BufferedOutputStream( - new FileOutputStream(file.getPath() + ".bgzfi")); - - final LongBuffer lb = - byteBuffer.order(ByteOrder.BIG_ENDIAN).asLongBuffer(); - - long prevPrint = 0; - pos = 0; - - for (int i = 0;;) { - if (!skipBlock(in)) - break; - - if (++i == granularity) { - i = 0; - lb.put(0, pos); - out.write(byteBuffer.array(), 2, 6); - - if (pos - prevPrint >= PRINT_EVERY) { - System.out.print("-"); - prevPrint = pos; - } - } - } - lb.put(0, file.length()); - out.write(byteBuffer.array(), 2, 6); - out.close(); - in.close(); - } - - private boolean skipBlock(final InputStream in) throws IOException { - - // Check magic number - final int read = readBytes(in, 4); - if (read != 4) { - if (read == 0) - return false; - ioError("Invalid gzip header: too short, no ID/CM/FLG"); - } - - final int magic = byteBuffer.order(ByteOrder.BIG_ENDIAN).getInt(0); - if (magic != 0x1f8b0804) - ioError( - "Invalid gzip header: bad ID/CM/FLG %#x != 0x1f8b0804", magic); - - // Skip to extra-length - if (!readExactlyBytes(in, 8)) - ioError("Invalid gzip header: too short, no XLEN"); - - byteBuffer.order(ByteOrder.LITTLE_ENDIAN); - - final int xlen = getUshort(6); - - // Skip over each subfield until finding the one we care about - for (int offset = 0; offset < xlen;) { - if (!readExactlyBytes(in, 4)) - ioError("Invalid subfields: EOF after %d subfield bytes", offset); - offset += 4; - - byteBuffer.order(ByteOrder.BIG_ENDIAN); - final int siAndSlen = byteBuffer.getInt(0); - byteBuffer.order(ByteOrder.LITTLE_ENDIAN); - - if ((siAndSlen & ~0xff) == 0x42430200) { - if (!readExactlyBytes(in, 2)) - ioError("Invalid BGZF subfield: missing BSIZE"); - offset += 2; - - final int bsize = getUshort(0); - - // Skip over: rest of header + compressed data + rest of gzip block - fullySkip(in, (xlen - offset) + (bsize - xlen - 19) + 8); - pos += bsize + 1; - return true; - } else { - final int slen = getUshort(2); - fullySkip(in, slen); - offset += slen; - } - } - throw new IOException("Invalid BGZF file: block without BGZF subfield"); - } - - private int getUshort(final int idx) { - return (int)byteBuffer.getShort(idx) & 0xffff; - } - - private void fullySkip(final InputStream in, final int skip) - throws IOException - { - // Skip repeatedly until we're either done 
skipping or can't skip any - // more, in case some kind of IO error is temporarily preventing it. That - // kind of situation might not necessarily be possible; the docs are - // rather vague about the whole thing. - for (int s = skip; s > 0;) { - final long skipped = in.skip(s); - if (skipped == 0) - throw new IOException("Skip failed"); - s -= skipped; - } - } - - private int readBytes(final InputStream in, final int n) - throws IOException - { - assert n <= byteBuffer.capacity(); - - int read = 0; - while (read < n) { - final int readNow = in.read(byteBuffer.array(), read, n - read); - if (readNow <= 0) - break; - read += readNow; - } - return read; - } - private boolean readExactlyBytes(final InputStream in, final int n) - throws IOException - { - return readBytes(in, n) == n; - } - - private void ioError(String s, Object... va) throws IOException { - throw new IOException(String.format(s, va)); - } + + private static final int PRINT_EVERY = 500 * 1024 * 1024; + private final ByteBuffer byteBuffer; + private final int granularity; + + private int pos = 0; + + public BGZFBlockIndexer(int g) { + granularity = g; + byteBuffer = ByteBuffer.allocate(8); // Enough to fit a long + } + + public static void main(String[] args) { + if (args.length <= 0) { + System.out.println( + "Usage: BGZFBlockIndexer GRANULARITY [BGZF files...]\n\n" + + "Writes, for each GRANULARITY gzip blocks in a BGZF file, its " + + "virtual file offset\nas a big-endian 48-bit integer into " + + "[filename].bgzfi. The file is terminated by\nthe BGZF file's " + + "length, in the same format."); + return; + } + + int granularity; + try { + granularity = Integer.parseInt(args[0]); + } catch (NumberFormatException e) { + granularity = 0; + } + if (granularity <= 0) { + System.err.printf("Granularity must be a positive integer, not '%s'!\n", args[0]); + return; + } + + final BGZFBlockIndexer indexer = new BGZFBlockIndexer(granularity); + + for (final String arg : Arrays.asList(args).subList(1, args.length)) { + final File f = new File(arg); + if (f.isFile() && f.canRead()) { + System.out.printf("Indexing %s...", f); + try { + indexer.index(f); + System.out.println(" done."); + } catch (IOException e) { + System.out.println(" FAILED!"); + e.printStackTrace(); + } + } else { + System.err.printf("%s does not look like a file, won't index!\n", f); + } + } + } + + private void index(final File file) throws IOException { + final InputStream in = new FileInputStream(file); + + final OutputStream out = + new BufferedOutputStream(new FileOutputStream(file.getPath() + ".bgzfi")); + + final LongBuffer lb = byteBuffer.order(ByteOrder.BIG_ENDIAN).asLongBuffer(); + + long prevPrint = 0; + pos = 0; + + for (int i = 0; ; ) { + if (!skipBlock(in)) { + break; + } + + if (++i == granularity) { + i = 0; + lb.put(0, pos); + out.write(byteBuffer.array(), 2, 6); + + if (pos - prevPrint >= PRINT_EVERY) { + System.out.print("-"); + prevPrint = pos; + } + } + } + lb.put(0, file.length()); + out.write(byteBuffer.array(), 2, 6); + out.close(); + in.close(); + } + + private boolean skipBlock(final InputStream in) throws IOException { + + // Check magic number + final int read = readBytes(in, 4); + if (read != 4) { + if (read == 0) { + return false; + } + ioError("Invalid gzip header: too short, no ID/CM/FLG"); + } + + final int magic = byteBuffer.order(ByteOrder.BIG_ENDIAN).getInt(0); + if (magic != 0x1f8b0804) { + ioError("Invalid gzip header: bad ID/CM/FLG %#x != 0x1f8b0804", magic); + } + + // Skip to extra-length + if (!readExactlyBytes(in, 8)) { + 
ioError("Invalid gzip header: too short, no XLEN"); + } + + byteBuffer.order(ByteOrder.LITTLE_ENDIAN); + + final int xlen = getUshort(6); + + // Skip over each subfield until finding the one we care about + for (int offset = 0; offset < xlen; ) { + if (!readExactlyBytes(in, 4)) { + ioError("Invalid subfields: EOF after %d subfield bytes", offset); + } + offset += 4; + + byteBuffer.order(ByteOrder.BIG_ENDIAN); + final int siAndSlen = byteBuffer.getInt(0); + byteBuffer.order(ByteOrder.LITTLE_ENDIAN); + + if ((siAndSlen & ~0xff) == 0x42430200) { + if (!readExactlyBytes(in, 2)) { + ioError("Invalid BGZF subfield: missing BSIZE"); + } + offset += 2; + + final int bsize = getUshort(0); + + // Skip over: rest of header + compressed data + rest of gzip block + fullySkip(in, (xlen - offset) + (bsize - xlen - 19) + 8); + pos += bsize + 1; + return true; + } else { + final int slen = getUshort(2); + fullySkip(in, slen); + offset += slen; + } + } + throw new IOException("Invalid BGZF file: block without BGZF subfield"); + } + + private int getUshort(final int idx) { + return (int) byteBuffer.getShort(idx) & 0xffff; + } + + private void fullySkip(final InputStream in, final int skip) throws IOException { + // Skip repeatedly until we're either done skipping or can't skip any + // more, in case some kind of IO error is temporarily preventing it. That + // kind of situation might not necessarily be possible; the docs are + // rather vague about the whole thing. + for (int s = skip; s > 0; ) { + final long skipped = in.skip(s); + if (skipped == 0) { + throw new IOException("Skip failed"); + } + s -= skipped; + } + } + + private int readBytes(final InputStream in, final int n) throws IOException { + assert n <= byteBuffer.capacity(); + + int read = 0; + while (read < n) { + final int readNow = in.read(byteBuffer.array(), read, n - read); + if (readNow <= 0) { + break; + } + read += readNow; + } + return read; + } + + private boolean readExactlyBytes(final InputStream in, final int n) throws IOException { + return readBytes(in, n) == n; + } + + private void ioError(String s, Object... va) throws IOException { + throw new IOException(String.format(s, va)); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFCodec.java b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFCodec.java index 669e2ac..c245205 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFCodec.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFCodec.java @@ -13,17 +13,15 @@ import org.apache.hadoop.io.compress.SplittableCompressionCodec; /** - * A Hadoop {@link CompressionCodec} for the - * BGZF compression format, - * which reads and writes files with a .bgz suffix. There is no standard - * suffix for BGZF-compressed files, and in fact .gz is commonly used, in - * which case {@link BGZFEnhancedGzipCodec} should be used instead of this class. - *

- * To use BGZFCodec, set it on the configuration object as follows. - *

- * {@code - * conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName()) - * } + * A Hadoop {@link CompressionCodec} for the BGZF compression format, which reads + * and writes files with a .bgz suffix. There is no standard suffix for BGZF-compressed + * files, and in fact .gz is commonly used, in which case {@link BGZFEnhancedGzipCodec} + * should be used instead of this class. + * + *

To use BGZFCodec, set it on the configuration object as follows. {@code + * conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName()) } + * * @see BGZFEnhancedGzipCodec */ public class BGZFCodec extends GzipCodec implements SplittableCompressionCodec { @@ -38,8 +36,8 @@ public CompressionOutputStream createOutputStream(OutputStream out) throws IOExc // compressors are not used, so ignore/return null @Override - public CompressionOutputStream createOutputStream(OutputStream out, - Compressor compressor) throws IOException { + public CompressionOutputStream createOutputStream(OutputStream out, Compressor compressor) + throws IOException { return createOutputStream(out); // compressors are not used, so ignore } @@ -54,11 +52,12 @@ public Compressor createCompressor() { } @Override - public SplitCompressionInputStream createInputStream(InputStream seekableIn, - Decompressor decompressor, long start, long end, READ_MODE readMode) throws IOException { + public SplitCompressionInputStream createInputStream( + InputStream seekableIn, Decompressor decompressor, long start, long end, READ_MODE readMode) + throws IOException { BGZFSplitGuesser splitGuesser = new BGZFSplitGuesser(seekableIn); long adjustedStart = splitGuesser.guessNextBGZFBlockStart(start, end); - ((Seekable)seekableIn).seek(adjustedStart); + ((Seekable) seekableIn).seek(adjustedStart); return new BGZFSplitCompressionInputStream(seekableIn, adjustedStart, end); } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFCompressionOutputStream.java b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFCompressionOutputStream.java index 639dbec..b4a0862 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFCompressionOutputStream.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFCompressionOutputStream.java @@ -6,19 +6,17 @@ import org.apache.hadoop.io.compress.CompressionOutputStream; /** - * An implementation of {@code CompressionOutputStream} for BGZF, using - * {@link BlockCompressedOutputStream} from htsjdk. Note that unlike - * {@link BlockCompressedOutputStream}, an empty gzip block file terminator is - * not written at the end of the stream. This is because in Hadoop, multiple - * headerless files are often written in parallel, and merged afterwards into a single - * file, and it's during the merge process the header and terminator are added. + * An implementation of {@code CompressionOutputStream} for BGZF, using {@link + * BlockCompressedOutputStream} from htsjdk. Note that unlike {@link BlockCompressedOutputStream}, + * an empty gzip block file terminator is not written at the end of the stream. This is + * because in Hadoop, multiple headerless files are often written in parallel, and merged afterwards + * into a single file, and it's during the merge process the header and terminator are added. 
*/ class BGZFCompressionOutputStream extends CompressionOutputStream { private BlockCompressedOutputStream output; - public BGZFCompressionOutputStream(OutputStream out) - throws IOException { + public BGZFCompressionOutputStream(OutputStream out) throws IOException { super(out); this.output = new BlockCompressedOutputStream(out, null); } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFEnhancedGzipCodec.java b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFEnhancedGzipCodec.java index 04112a7..cefd553 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFEnhancedGzipCodec.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFEnhancedGzipCodec.java @@ -13,53 +13,51 @@ import org.apache.hadoop.io.compress.SplittableCompressionCodec; /** - * A Hadoop {@link CompressionCodec} for the - * BGZF compression format, - * which reads and writes files with a .gz suffix. - *

- * BGZF is a splittable extension of gzip, which means that all BGZF files are standard - * gzip files, however the reverse is not necessarily the case. BGZF files often have the - * standard .gz suffix (such as those produced by the - * bcftools command), - * which causes a difficulty since it is not immediately apparent from the filename alone - * whether a file is a BGZF file, or merely a regular gzip file. BGZFEnhancedGzipCodec - * will read the start of the file to look for BGZF headers to detect the type of - * compression. - *

- *

- * BGZFEnhancedGzipCodec will read BGZF or gzip files, but currently always writes regular gzip files. - *

- *

- * To use BGZFEnhancedGzipCodec, set it on the configuration object as follows. This will - * override the built-in GzipCodec that is mapped to the .gz suffix. - *

- * {@code - * conf.set("io.compression.codecs", BGZFEnhancedGzipCodec.class.getCanonicalName()) - * } + * A Hadoop {@link CompressionCodec} for the BGZF compression format, which reads + * and writes files with a .gz suffix. + * + *

BGZF is a splittable extension of gzip, which means that all BGZF files are standard gzip
+ * files; however, the reverse is not necessarily the case. BGZF files often have the standard
+ * .gz suffix (such as those produced by the bcftools command), which is problematic since it is
+ * not immediately apparent from the filename alone whether a file is a BGZF file or merely a
+ * regular gzip file. BGZFEnhancedGzipCodec will read the start of the file to look for BGZF
+ * headers to detect the type of compression.
+ *
+ *
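+ * <p>For example, a bgzipped VCF with a hypothetical name such as {@code variants.vcf.gz} is
+ * recognized as BGZF and can be split, whereas a file compressed with plain {@code gzip} is
+ * decompressed from the beginning as a single, unsplit stream.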

BGZFEnhancedGzipCodec will read BGZF or gzip files, but currently always writes regular gzip + * files. + * + *

To use BGZFEnhancedGzipCodec, set it on the configuration object as follows. This will + * override the built-in GzipCodec that is mapped to the .gz suffix. {@code + * conf.set("io.compression.codecs", BGZFEnhancedGzipCodec.class.getCanonicalName()) } + * * @see BGZFCodec */ public class BGZFEnhancedGzipCodec extends GzipCodec implements SplittableCompressionCodec { @Override - public SplitCompressionInputStream createInputStream(InputStream seekableIn, Decompressor decompressor, long start, long end, READ_MODE readMode) throws IOException { + public SplitCompressionInputStream createInputStream( + InputStream seekableIn, Decompressor decompressor, long start, long end, READ_MODE readMode) + throws IOException { if (!(seekableIn instanceof Seekable)) { - throw new IOException("seekableIn must be an instance of " + - Seekable.class.getName()); + throw new IOException("seekableIn must be an instance of " + Seekable.class.getName()); } if (!BlockCompressedInputStream.isValidFile(new BufferedInputStream(seekableIn))) { // data is regular gzip, not BGZF - ((Seekable)seekableIn).seek(0); - final CompressionInputStream compressionInputStream = createInputStream(seekableIn, - decompressor); + ((Seekable) seekableIn).seek(0); + final CompressionInputStream compressionInputStream = + createInputStream(seekableIn, decompressor); return new SplitCompressionInputStream(compressionInputStream, start, end) { @Override public int read(byte[] b, int off, int len) throws IOException { return compressionInputStream.read(b, off, len); } + @Override public void resetState() throws IOException { compressionInputStream.resetState(); } + @Override public int read() throws IOException { return compressionInputStream.read(); @@ -68,8 +66,7 @@ public int read() throws IOException { } BGZFSplitGuesser splitGuesser = new BGZFSplitGuesser(seekableIn); long adjustedStart = splitGuesser.guessNextBGZFBlockStart(start, end); - ((Seekable)seekableIn).seek(adjustedStart); + ((Seekable) seekableIn).seek(adjustedStart); return new BGZFSplitCompressionInputStream(seekableIn, adjustedStart, end); } - } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitCompressionInputStream.java b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitCompressionInputStream.java index 8786757..a200689 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitCompressionInputStream.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitCompressionInputStream.java @@ -7,26 +7,22 @@ import org.apache.hadoop.io.compress.SplitCompressionInputStream; /** - * An implementation of {@code SplitCompressionInputStream} for BGZF, based on - * {@code BZip2CompressionInputStream} and {@code CBZip2InputStream} from Hadoop. - * (BZip2 is the only splittable compression codec in Hadoop.) + * An implementation of {@code SplitCompressionInputStream} for BGZF, based on {@code + * BZip2CompressionInputStream} and {@code CBZip2InputStream} from Hadoop. (BZip2 is the only + * splittable compression codec in Hadoop.) 
*/ class BGZFSplitCompressionInputStream extends SplitCompressionInputStream { + private static final int END_OF_BLOCK = -2; private final BlockCompressedInputStream input; - private BufferedInputStream bufferedIn; - private long startingPos = 0L; - private long processedPosition; - - private enum POS_ADVERTISEMENT_STATE_MACHINE { - HOLD, ADVERTISE - }; - POS_ADVERTISEMENT_STATE_MACHINE posSM = POS_ADVERTISEMENT_STATE_MACHINE.HOLD; long compressedStreamPosition = 0; + private BufferedInputStream bufferedIn; + private long startingPos = 0L;; + + private long processedPosition; - public BGZFSplitCompressionInputStream(InputStream in, long start, long end) - throws IOException { + public BGZFSplitCompressionInputStream(InputStream in, long start, long end) throws IOException { super(in, start, end); bufferedIn = new BufferedInputStream(super.in); this.startingPos = super.getPos(); @@ -61,11 +57,12 @@ public int read(byte[] b, int off, int len) throws IOException { /** * Read up to len bytes from the stream, but no further than the end of the - * compressed block. If at the end of the block then no bytes will be read and a return - * value of -2 will be returned; on the next call to read, bytes from the next block - * will be returned. This is the same contract as CBZip2InputStream in Hadoop. - * @return int The return value greater than 0 are the bytes read. A value - * of -1 means end of stream while -2 represents end of block. + * compressed block. If at the end of the block then no bytes will be read and a return value of + * -2 will be returned; on the next call to read, bytes from the next block will be returned. This + * is the same contract as CBZip2InputStream in Hadoop. + * + * @return int The return value greater than 0 are the bytes read. A value of -1 means end of + * stream while -2 represents end of block. */ private int readWithinBlock(byte[] b, int off, int len) throws IOException { if (input.endOfBlock()) { @@ -102,4 +99,9 @@ private void updatePos(boolean shouldAddOn) { public void close() throws IOException { input.close(); } + + private enum POS_ADVERTISEMENT_STATE_MACHINE { + HOLD, + ADVERTISE + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitFileInputFormat.java b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitFileInputFormat.java index 09eedbb..f78461e 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitFileInputFormat.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitFileInputFormat.java @@ -20,14 +20,11 @@ package org.seqdoop.hadoop_bam.util; - - import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.Path; @@ -36,124 +33,120 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; -/** An {@link org.apache.hadoop.mapreduce.InputFormat} for BGZF-compressed - * files. +/** + * An {@link org.apache.hadoop.mapreduce.InputFormat} for BGZF-compressed files. * - *

A {@link BGZFBlockIndex} for each Path used is required, or an - * IOException is thrown out of {@link #getSplits}.

+ *

A {@link BGZFBlockIndex} for each Path used is required, or an IOException is + * thrown out of {@link #getSplits}. */ -public abstract class BGZFSplitFileInputFormat - extends FileInputFormat -{ - private Path getIdxPath(Path path) { return path.suffix(".bgzfi"); } - - /** The splits returned are FileSplits. */ - @Override public List getSplits(JobContext job) - throws IOException - { - final List splits = super.getSplits(job); - - // Align the splits so that they don't cross blocks - - // addIndexedSplits() requires the given splits to be sorted by file - // path, so do so. Although FileInputFormat.getSplits() does, at the time - // of writing this, generate them in that order, we shouldn't rely on it. - Collections.sort(splits, new Comparator() { - public int compare(InputSplit a, InputSplit b) { - FileSplit fa = (FileSplit)a, fb = (FileSplit)b; - return fa.getPath().compareTo(fb.getPath()); - } - }); - - final List newSplits = - new ArrayList(splits.size()); - - final Configuration cfg = job.getConfiguration(); - - for (int i = 0; i < splits.size();) { - try { - i = addIndexedSplits (splits, i, newSplits, cfg); - } catch (IOException e) { - i = addProbabilisticSplits(splits, i, newSplits, cfg); - } - } - return newSplits; - } - - // Handles all the splits that share the Path of the one at index i, - // returning the next index to be used. - private int addIndexedSplits( - List splits, int i, List newSplits, - Configuration cfg) - throws IOException - { - final Path file = ((FileSplit)splits.get(i)).getPath(); - - final BGZFBlockIndex idx = new BGZFBlockIndex( - file.getFileSystem(cfg).open(getIdxPath(file))); - - int splitsEnd = splits.size(); - for (int j = i; j < splitsEnd; ++j) - if (!file.equals(((FileSplit)splits.get(j)).getPath())) - splitsEnd = j; - - for (int j = i; j < splitsEnd; ++j) { - final FileSplit fileSplit = (FileSplit)splits.get(j); - - final long start = fileSplit.getStart(); - final long end = start + fileSplit.getLength(); - - final Long blockStart = idx.prevBlock(start); - final Long blockEnd = j == splitsEnd-1 ? idx.prevBlock(end) - : idx.nextBlock(end); - - if (blockStart == null) - throw new RuntimeException( - "Internal error or invalid index: no block start for " +start); - - if (blockEnd == null) - throw new RuntimeException( - "Internal error or invalid index: no block end for " +end); - - newSplits.add(new FileSplit( - file, blockStart, blockEnd - blockStart, - fileSplit.getLocations())); - } - return splitsEnd; - } - - // Works the same way as addIndexedSplits, to avoid having to reopen the - // file repeatedly and checking addIndexedSplits for an index repeatedly. 
- private int addProbabilisticSplits( - List splits, int i, List newSplits, - Configuration cfg) - throws IOException - { - final Path path = ((FileSplit)splits.get(i)).getPath(); - final FSDataInputStream in = path.getFileSystem(cfg).open(path); - - final BGZFSplitGuesser guesser = new BGZFSplitGuesser(in); - - FileSplit fspl; - do { - fspl = (FileSplit)splits.get(i); - - final long beg = fspl.getStart(); - final long end = beg + fspl.getLength(); - - final long alignedBeg = guesser.guessNextBGZFBlockStart(beg, end); - - newSplits.add(new FileSplit( - path, alignedBeg, end - alignedBeg, fspl.getLocations())); - - ++i; - } while (i < splits.size() && fspl.getPath().equals(path)); - - in.close(); - return i; - } - - @Override public boolean isSplitable(JobContext job, Path path) { - return true; - } +public abstract class BGZFSplitFileInputFormat extends FileInputFormat { + + private Path getIdxPath(Path path) { + return path.suffix(".bgzfi"); + } + + /** The splits returned are FileSplits. */ + @Override + public List getSplits(JobContext job) throws IOException { + final List splits = super.getSplits(job); + + // Align the splits so that they don't cross blocks + + // addIndexedSplits() requires the given splits to be sorted by file + // path, so do so. Although FileInputFormat.getSplits() does, at the time + // of writing this, generate them in that order, we shouldn't rely on it. + Collections.sort( + splits, + new Comparator() { + public int compare(InputSplit a, InputSplit b) { + FileSplit fa = (FileSplit) a, fb = (FileSplit) b; + return fa.getPath().compareTo(fb.getPath()); + } + }); + + final List newSplits = new ArrayList(splits.size()); + + final Configuration cfg = job.getConfiguration(); + + for (int i = 0; i < splits.size(); ) { + try { + i = addIndexedSplits(splits, i, newSplits, cfg); + } catch (IOException e) { + i = addProbabilisticSplits(splits, i, newSplits, cfg); + } + } + return newSplits; + } + + // Handles all the splits that share the Path of the one at index i, + // returning the next index to be used. + private int addIndexedSplits( + List splits, int i, List newSplits, Configuration cfg) + throws IOException { + final Path file = ((FileSplit) splits.get(i)).getPath(); + + final BGZFBlockIndex idx = new BGZFBlockIndex(file.getFileSystem(cfg).open(getIdxPath(file))); + + int splitsEnd = splits.size(); + for (int j = i; j < splitsEnd; ++j) { + if (!file.equals(((FileSplit) splits.get(j)).getPath())) { + splitsEnd = j; + } + } + + for (int j = i; j < splitsEnd; ++j) { + final FileSplit fileSplit = (FileSplit) splits.get(j); + + final long start = fileSplit.getStart(); + final long end = start + fileSplit.getLength(); + + final Long blockStart = idx.prevBlock(start); + final Long blockEnd = j == splitsEnd - 1 ? idx.prevBlock(end) : idx.nextBlock(end); + + if (blockStart == null) { + throw new RuntimeException("Internal error or invalid index: no block start for " + start); + } + + if (blockEnd == null) { + throw new RuntimeException("Internal error or invalid index: no block end for " + end); + } + + newSplits.add( + new FileSplit(file, blockStart, blockEnd - blockStart, fileSplit.getLocations())); + } + return splitsEnd; + } + + // Works the same way as addIndexedSplits, to avoid having to reopen the + // file repeatedly and checking addIndexedSplits for an index repeatedly. 
+ private int addProbabilisticSplits( + List splits, int i, List newSplits, Configuration cfg) + throws IOException { + final Path path = ((FileSplit) splits.get(i)).getPath(); + final FSDataInputStream in = path.getFileSystem(cfg).open(path); + + final BGZFSplitGuesser guesser = new BGZFSplitGuesser(in); + + FileSplit fspl; + do { + fspl = (FileSplit) splits.get(i); + + final long beg = fspl.getStart(); + final long end = beg + fspl.getLength(); + + final long alignedBeg = guesser.guessNextBGZFBlockStart(beg, end); + + newSplits.add(new FileSplit(path, alignedBeg, end - alignedBeg, fspl.getLocations())); + + ++i; + } while (i < splits.size() && fspl.getPath().equals(path)); + + in.close(); + return i; + } + + @Override + public boolean isSplitable(JobContext job, Path path) { + return true; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitGuesser.java b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitGuesser.java index 9835ff5..d2aad3e 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitGuesser.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/BGZFSplitGuesser.java @@ -23,151 +23,149 @@ package org.seqdoop.hadoop_bam.util; import htsjdk.samtools.seekablestream.ByteArraySeekableStream; +import htsjdk.samtools.util.BlockCompressedInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.Arrays; - -import htsjdk.samtools.util.BlockCompressedInputStream; - import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.Seekable; public class BGZFSplitGuesser { - private InputStream inFile; - private Seekable seekableInFile; - private ByteArraySeekableStream in; - private final ByteBuffer buf; - - private final static int BGZF_MAGIC = 0x04088b1f; - private final static int BGZF_MAGIC_SUB = 0x00024342; - private final static int BGZF_SUB_SIZE = 4 + 2; - - public BGZFSplitGuesser(InputStream is) { - inFile = is; - seekableInFile = (Seekable) is; - - buf = ByteBuffer.allocate(8); - buf.order(ByteOrder.LITTLE_ENDIAN); - } - - public BGZFSplitGuesser(FSDataInputStream is) { - inFile = is; - seekableInFile = is; - - buf = ByteBuffer.allocate(8); - buf.order(ByteOrder.LITTLE_ENDIAN); - } - - /// Looks in the range [beg,end). Returns end if no BAM record was found. - public long guessNextBGZFBlockStart(long beg, long end) - throws IOException - { - // Buffer what we need to go through. Since the max size of a BGZF block - // is 0xffff (64K), and we might be just one byte off from the start of - // the previous one, we need 0xfffe bytes for the start, and then 0xffff - // for the block we're looking for. - - byte[] arr = new byte[2*0xffff - 1]; - - this.seekableInFile.seek(beg); - int totalRead = 0; - for (int left = Math.min((int)(end - beg), arr.length); left > 0;) { - final int r = inFile.read(arr, totalRead, left); - if (r < 0) - break; - totalRead += r; - left -= r; - } - arr = Arrays.copyOf(arr, totalRead); - - this.in = new ByteArraySeekableStream(arr); - - final BlockCompressedInputStream bgzf = - new BlockCompressedInputStream(this.in); - bgzf.setCheckCrcs(true); - - final int firstBGZFEnd = Math.min((int)(end - beg), 0xffff); - - for (int pos = 0;;) { - pos = guessNextBGZFPos(pos, firstBGZFEnd); - if (pos < 0) - return end; - - try { - // Seek in order to trigger decompression of the block and a CRC - // check. - bgzf.seek((long)pos << 16); - - // This has to catch Throwable, because it's possible to get an - // OutOfMemoryError due to an overly large size. 
- } catch (Throwable e) { - // Guessed BGZF position incorrectly: try the next guess. - ++pos; - continue; - } - return beg + pos; - } - } - - // Returns a negative number if it doesn't find anything. - private int guessNextBGZFPos(int p, int end) - throws IOException - { - for (;;) { - for (;;) { - in.seek(p); - in.read(buf.array(), 0, 4); - int n = buf.getInt(0); - - if (n == BGZF_MAGIC) - break; - - // Skip ahead a bit more than 1 byte if you can. - if (n >>> 8 == BGZF_MAGIC << 8 >>> 8) - ++p; - else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16) - p += 2; - else - p += 3; - - if (p >= end) - return -1; - } - // Found what looks like a gzip block header: now get XLEN and - // search for the BGZF subfield. - final int p0 = p; - p += 10; - in.seek(p); - in.read(buf.array(), 0, 2); - p += 2; - final int xlen = getUShort(0); - final int subEnd = p + xlen; - - while (p < subEnd) { - in.read(buf.array(), 0, 4); - - if (buf.getInt(0) != BGZF_MAGIC_SUB) { - p += 4 + getUShort(2); - in.seek(p); - continue; - } - - // Found it: this is close enough to a BGZF block, make it - // our guess. - return p0; - } - // No luck: look for the next gzip block header. Start right after - // where we last saw the identifiers, although we could probably - // safely skip further ahead. (If we find the correct one right - // now, the previous block contained 0x1f8b0804 bytes of data: that - // seems... unlikely.) - p = p0 + 4; - } - } - - private int getUShort(final int idx) { - return (int)buf.getShort(idx) & 0xffff; - } + + private static final int BGZF_MAGIC = 0x04088b1f; + private static final int BGZF_MAGIC_SUB = 0x00024342; + private static final int BGZF_SUB_SIZE = 4 + 2; + private final ByteBuffer buf; + private InputStream inFile; + private Seekable seekableInFile; + private ByteArraySeekableStream in; + + public BGZFSplitGuesser(InputStream is) { + inFile = is; + seekableInFile = (Seekable) is; + + buf = ByteBuffer.allocate(8); + buf.order(ByteOrder.LITTLE_ENDIAN); + } + + public BGZFSplitGuesser(FSDataInputStream is) { + inFile = is; + seekableInFile = is; + + buf = ByteBuffer.allocate(8); + buf.order(ByteOrder.LITTLE_ENDIAN); + } + + /// Looks in the range [beg,end). Returns end if no BAM record was found. + public long guessNextBGZFBlockStart(long beg, long end) throws IOException { + // Buffer what we need to go through. Since the max size of a BGZF block + // is 0xffff (64K), and we might be just one byte off from the start of + // the previous one, we need 0xfffe bytes for the start, and then 0xffff + // for the block we're looking for. + + byte[] arr = new byte[2 * 0xffff - 1]; + + this.seekableInFile.seek(beg); + int totalRead = 0; + for (int left = Math.min((int) (end - beg), arr.length); left > 0; ) { + final int r = inFile.read(arr, totalRead, left); + if (r < 0) { + break; + } + totalRead += r; + left -= r; + } + arr = Arrays.copyOf(arr, totalRead); + + this.in = new ByteArraySeekableStream(arr); + + final BlockCompressedInputStream bgzf = new BlockCompressedInputStream(this.in); + bgzf.setCheckCrcs(true); + + final int firstBGZFEnd = Math.min((int) (end - beg), 0xffff); + + for (int pos = 0; ; ) { + pos = guessNextBGZFPos(pos, firstBGZFEnd); + if (pos < 0) { + return end; + } + + try { + // Seek in order to trigger decompression of the block and a CRC + // check. + bgzf.seek((long) pos << 16); + + // This has to catch Throwable, because it's possible to get an + // OutOfMemoryError due to an overly large size. + } catch (Throwable e) { + // Guessed BGZF position incorrectly: try the next guess. 
+ ++pos; + continue; + } + return beg + pos; + } + } + + // Returns a negative number if it doesn't find anything. + private int guessNextBGZFPos(int p, int end) throws IOException { + for (; ; ) { + for (; ; ) { + in.seek(p); + in.read(buf.array(), 0, 4); + int n = buf.getInt(0); + + if (n == BGZF_MAGIC) { + break; + } + + // Skip ahead a bit more than 1 byte if you can. + if (n >>> 8 == BGZF_MAGIC << 8 >>> 8) { + ++p; + } else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16) { + p += 2; + } else { + p += 3; + } + + if (p >= end) { + return -1; + } + } + // Found what looks like a gzip block header: now get XLEN and + // search for the BGZF subfield. + final int p0 = p; + p += 10; + in.seek(p); + in.read(buf.array(), 0, 2); + p += 2; + final int xlen = getUShort(0); + final int subEnd = p + xlen; + + while (p < subEnd) { + in.read(buf.array(), 0, 4); + + if (buf.getInt(0) != BGZF_MAGIC_SUB) { + p += 4 + getUShort(2); + in.seek(p); + continue; + } + + // Found it: this is close enough to a BGZF block, make it + // our guess. + return p0; + } + // No luck: look for the next gzip block header. Start right after + // where we last saw the identifiers, although we could probably + // safely skip further ahead. (If we find the correct one right + // now, the previous block contained 0x1f8b0804 bytes of data: that + // seems... unlikely.) + p = p0 + 4; + } + } + + private int getUShort(final int idx) { + return (int) buf.getShort(idx) & 0xffff; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/ConfHelper.java b/src/main/java/org/seqdoop/hadoop_bam/util/ConfHelper.java index 1fb3be8..c3343e7 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/ConfHelper.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/ConfHelper.java @@ -24,47 +24,46 @@ import org.apache.hadoop.conf.Configuration; -public class ConfHelper -{ - /** - * Convert a string to a boolean. - * - * Accepted values: "yes", "true", "t", "y", "1" - * "no", "false", "f", "n", "0" - * All comparisons are case insensitive. - * - * If the value provided is null, defaultValue is returned. - * - * @exception IllegalArgumentException Thrown if value is not - * null and doesn't match any of the accepted strings. - */ - public static boolean parseBoolean(String value, boolean defaultValue) - { - if (value == null) - return defaultValue; +public class ConfHelper { - value = value.trim(); + /** + * Convert a string to a boolean. + * + *

Accepted true values: "yes", "true", "t", "y", "1"; accepted false values: "no", "false", "f",
+   * "n", "0". All comparisons are case insensitive.
+   *
+   *
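+   * <p>For example, {@code ConfHelper.parseBoolean("Yes", false)} returns {@code true}, while
+   * {@code ConfHelper.parseBoolean("maybe", false)} throws an {@link IllegalArgumentException}.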

If the value provided is null, defaultValue is returned. + * + * @throws IllegalArgumentException Thrown if value is not null and doesn't match any of the + * accepted strings. + */ + public static boolean parseBoolean(String value, boolean defaultValue) { + if (value == null) { + return defaultValue; + } - // any of the following will - final String[] acceptedTrue = new String[]{ "yes", "true", "t", "y", "1" }; - final String[] acceptedFalse = new String[]{ "no", "false", "f", "n", "0" }; + value = value.trim(); - for (String possible: acceptedTrue) - { - if (possible.equalsIgnoreCase(value)) - return true; - } - for (String possible: acceptedFalse) - { - if (possible.equalsIgnoreCase(value)) - return false; - } + // any of the following will + final String[] acceptedTrue = new String[] {"yes", "true", "t", "y", "1"}; + final String[] acceptedFalse = new String[] {"no", "false", "f", "n", "0"}; - throw new IllegalArgumentException("Unrecognized boolean value '" + value + "'"); - } + for (String possible : acceptedTrue) { + if (possible.equalsIgnoreCase(value)) { + return true; + } + } + for (String possible : acceptedFalse) { + if (possible.equalsIgnoreCase(value)) { + return false; + } + } - public static boolean parseBoolean(Configuration conf, String propertyName, boolean defaultValue) - { - return parseBoolean(conf.get(propertyName), defaultValue); - } + throw new IllegalArgumentException("Unrecognized boolean value '" + value + "'"); + } + + public static boolean parseBoolean( + Configuration conf, String propertyName, boolean defaultValue) { + return parseBoolean(conf.get(propertyName), defaultValue); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/DataInputWrapper.java b/src/main/java/org/seqdoop/hadoop_bam/util/DataInputWrapper.java index 915f533..7d4e8e6 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/DataInputWrapper.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/DataInputWrapper.java @@ -23,27 +23,36 @@ package org.seqdoop.hadoop_bam.util; import java.io.DataInput; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; public class DataInputWrapper extends InputStream { - private final DataInput in; - - public DataInputWrapper(DataInput i) { in = i; } - - @Override public long skip(long n) throws IOException { - for (; n > Integer.MAX_VALUE; n -= Integer.MAX_VALUE) { - final int skipped = in.skipBytes(Integer.MAX_VALUE); - if (skipped < Integer.MAX_VALUE) - return skipped; - } - return in.skipBytes((int)n); - } - @Override public int read(byte[] b, int off, int len) throws IOException { - in.readFully(b, off, len); - return len; - } - @Override public int read() throws IOException { - return in.readByte(); - } + + private final DataInput in; + + public DataInputWrapper(DataInput i) { + in = i; + } + + @Override + public long skip(long n) throws IOException { + for (; n > Integer.MAX_VALUE; n -= Integer.MAX_VALUE) { + final int skipped = in.skipBytes(Integer.MAX_VALUE); + if (skipped < Integer.MAX_VALUE) { + return skipped; + } + } + return in.skipBytes((int) n); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + in.readFully(b, off, len); + return len; + } + + @Override + public int read() throws IOException { + return in.readByte(); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/DataOutputWrapper.java b/src/main/java/org/seqdoop/hadoop_bam/util/DataOutputWrapper.java index 482623f..2603d1f 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/DataOutputWrapper.java +++ 
b/src/main/java/org/seqdoop/hadoop_bam/util/DataOutputWrapper.java @@ -27,14 +27,20 @@ import java.io.OutputStream; public class DataOutputWrapper extends OutputStream { - private final DataOutput out; - public DataOutputWrapper(DataOutput o) { out = o; } + private final DataOutput out; - @Override public void write(int b) throws IOException { - out.writeByte(b); - } - @Override public void write(byte[] b, int off, int len) throws IOException { - out.write(b, off, len); - } + public DataOutputWrapper(DataOutput o) { + out = o; + } + + @Override + public void write(int b) throws IOException { + out.writeByte(b); + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + out.write(b, off, len); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/GetSortedBAMHeader.java b/src/main/java/org/seqdoop/hadoop_bam/util/GetSortedBAMHeader.java index a6083e2..bd1c076 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/GetSortedBAMHeader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/GetSortedBAMHeader.java @@ -22,36 +22,35 @@ package org.seqdoop.hadoop_bam.util; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; - import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; - +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; import org.seqdoop.hadoop_bam.SAMFormat; public final class GetSortedBAMHeader { - public static void main(String[] args) throws IOException { - if (args.length < 2) { - System.err.println( - "Usage: GetSortedBAMHeader input output\n\n"+ - - "Reads the BAM header from input (a standard BGZF-compressed BAM "+ - "file), and\nwrites it (BGZF-compressed, no terminator block) to "+ - "output. Sets the sort order\nindicated in the SAM header to "+ - "'coordinate'."); - System.exit(1); - } - - final SAMFileHeader h = - SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT) - .setUseAsyncIo(false) - .open(new File(args[0])).getFileHeader(); - h.setSortOrder(SAMFileHeader.SortOrder.coordinate); - new SAMOutputPreparer().prepareForRecords( - new FileOutputStream(args[1]), SAMFormat.BAM, h); - } + public static void main(String[] args) throws IOException { + if (args.length < 2) { + System.err.println( + "Usage: GetSortedBAMHeader input output\n\n" + + "Reads the BAM header from input (a standard BGZF-compressed BAM " + + "file), and\nwrites it (BGZF-compressed, no terminator block) to " + + "output. 
Sets the sort order\nindicated in the SAM header to " + + "'coordinate'."); + System.exit(1); + } + + final SAMFileHeader h = + SamReaderFactory.makeDefault() + .validationStringency(ValidationStringency.SILENT) + .setUseAsyncIo(false) + .open(new File(args[0])) + .getFileHeader(); + h.setSortOrder(SAMFileHeader.SortOrder.coordinate); + + new SAMOutputPreparer().prepareForRecords(new FileOutputStream(args[1]), SAMFormat.BAM, h); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/IntervalUtil.java b/src/main/java/org/seqdoop/hadoop_bam/util/IntervalUtil.java index 3229e47..acf9e9a 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/IntervalUtil.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/IntervalUtil.java @@ -2,61 +2,61 @@ import com.google.common.collect.ImmutableList; import htsjdk.samtools.util.Interval; -import org.apache.hadoop.conf.Configuration; -import org.seqdoop.hadoop_bam.FormatException; - import java.util.ArrayList; import java.util.List; -import java.util.function.Supplier; +import org.apache.hadoop.conf.Configuration; +import org.seqdoop.hadoop_bam.FormatException; -/** - * Common utilities across different file formats. - */ +/** Common utilities across different file formats. */ public final class IntervalUtil { - // declared to prevent instantiation. - private IntervalUtil() {} + // declared to prevent instantiation. + private IntervalUtil() {} - /** - * Returns the list of intervals found in a string configuration property separated by colons. - * @param conf the source configuration. - * @param intervalPropertyName the property name holding the intervals. - * @return {@code null} if there is no such a property in the configuration. - * @throws NullPointerException if either input is null. - */ - public static List getIntervals(final Configuration conf, final String intervalPropertyName) { - final String intervalsProperty = conf.get(intervalPropertyName); - if (intervalsProperty == null) { - return null; - } - if (intervalsProperty.isEmpty()) { - return ImmutableList.of(); - } - final List intervals = new ArrayList<>(); - for (final String s : intervalsProperty.split(",")) { - final int lastColonIdx = s.lastIndexOf(':'); - if (lastColonIdx < 0) { - throw new FormatException("no colon found in interval string: " + s); - } - final int hyphenIdx = s.indexOf('-', lastColonIdx + 1); - if (hyphenIdx < 0) { - throw new FormatException("no hyphen found after colon interval string: " + s); - } - final String sequence = s.substring(0, lastColonIdx); - final int start = parseIntOrThrowFormatException(s.substring(lastColonIdx + 1, hyphenIdx), - "invalid start position", s); - final int stop = parseIntOrThrowFormatException(s.substring(hyphenIdx + 1), - "invalid stop position", s); - intervals.add(new Interval(sequence, start, stop)); - } - return intervals; + /** + * Returns the list of intervals found in a string configuration property separated by colons. + * + * @param conf the source configuration. + * @param intervalPropertyName the property name holding the intervals. + * @return {@code null} if there is no such a property in the configuration. + * @throws NullPointerException if either input is null. 
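+   * <p>For example, a property value of {@code "chr1:100-200,chr2:5-10"} yields the two intervals
+   * chr1:100-200 and chr2:5-10, while an empty property value yields an empty list.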
+ */ + public static List getIntervals( + final Configuration conf, final String intervalPropertyName) { + final String intervalsProperty = conf.get(intervalPropertyName); + if (intervalsProperty == null) { + return null; + } + if (intervalsProperty.isEmpty()) { + return ImmutableList.of(); + } + final List intervals = new ArrayList<>(); + for (final String s : intervalsProperty.split(",")) { + final int lastColonIdx = s.lastIndexOf(':'); + if (lastColonIdx < 0) { + throw new FormatException("no colon found in interval string: " + s); + } + final int hyphenIdx = s.indexOf('-', lastColonIdx + 1); + if (hyphenIdx < 0) { + throw new FormatException("no hyphen found after colon interval string: " + s); + } + final String sequence = s.substring(0, lastColonIdx); + final int start = + parseIntOrThrowFormatException( + s.substring(lastColonIdx + 1, hyphenIdx), "invalid start position", s); + final int stop = + parseIntOrThrowFormatException(s.substring(hyphenIdx + 1), "invalid stop position", s); + intervals.add(new Interval(sequence, start, stop)); } + return intervals; + } - private static int parseIntOrThrowFormatException(final String str, final String error, final String input) { - try { - return Integer.parseInt(str); - } catch (final NumberFormatException ex) { - throw new FormatException(error + " in interval '" + input + "': '" + str + "'"); - } + private static int parseIntOrThrowFormatException( + final String str, final String error, final String input) { + try { + return Integer.parseInt(str); + } catch (final NumberFormatException ex) { + throw new FormatException(error + " in interval '" + input + "': '" + str + "'"); } + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/MurmurHash3.java b/src/main/java/org/seqdoop/hadoop_bam/util/MurmurHash3.java index 35e2be9..04413fa 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/MurmurHash3.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/MurmurHash3.java @@ -21,161 +21,221 @@ package org.seqdoop.hadoop_bam.util; import java.nio.ByteBuffer; -import java.nio.LongBuffer; import java.nio.ByteOrder; +import java.nio.LongBuffer; -/** This class implements a hash function giving the first 64 bits of the - * MurmurHash3_x64_128 hash. +/** + * This class implements a hash function giving the first 64 bits of the MurmurHash3_x64_128 hash. 
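+ *
+ * <p>For example, {@code MurmurHash3.murmurhash3("example", 0)} hashes the characters of the
+ * string directly (as UTF-16 code units) with seed 0 and returns the result as a {@code long}.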
*/ @SuppressWarnings("fallthrough") public final class MurmurHash3 { - public static long murmurhash3(byte[] key, int seed) { - - final ByteBuffer data = - ByteBuffer.wrap(key).order(ByteOrder.LITTLE_ENDIAN); - - final int len = key.length; - - final int nblocks = len / 16; - - long h1 = seed; - long h2 = seed; - - final long c1 = 0x87c37b91114253d5L; - final long c2 = 0x4cf5ad432745937fL; - - final LongBuffer blocks = data.asLongBuffer(); - - for (int i = 0; i < nblocks; ++i) { - long k1 = blocks.get(i*2 + 0); - long k2 = blocks.get(i*2 + 1); - - k1 *= c1; k1 = k1 << 31 | k1 >>> (64-31); k1 *= c2; h1 ^= k1; - - h1 = h1 << 27 | h1 >>> (64-27); h1 += h2; h1 = h1*5 + 0x52dce729; - - k2 *= c2; k2 = k2 << 33 | k2 >>> (64-33); k2 *= c1; h2 ^= k2; - - h2 = h2 << 31 | h1 >>> (64-31); h2 += h1; h2 = h2*5 + 0x38495ab5; - } - - data.position(nblocks * 16); - final ByteBuffer tail = data.slice(); - - long k1 = 0; - long k2 = 0; - - switch (len & 15) { - case 15: k2 ^= ((long)tail.get(14) & 0xff) << 48; - case 14: k2 ^= ((long)tail.get(13) & 0xff) << 40; - case 13: k2 ^= ((long)tail.get(12) & 0xff) << 32; - case 12: k2 ^= ((long)tail.get(11) & 0xff) << 24; - case 11: k2 ^= ((long)tail.get(10) & 0xff) << 16; - case 10: k2 ^= ((long)tail.get( 9) & 0xff) << 8; - case 9: k2 ^= ((long)tail.get( 8) & 0xff) << 0; - k2 *= c2; k2 = k2 << 33 | k2 >>> (64-33); k2 *= c1; h2 ^= k2; - - case 8: k1 ^= ((long)tail.get( 7) & 0xff) << 56; - case 7: k1 ^= ((long)tail.get( 6) & 0xff) << 48; - case 6: k1 ^= ((long)tail.get( 5) & 0xff) << 40; - case 5: k1 ^= ((long)tail.get( 4) & 0xff) << 32; - case 4: k1 ^= ((long)tail.get( 3) & 0xff) << 24; - case 3: k1 ^= ((long)tail.get( 2) & 0xff) << 16; - case 2: k1 ^= ((long)tail.get( 1) & 0xff) << 8; - case 1: k1 ^= ((long)tail.get( 0) & 0xff) << 0; - k1 *= c1; k1 = k1 << 31 | k1 >>> (64-31); k1 *= c2; h1 ^= k1; - case 0: break; - } - - h1 ^= len; h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix(h1); - h2 = fmix(h2); - - h1 += h2; - // h2 += h1; - - return h1; - } - - /** This version hashes the characters directly. It is not equivalent to - * hashing chars.toString().getBytes(), as it hashes UTF-16 code units, but - * it is much faster. 
- */ - public static long murmurhash3(CharSequence chars, int seed) { - - final int len = chars.length(); - - final int nblocks = len / 8; - - long h1 = seed; - long h2 = seed; - - final long c1 = 0x87c37b91114253d5L; - final long c2 = 0x4cf5ad432745937fL; - - for (int i = 0; i < nblocks; ++i) { - int i0 = (i*2 + 0) * 4; - int i1 = (i*2 + 1) * 4; - - long k1 = (long)chars.charAt(i0) - | (long)chars.charAt(i0+1) << 16 - | (long)chars.charAt(i0+2) << 32 - | (long)chars.charAt(i0+3) << 48; - long k2 = (long)chars.charAt(i1) - | (long)chars.charAt(i1+1) << 16 - | (long)chars.charAt(i1+2) << 32 - | (long)chars.charAt(i1+3) << 48; - - k1 *= c1; k1 = k1 << 31 | k1 >>> (64-31); k1 *= c2; h1 ^= k1; - - h1 = h1 << 27 | h1 >>> (64-27); h1 += h2; h1 = h1*5 + 0x52dce729; - - k2 *= c2; k2 = k2 << 33 | k2 >>> (64-33); k2 *= c1; h2 ^= k2; - - h2 = h2 << 31 | h1 >>> (64-31); h2 += h1; h2 = h2*5 + 0x38495ab5; - } - - long k1 = 0; - long k2 = 0; - - switch (len & 7) { - case 7: k2 ^= (long)chars.charAt(6) << 32; - case 6: k2 ^= (long)chars.charAt(5) << 16; - case 5: k2 ^= (long)chars.charAt(4) << 0; - k2 *= c2; k2 = k2 << 33 | k2 >>> (64-33); k2 *= c1; h2 ^= k2; - - case 4: k1 ^= (long)chars.charAt(3) << 48; - case 3: k1 ^= (long)chars.charAt(2) << 32; - case 2: k1 ^= (long)chars.charAt(1) << 16; - case 1: k1 ^= (long)chars.charAt(0) << 0; - k1 *= c1; k1 = k1 << 31 | k1 >>> (64-31); k1 *= c2; h1 ^= k1; - case 0: break; - } - h1 ^= len; h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix(h1); - h2 = fmix(h2); - - h1 += h2; - // h2 += h1; - - return h1; - } - - private static long fmix(long k) { - k ^= k >>> 33; - k *= 0xff51afd7ed558ccdL; - k ^= k >>> 33; - k *= 0xc4ceb9fe1a85ec53L; - k ^= k >>> 33; - return k; - } + public static long murmurhash3(byte[] key, int seed) { + + final ByteBuffer data = ByteBuffer.wrap(key).order(ByteOrder.LITTLE_ENDIAN); + + final int len = key.length; + + final int nblocks = len / 16; + + long h1 = seed; + long h2 = seed; + + final long c1 = 0x87c37b91114253d5L; + final long c2 = 0x4cf5ad432745937fL; + + final LongBuffer blocks = data.asLongBuffer(); + + for (int i = 0; i < nblocks; ++i) { + long k1 = blocks.get(i * 2 + 0); + long k2 = blocks.get(i * 2 + 1); + + k1 *= c1; + k1 = k1 << 31 | k1 >>> (64 - 31); + k1 *= c2; + h1 ^= k1; + + h1 = h1 << 27 | h1 >>> (64 - 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; + k2 = k2 << 33 | k2 >>> (64 - 33); + k2 *= c1; + h2 ^= k2; + + h2 = h2 << 31 | h1 >>> (64 - 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + } + + data.position(nblocks * 16); + final ByteBuffer tail = data.slice(); + + long k1 = 0; + long k2 = 0; + + switch (len & 15) { + case 15: + k2 ^= ((long) tail.get(14) & 0xff) << 48; + case 14: + k2 ^= ((long) tail.get(13) & 0xff) << 40; + case 13: + k2 ^= ((long) tail.get(12) & 0xff) << 32; + case 12: + k2 ^= ((long) tail.get(11) & 0xff) << 24; + case 11: + k2 ^= ((long) tail.get(10) & 0xff) << 16; + case 10: + k2 ^= ((long) tail.get(9) & 0xff) << 8; + case 9: + k2 ^= ((long) tail.get(8) & 0xff) << 0; + k2 *= c2; + k2 = k2 << 33 | k2 >>> (64 - 33); + k2 *= c1; + h2 ^= k2; + + case 8: + k1 ^= ((long) tail.get(7) & 0xff) << 56; + case 7: + k1 ^= ((long) tail.get(6) & 0xff) << 48; + case 6: + k1 ^= ((long) tail.get(5) & 0xff) << 40; + case 5: + k1 ^= ((long) tail.get(4) & 0xff) << 32; + case 4: + k1 ^= ((long) tail.get(3) & 0xff) << 24; + case 3: + k1 ^= ((long) tail.get(2) & 0xff) << 16; + case 2: + k1 ^= ((long) tail.get(1) & 0xff) << 8; + case 1: + k1 ^= ((long) tail.get(0) & 0xff) << 0; + k1 *= c1; + k1 = k1 << 31 | k1 
>>> (64 - 31); + k1 *= c2; + h1 ^= k1; + case 0: + break; + } + + h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + // h2 += h1; + + return h1; + } + + /** + * This version hashes the characters directly. It is not equivalent to hashing + * chars.toString().getBytes(), as it hashes UTF-16 code units, but it is much faster. + */ + public static long murmurhash3(CharSequence chars, int seed) { + + final int len = chars.length(); + + final int nblocks = len / 8; + + long h1 = seed; + long h2 = seed; + + final long c1 = 0x87c37b91114253d5L; + final long c2 = 0x4cf5ad432745937fL; + + for (int i = 0; i < nblocks; ++i) { + int i0 = (i * 2 + 0) * 4; + int i1 = (i * 2 + 1) * 4; + + long k1 = + (long) chars.charAt(i0) + | (long) chars.charAt(i0 + 1) << 16 + | (long) chars.charAt(i0 + 2) << 32 + | (long) chars.charAt(i0 + 3) << 48; + long k2 = + (long) chars.charAt(i1) + | (long) chars.charAt(i1 + 1) << 16 + | (long) chars.charAt(i1 + 2) << 32 + | (long) chars.charAt(i1 + 3) << 48; + + k1 *= c1; + k1 = k1 << 31 | k1 >>> (64 - 31); + k1 *= c2; + h1 ^= k1; + + h1 = h1 << 27 | h1 >>> (64 - 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; + k2 = k2 << 33 | k2 >>> (64 - 33); + k2 *= c1; + h2 ^= k2; + + h2 = h2 << 31 | h1 >>> (64 - 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + } + + long k1 = 0; + long k2 = 0; + + switch (len & 7) { + case 7: + k2 ^= (long) chars.charAt(6) << 32; + case 6: + k2 ^= (long) chars.charAt(5) << 16; + case 5: + k2 ^= (long) chars.charAt(4) << 0; + k2 *= c2; + k2 = k2 << 33 | k2 >>> (64 - 33); + k2 *= c1; + h2 ^= k2; + + case 4: + k1 ^= (long) chars.charAt(3) << 48; + case 3: + k1 ^= (long) chars.charAt(2) << 32; + case 2: + k1 ^= (long) chars.charAt(1) << 16; + case 1: + k1 ^= (long) chars.charAt(0) << 0; + k1 *= c1; + k1 = k1 << 31 | k1 >>> (64 - 31); + k1 *= c2; + h1 ^= k1; + case 0: + break; + } + + h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + // h2 += h1; + + return h1; + } + + private static long fmix(long k) { + k ^= k >>> 33; + k *= 0xff51afd7ed558ccdL; + k ^= k >>> 33; + k *= 0xc4ceb9fe1a85ec53L; + k ^= k >>> 33; + return k; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/NIOFileUtil.java b/src/main/java/org/seqdoop/hadoop_bam/util/NIOFileUtil.java index 065eef4..407ffb9 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/NIOFileUtil.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/NIOFileUtil.java @@ -18,13 +18,14 @@ import java.util.stream.Collectors; public class NIOFileUtil { - private NIOFileUtil() { - } static final String PARTS_GLOB = "glob:**/part-[mr]-[0-9][0-9][0-9][0-9][0-9]*"; + private NIOFileUtil() {} + /** * Convert the given path {@link URI} to a {@link Path} object. + * * @param uri the path to convert * @return a {@link Path} object */ @@ -46,6 +47,7 @@ public static Path asPath(URI uri) { /** * Convert the given path string to a {@link Path} object. + * * @param path the path to convert * @return a {@link Path} object */ @@ -56,54 +58,57 @@ public static Path asPath(String path) { /** * Delete the given directory and all of its contents if non-empty. 
+ * * @param directory the directory to delete - * @throws IOException */ static void deleteRecursive(Path directory) throws IOException { - Files.walkFileTree(directory, new SimpleFileVisitor() { - @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - Files.delete(file); - return FileVisitResult.CONTINUE; - } - @Override - public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { - Files.deleteIfExists(dir); - return FileVisitResult.CONTINUE; - } - }); + Files.walkFileTree( + directory, + new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) + throws IOException { + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + Files.deleteIfExists(dir); + return FileVisitResult.CONTINUE; + } + }); } /** - * Returns all the files in a directory that match the given pattern, and that don't - * have the given extension. - * @param directory the directory to look for files in, subdirectories are not - * considered - * @param syntaxAndPattern the syntax and pattern to use for matching (see - * {@link java.nio.file.FileSystem#getPathMatcher} + * Returns all the files in a directory that match the given pattern, and that don't have the + * given extension. + * + * @param directory the directory to look for files in, subdirectories are not considered + * @param syntaxAndPattern the syntax and pattern to use for matching (see {@link + * java.nio.file.FileSystem#getPathMatcher} * @param excludesExt the extension to exclude, or null to exclude nothing * @return a list of files, sorted by name - * @throws IOException */ - static List getFilesMatching(Path directory, - String syntaxAndPattern, String excludesExt) throws IOException { + static List getFilesMatching(Path directory, String syntaxAndPattern, String excludesExt) + throws IOException { PathMatcher matcher = directory.getFileSystem().getPathMatcher(syntaxAndPattern); - List parts = Files.walk(directory) - .filter(matcher::matches) - .filter(path -> excludesExt == null || !path.toString().endsWith(excludesExt)) - .collect(Collectors.toList()); + List parts = + Files.walk(directory) + .filter(matcher::matches) + .filter(path -> excludesExt == null || !path.toString().endsWith(excludesExt)) + .collect(Collectors.toList()); Collections.sort(parts); return parts; } /** * Merge the given part files in order into an output stream. 
+ * * @param parts the part files to merge * @param out the stream to write each file into, in order - * @throws IOException */ - static void mergeInto(List parts, OutputStream out) - throws IOException { + static void mergeInto(List parts, OutputStream out) throws IOException { for (final Path part : parts) { Files.copy(part, out); } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/SAMFileMerger.java b/src/main/java/org/seqdoop/hadoop_bam/util/SAMFileMerger.java index 761d578..97a6b27 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/SAMFileMerger.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/SAMFileMerger.java @@ -1,5 +1,10 @@ package org.seqdoop.hadoop_bam.util; +import static org.seqdoop.hadoop_bam.util.NIOFileUtil.asPath; +import static org.seqdoop.hadoop_bam.util.NIOFileUtil.deleteRecursive; +import static org.seqdoop.hadoop_bam.util.NIOFileUtil.getFilesMatching; +import static org.seqdoop.hadoop_bam.util.NIOFileUtil.mergeInto; + import com.google.common.io.CountingOutputStream; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.cram.build.CramIO; @@ -20,31 +25,29 @@ import org.seqdoop.hadoop_bam.SplittingBAMIndex; import org.seqdoop.hadoop_bam.SplittingBAMIndexer; -import static org.seqdoop.hadoop_bam.util.NIOFileUtil.asPath; -import static org.seqdoop.hadoop_bam.util.NIOFileUtil.deleteRecursive; -import static org.seqdoop.hadoop_bam.util.NIOFileUtil.getFilesMatching; -import static org.seqdoop.hadoop_bam.util.NIOFileUtil.mergeInto; - /** - * Merges headerless BAM or CRAM files produced by {@link KeyIgnoringAnySAMOutputFormat} - * into a single file. + * Merges headerless BAM or CRAM files produced by {@link KeyIgnoringAnySAMOutputFormat} into a + * single file. */ public class SAMFileMerger { - private SAMFileMerger() { - } + private SAMFileMerger() {} /** - * Merge part file shards produced by {@link KeyIgnoringAnySAMOutputFormat} into a - * single file with the given header. + * Merge part file shards produced by {@link KeyIgnoringAnySAMOutputFormat} into a single file + * with the given header. + * * @param partDirectory the directory containing part files * @param outputFile the file to write the merged file to * @param samOutputFormat the format (must be BAM or CRAM; SAM is not supported) * @param header the header for the merged file - * @throws IOException */ - public static void mergeParts(final String partDirectory, final String outputFile, - final SAMFormat samOutputFormat, final SAMFileHeader header) throws IOException { + public static void mergeParts( + final String partDirectory, + final String outputFile, + final SAMFormat samOutputFormat, + final SAMFileHeader header) + throws IOException { // First, check for the _SUCCESS file. 
final Path partPath = asPath(partDirectory); @@ -54,22 +57,23 @@ public static void mergeParts(final String partDirectory, final String outputFil } final Path outputPath = asPath(outputFile); if (partPath.equals(outputPath)) { - throw new IllegalArgumentException("Cannot merge parts into output with same " + - "path: " + partPath); + throw new IllegalArgumentException( + "Cannot merge parts into output with same " + "path: " + partPath); } - List parts = getFilesMatching(partPath, NIOFileUtil.PARTS_GLOB, - SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); + List parts = + getFilesMatching( + partPath, NIOFileUtil.PARTS_GLOB, SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); if (parts.isEmpty()) { - throw new IllegalArgumentException("Could not write bam file because no part " + - "files were found in " + partPath); + throw new IllegalArgumentException( + "Could not write bam file because no part " + "files were found in " + partPath); } Files.deleteIfExists(outputPath); long headerLength; try (final CountingOutputStream out = - new CountingOutputStream(Files.newOutputStream(outputPath))) { + new CountingOutputStream(Files.newOutputStream(outputPath))) { if (header != null) { new SAMOutputPreparer().prepareForRecords(out, samOutputFormat, header); // write the header } @@ -79,8 +83,9 @@ public static void mergeParts(final String partDirectory, final String outputFil } long fileLength = Files.size(outputPath); - final Path outputSplittingBaiPath = outputPath.resolveSibling( - outputPath.getFileName() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); + final Path outputSplittingBaiPath = + outputPath.resolveSibling( + outputPath.getFileName() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); Files.deleteIfExists(outputSplittingBaiPath); try (final OutputStream out = Files.newOutputStream(outputSplittingBaiPath)) { mergeSplittingBaiFiles(out, partPath, headerLength, fileLength); @@ -92,8 +97,10 @@ public static void mergeParts(final String partDirectory, final String outputFil deleteRecursive(partPath); } - //Terminate the aggregated output stream with an appropriate SAMOutputFormat-dependent terminator block - private static void writeTerminatorBlock(final OutputStream out, final SAMFormat samOutputFormat) throws IOException { + // Terminate the aggregated output stream with an appropriate SAMOutputFormat-dependent terminator + // block + private static void writeTerminatorBlock(final OutputStream out, final SAMFormat samOutputFormat) + throws IOException { if (SAMFormat.CRAM == samOutputFormat) { CramIO.issueEOF(CramVersions.DEFAULT_CRAM_VERSION, out); // terminate with CRAM EOF container } else { @@ -101,10 +108,11 @@ private static void writeTerminatorBlock(final OutputStream out, final SAMFormat } } - static void mergeSplittingBaiFiles(OutputStream out, Path directory, long headerLength, - long fileLength) throws IOException { - final List parts = getFilesMatching(directory, - NIOFileUtil.PARTS_GLOB + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION, null); + static void mergeSplittingBaiFiles( + OutputStream out, Path directory, long headerLength, long fileLength) throws IOException { + final List parts = + getFilesMatching( + directory, NIOFileUtil.PARTS_GLOB + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION, null); if (parts.isEmpty()) { return; // nothing to merge } @@ -124,7 +132,6 @@ static void mergeSplittingBaiFiles(OutputStream out, Path directory, long header } } - SplittingBAMIndexer splittingBAMIndexer = new SplittingBAMIndexer(out); for (Long offset : mergedVirtualOffsets) { 
splittingBAMIndexer.writeVirtualOffset(offset); @@ -132,8 +139,11 @@ static void mergeSplittingBaiFiles(OutputStream out, Path directory, long header splittingBAMIndexer.finish(partFileOffset); int terminatingBlockLength = BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length; if (partFileOffset + terminatingBlockLength != fileLength) { - throw new IOException("Part file length mismatch. Last part file offset is " + - partFileOffset + ", expected: " + (fileLength - terminatingBlockLength)); + throw new IOException( + "Part file length mismatch. Last part file offset is " + + partFileOffset + + ", expected: " + + (fileLength - terminatingBlockLength)); } for (final Path part : parts) { diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/SAMHeaderReader.java b/src/main/java/org/seqdoop/hadoop_bam/util/SAMHeaderReader.java index b4d062d..9f92c90 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/SAMHeaderReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/SAMHeaderReader.java @@ -22,75 +22,65 @@ package org.seqdoop.hadoop_bam.util; -import java.io.IOException; -import java.io.InputStream; -import java.net.URI; -import java.nio.file.Paths; - -import htsjdk.samtools.cram.ref.ReferenceSource; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; - import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SamInputResource; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; +import htsjdk.samtools.cram.ref.ReferenceSource; +import java.io.IOException; +import java.io.InputStream; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.seqdoop.hadoop_bam.CRAMInputFormat; public final class SAMHeaderReader { - /** A String property corresponding to a ValidationStringency - * value. If set, the given stringency is used when any part of the - * Hadoop-BAM library reads SAM or BAM. - */ - public static final String VALIDATION_STRINGENCY_PROPERTY = - "hadoopbam.samheaderreader.validation-stringency"; - public static SAMFileHeader readSAMHeaderFrom(Path path, Configuration conf) - throws IOException - { - InputStream i = path.getFileSystem(conf).open(path); - final SAMFileHeader h = readSAMHeaderFrom(i, conf); - i.close(); - return h; - } + /** + * A String property corresponding to a ValidationStringency value. If set, the given stringency + * is used when any part of the Hadoop-BAM library reads SAM or BAM. + */ + public static final String VALIDATION_STRINGENCY_PROPERTY = + "hadoopbam.samheaderreader.validation-stringency"; + + public static SAMFileHeader readSAMHeaderFrom(Path path, Configuration conf) throws IOException { + InputStream i = path.getFileSystem(conf).open(path); + final SAMFileHeader h = readSAMHeaderFrom(i, conf); + i.close(); + return h; + } - /** Does not close the stream. */ - public static SAMFileHeader readSAMHeaderFrom( - final InputStream in, final Configuration conf) - { - final ValidationStringency - stringency = getValidationStringency(conf); - SamReaderFactory readerFactory = SamReaderFactory.makeDefault() - .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) - .setUseAsyncIo(false); - if (stringency != null) { - readerFactory.validationStringency(stringency); - } + /** Does not close the stream. 
*/ + public static SAMFileHeader readSAMHeaderFrom(final InputStream in, final Configuration conf) { + final ValidationStringency stringency = getValidationStringency(conf); + SamReaderFactory readerFactory = + SamReaderFactory.makeDefault() + .setOption(SamReaderFactory.Option.EAGERLY_DECODE, false) + .setUseAsyncIo(false); + if (stringency != null) { + readerFactory.validationStringency(stringency); + } - final ReferenceSource refSource = getReferenceSource(conf); - if (null != refSource) { - readerFactory.referenceSource(refSource); - } - return readerFactory.open(SamInputResource.of(in)).getFileHeader(); - } + final ReferenceSource refSource = getReferenceSource(conf); + if (null != refSource) { + readerFactory.referenceSource(refSource); + } + return readerFactory.open(SamInputResource.of(in)).getFileHeader(); + } - public static ValidationStringency getValidationStringency( - final Configuration conf) - { - final String p = conf.get(VALIDATION_STRINGENCY_PROPERTY); - return p == null ? null : ValidationStringency.valueOf(p); - } + public static ValidationStringency getValidationStringency(final Configuration conf) { + final String p = conf.get(VALIDATION_STRINGENCY_PROPERTY); + return p == null ? null : ValidationStringency.valueOf(p); + } - public static ReferenceSource getReferenceSource( - final Configuration conf) - { - //TODO: There isn't anything particularly CRAM-specific about reference source or validation - // stringency other than that a reference source is required for CRAM files. We should move - // the reference source and validation stringency property names and utility methods out of - // CRAMInputFormat and SAMHeaderReader and combine them together into a single class for extracting - // configuration params, but it would break backward compatibility with existing code that - // is dependent on the CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY. - final String refSourcePath = conf.get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY); - return refSourcePath == null ? null : new ReferenceSource(NIOFileUtil.asPath(refSourcePath)); - } + public static ReferenceSource getReferenceSource(final Configuration conf) { + // TODO: There isn't anything particularly CRAM-specific about reference source or validation + // stringency other than that a reference source is required for CRAM files. We should move + // the reference source and validation stringency property names and utility methods out of + // CRAMInputFormat and SAMHeaderReader and combine them together into a single class for + // extracting + // configuration params, but it would break backward compatibility with existing code that + // is dependent on the CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY. + final String refSourcePath = conf.get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY); + return refSourcePath == null ? 
null : new ReferenceSource(NIOFileUtil.asPath(refSourcePath)); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/SAMOutputPreparer.java b/src/main/java/org/seqdoop/hadoop_bam/util/SAMOutputPreparer.java index 688b467..81eb8a8 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/SAMOutputPreparer.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/SAMOutputPreparer.java @@ -22,6 +22,12 @@ package org.seqdoop.hadoop_bam.util; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.SAMTextHeaderCodec; +import htsjdk.samtools.cram.build.CramIO; +import htsjdk.samtools.cram.common.CramVersions; +import htsjdk.samtools.util.BlockCompressedOutputStream; import java.io.FilterOutputStream; import java.io.IOException; import java.io.OutputStream; @@ -30,115 +36,102 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.List; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMSequenceRecord; -import htsjdk.samtools.SAMTextHeaderCodec; -import htsjdk.samtools.cram.build.CramIO; -import htsjdk.samtools.cram.common.CramVersions; -import htsjdk.samtools.util.BlockCompressedOutputStream; - import org.seqdoop.hadoop_bam.SAMFormat; public class SAMOutputPreparer { - private ByteBuffer buf; - - public SAMOutputPreparer() { - // Enough room for a 32-bit integer. - buf = ByteBuffer.wrap(new byte[4]); - buf.order(ByteOrder.LITTLE_ENDIAN); - } - - public static final byte[] BAM_MAGIC = {'B','A','M', 1}; - - /** Prepares the given output stream for writing of SAMRecords in the given - * format. This includes writing the given SAM header and, in the case of - * BAM or CRAM, writing some further metadata as well as compressing everything - * written. Returns a new stream to replace the original: it will do the - * appropriate compression for BAM/CRAM files. - */ - public OutputStream prepareForRecords( - OutputStream out, final SAMFormat format, - final SAMFileHeader header) - throws IOException { - - switch (format) { - case SAM: - out = prepareSAMOrBAMStream(out, format, header); - break; - case BAM: - out = prepareSAMOrBAMStream(out, format, header); - break; - case CRAM: - out = prepareCRAMStream(out, format, header); - break; - default: - throw new IllegalArgumentException - ("Unsupported SAM file format, must be one of SAM, BAM or CRAM"); - } - - // Important for BAM: if the caller doesn't want to use the new stream - // for some reason, the BlockCompressedOutputStream's buffer would never - // be flushed. 
- out.flush(); - return out; - } - - private OutputStream prepareCRAMStream( - OutputStream out, final SAMFormat format, - final SAMFileHeader header) throws IOException - { - CramIO.writeHeader(CramVersions.DEFAULT_CRAM_VERSION, out, header, null); - return out; - } - - private OutputStream prepareSAMOrBAMStream( - OutputStream out, final SAMFormat format, - final SAMFileHeader header) throws IOException - { - final StringWriter sw = new StringWriter(); - new SAMTextHeaderCodec().encode(sw, header); - final String text = sw.toString(); - - if (format == SAMFormat.BAM) { - out = new BlockCompressedOutputStream(out, null); - out.write(BAM_MAGIC); - writeInt32(out, text.length()); - } - - writeString(out, text); - - if (format == SAMFormat.BAM) { - final List refs = - header.getSequenceDictionary().getSequences(); - - writeInt32(out, refs.size()); - - for (final SAMSequenceRecord ref : refs) { - final String name = ref.getSequenceName(); - writeInt32(out, name.length() + 1); - writeString(out, name); - out.write(0); - writeInt32(out, ref.getSequenceLength()); - } - } - - return out; - } - - private static void writeString(final OutputStream out, final String s) - throws IOException - { - // Don't flush the underlying stream yet, only the writer: in the case of - // BAM, we might be able to cram more things into the gzip block still. - final OutputStreamWriter w = new OutputStreamWriter( - new FilterOutputStream(out) { @Override public void flush() {} } ); - w.write(s); - w.flush(); - } - - private void writeInt32(final OutputStream out, int n) throws IOException { - buf.putInt(0, n); - out.write(buf.array()); - } + + public static final byte[] BAM_MAGIC = {'B', 'A', 'M', 1}; + private ByteBuffer buf; + + public SAMOutputPreparer() { + // Enough room for a 32-bit integer. + buf = ByteBuffer.wrap(new byte[4]); + buf.order(ByteOrder.LITTLE_ENDIAN); + } + + private static void writeString(final OutputStream out, final String s) throws IOException { + // Don't flush the underlying stream yet, only the writer: in the case of + // BAM, we might be able to cram more things into the gzip block still. + final OutputStreamWriter w = + new OutputStreamWriter( + new FilterOutputStream(out) { + @Override + public void flush() {} + }); + w.write(s); + w.flush(); + } + + /** + * Prepares the given output stream for writing of SAMRecords in the given format. This includes + * writing the given SAM header and, in the case of BAM or CRAM, writing some further metadata as + * well as compressing everything written. Returns a new stream to replace the original: it will + * do the appropriate compression for BAM/CRAM files. + */ + public OutputStream prepareForRecords( + OutputStream out, final SAMFormat format, final SAMFileHeader header) throws IOException { + + switch (format) { + case SAM: + out = prepareSAMOrBAMStream(out, format, header); + break; + case BAM: + out = prepareSAMOrBAMStream(out, format, header); + break; + case CRAM: + out = prepareCRAMStream(out, format, header); + break; + default: + throw new IllegalArgumentException( + "Unsupported SAM file format, must be one of SAM, BAM or CRAM"); + } + + // Important for BAM: if the caller doesn't want to use the new stream + // for some reason, the BlockCompressedOutputStream's buffer would never + // be flushed. 
+ out.flush(); + return out; + } + + private OutputStream prepareCRAMStream( + OutputStream out, final SAMFormat format, final SAMFileHeader header) throws IOException { + CramIO.writeHeader(CramVersions.DEFAULT_CRAM_VERSION, out, header, null); + return out; + } + + private OutputStream prepareSAMOrBAMStream( + OutputStream out, final SAMFormat format, final SAMFileHeader header) throws IOException { + final StringWriter sw = new StringWriter(); + new SAMTextHeaderCodec().encode(sw, header); + final String text = sw.toString(); + + if (format == SAMFormat.BAM) { + out = new BlockCompressedOutputStream(out, null); + out.write(BAM_MAGIC); + writeInt32(out, text.length()); + } + + writeString(out, text); + + if (format == SAMFormat.BAM) { + final List refs = header.getSequenceDictionary().getSequences(); + + writeInt32(out, refs.size()); + + for (final SAMSequenceRecord ref : refs) { + final String name = ref.getSequenceName(); + writeInt32(out, name.length() + 1); + writeString(out, name); + out.write(0); + writeInt32(out, ref.getSequenceLength()); + } + } + + return out; + } + + private void writeInt32(final OutputStream out, int n) throws IOException { + buf.putInt(0, n); + out.write(buf.array()); + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/VCFFileMerger.java b/src/main/java/org/seqdoop/hadoop_bam/util/VCFFileMerger.java index 81c8b9a..f6fd498 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/VCFFileMerger.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/VCFFileMerger.java @@ -1,5 +1,10 @@ package org.seqdoop.hadoop_bam.util; +import static org.seqdoop.hadoop_bam.util.NIOFileUtil.asPath; +import static org.seqdoop.hadoop_bam.util.NIOFileUtil.deleteRecursive; +import static org.seqdoop.hadoop_bam.util.NIOFileUtil.getFilesMatching; +import static org.seqdoop.hadoop_bam.util.NIOFileUtil.mergeInto; + import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.BlockCompressedOutputStream; import htsjdk.samtools.util.BlockCompressedStreamConstants; @@ -21,28 +26,25 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.seqdoop.hadoop_bam.util.NIOFileUtil.asPath; -import static org.seqdoop.hadoop_bam.util.NIOFileUtil.deleteRecursive; -import static org.seqdoop.hadoop_bam.util.NIOFileUtil.getFilesMatching; -import static org.seqdoop.hadoop_bam.util.NIOFileUtil.mergeInto; - /** - * Merges headerless VCF files produced by {@link KeyIgnoringVCFOutputFormat} - * into a single file. BCF files are not supported. + * Merges headerless VCF files produced by {@link KeyIgnoringVCFOutputFormat} into a single file. + * BCF files are not supported. */ public class VCFFileMerger { + private static final Logger logger = LoggerFactory.getLogger(VCFFileMerger.class); /** - * Merge part file shards produced by {@link KeyIgnoringVCFOutputFormat} into a - * single file with the given header. + * Merge part file shards produced by {@link KeyIgnoringVCFOutputFormat} into a single file with + * the given header. + * * @param partDirectory the directory containing part files * @param outputFile the file to write the merged file to * @param header the header for the merged file - * @throws IOException */ - public static void mergeParts(final String partDirectory, final String outputFile, - final VCFHeader header) throws IOException { + public static void mergeParts( + final String partDirectory, final String outputFile, final VCFHeader header) + throws IOException { // First, check for the _SUCCESS file. 
final Path partPath = asPath(partDirectory); final Path successPath = partPath.resolve("_SUCCESS"); @@ -51,15 +53,15 @@ public static void mergeParts(final String partDirectory, final String outputFil } final Path outputPath = asPath(outputFile); if (partPath.equals(outputPath)) { - throw new IllegalArgumentException("Cannot merge parts into output with same " + - "path: " + partPath); + throw new IllegalArgumentException( + "Cannot merge parts into output with same " + "path: " + partPath); } - List parts = getFilesMatching(partPath, NIOFileUtil.PARTS_GLOB, - TabixUtils.STANDARD_INDEX_EXTENSION); + List parts = + getFilesMatching(partPath, NIOFileUtil.PARTS_GLOB, TabixUtils.STANDARD_INDEX_EXTENSION); if (parts.isEmpty()) { - throw new IllegalArgumentException("Could not write bam file because no part " + - "files were found in " + partPath); + throw new IllegalArgumentException( + "Could not write bam file because no part " + "files were found in " + partPath); } else if (isBCF(parts)) { throw new IllegalArgumentException("BCF files are not supported."); } @@ -77,20 +79,20 @@ public static void mergeParts(final String partDirectory, final String outputFil deleteRecursive(partPath); } - /** - * @return whether the output is block compressed - */ - private static boolean writeHeader(OutputStream out, Path outputPath, List parts, - VCFHeader header) throws IOException { + /** @return whether the output is block compressed */ + private static boolean writeHeader( + OutputStream out, Path outputPath, List parts, VCFHeader header) throws IOException { if (header == null) { return false; } boolean blockCompressed = isBlockCompressed(parts); boolean bgzExtension = outputPath.toString().endsWith(BGZFCodec.DEFAULT_EXTENSION); if (blockCompressed && !bgzExtension) { - logger.warn("Parts are block compressed, but output does not have .bgz extension: {}", outputPath); + logger.warn( + "Parts are block compressed, but output does not have .bgz extension: {}", outputPath); } else if (!blockCompressed && bgzExtension) { - logger.warn("Output has a .bgz extension, but parts are not block compressed: {}", outputPath); + logger.warn( + "Output has a .bgz extension, but parts are not block compressed: {}", outputPath); } boolean gzipCompressed = isGzipCompressed(parts); OutputStream headerOut; @@ -101,8 +103,8 @@ private static boolean writeHeader(OutputStream out, Path outputPath, List } else { headerOut = out; } - VariantContextWriter writer = new VariantContextWriterBuilder().clearOptions() - .setOutputVCFStream(headerOut).build(); + VariantContextWriter writer = + new VariantContextWriterBuilder().clearOptions().setOutputVCFStream(headerOut).build(); writer.writeHeader(header); headerOut.flush(); if (headerOut instanceof GZIPOutputStream) { diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/VCFHeaderReader.java b/src/main/java/org/seqdoop/hadoop_bam/util/VCFHeaderReader.java index b59619a..9089146 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/VCFHeaderReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/VCFHeaderReader.java @@ -22,13 +22,8 @@ package org.seqdoop.hadoop_bam.util; -import java.io.BufferedInputStream; -import java.io.InputStream; -import java.io.IOException; - import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.BlockCompressedInputStream; - import htsjdk.tribble.FeatureCodecHeader; import htsjdk.tribble.TribbleException; import htsjdk.tribble.readers.AsciiLineReader; @@ -37,43 +32,43 @@ import htsjdk.variant.bcf2.BCF2Codec; import 
htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFHeader; +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; import java.util.zip.GZIPInputStream; import org.seqdoop.hadoop_bam.VCFFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** Can read a VCF header without being told beforehand whether the input is - * VCF or BCF. - */ +/** Can read a VCF header without being told beforehand whether the input is VCF or BCF. */ public final class VCFHeaderReader { - private static final Logger logger = LoggerFactory.getLogger(VCFHeaderReader.class); - public static VCFHeader readHeaderFrom(final SeekableStream in) - throws IOException - { - Object headerCodec = null; - Object header = null; - final long initialPos = in.position(); - try { - BufferedInputStream bis = new BufferedInputStream(in); - InputStream is = VCFFormat.isGzip(bis) ? new GZIPInputStream(bis) : bis; - headerCodec = new VCFCodec().readHeader(new AsciiLineReaderIterator(new AsciiLineReader(is))); - } catch (TribbleException e) { - logger.warn("Exception while trying to read VCF header from file:", e); + private static final Logger logger = LoggerFactory.getLogger(VCFHeaderReader.class); + + public static VCFHeader readHeaderFrom(final SeekableStream in) throws IOException { + Object headerCodec = null; + Object header = null; + final long initialPos = in.position(); + try { + BufferedInputStream bis = new BufferedInputStream(in); + InputStream is = VCFFormat.isGzip(bis) ? new GZIPInputStream(bis) : bis; + headerCodec = new VCFCodec().readHeader(new AsciiLineReaderIterator(new AsciiLineReader(is))); + } catch (TribbleException e) { + logger.warn("Exception while trying to read VCF header from file:", e); - in.seek(initialPos); + in.seek(initialPos); - InputStream bin = new BufferedInputStream(in); - if (BlockCompressedInputStream.isValidFile(bin)) - bin = new BlockCompressedInputStream(bin); + InputStream bin = new BufferedInputStream(in); + if (BlockCompressedInputStream.isValidFile(bin)) { + bin = new BlockCompressedInputStream(bin); + } - headerCodec = - new BCF2Codec().readHeader( - new PositionalBufferedStream(bin)); - } - if (!(headerCodec instanceof FeatureCodecHeader)) - throw new IOException("No VCF header found"); - header = ((FeatureCodecHeader)headerCodec).getHeaderValue(); - return (VCFHeader)header; - } + headerCodec = new BCF2Codec().readHeader(new PositionalBufferedStream(bin)); + } + if (!(headerCodec instanceof FeatureCodecHeader)) { + throw new IOException("No VCF header found"); + } + header = ((FeatureCodecHeader) headerCodec).getHeaderValue(); + return (VCFHeader) header; + } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/util/WrapSeekable.java b/src/main/java/org/seqdoop/hadoop_bam/util/WrapSeekable.java index b61827a..c5dcb4f 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/util/WrapSeekable.java +++ b/src/main/java/org/seqdoop/hadoop_bam/util/WrapSeekable.java @@ -22,66 +22,81 @@ package org.seqdoop.hadoop_bam.util; -import java.io.InputStream; +import htsjdk.samtools.seekablestream.SeekableStream; import java.io.IOException; - +import java.io.InputStream; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Seekable; -import htsjdk.samtools.seekablestream.SeekableStream; - -/** Wraps Hadoop's "seekable stream" abstraction so that we can give such a one - * 
to BlockCompressedInputStream and retain seekability. +/** + * Wraps Hadoop's "seekable stream" abstraction so that we can give such a one to + * BlockCompressedInputStream and retain seekability. * - *

<p>This is necessary because Hadoop and the SAM tools each have their own - * "seekable stream" abstraction.</p> + * <p>
This is necessary because Hadoop and the SAM tools each have their own "seekable stream" + * abstraction. */ -public class WrapSeekable - extends SeekableStream -{ - private final S stm; - private final long len; - private final Path path; - - public WrapSeekable(final S s, long length, Path p) { - stm = s; - len = length; - path = p; - } - - /** A helper for the common use case. */ - public static WrapSeekable openPath( - FileSystem fs, Path p) throws IOException - { - return new WrapSeekable( - fs.open(p), fs.getFileStatus(p).getLen(), p); - } - public static WrapSeekable openPath( - Configuration conf, Path path) throws IOException - { - return openPath(path.getFileSystem(conf), path); - } - - @Override public String getSource() { return path.toString(); } - @Override public long length () { return len; } - - @Override public long position() throws IOException { return stm.getPos(); } - @Override public void close() throws IOException { stm.close(); } - @Override public boolean eof () throws IOException { - return stm.getPos() == length(); - } - @Override public void seek(long pos) throws IOException { - stm.seek(pos); - } - @Override public int read() throws IOException { - return stm.read(); - } - @Override public int read(byte[] buf, int offset, int len) - throws IOException - { - return stm.read(buf, offset, len); - } +public class WrapSeekable extends SeekableStream { + + private final S stm; + private final long len; + private final Path path; + + public WrapSeekable(final S s, long length, Path p) { + stm = s; + len = length; + path = p; + } + + /** A helper for the common use case. */ + public static WrapSeekable openPath(FileSystem fs, Path p) throws IOException { + return new WrapSeekable(fs.open(p), fs.getFileStatus(p).getLen(), p); + } + + public static WrapSeekable openPath(Configuration conf, Path path) + throws IOException { + return openPath(path.getFileSystem(conf), path); + } + + @Override + public String getSource() { + return path.toString(); + } + + @Override + public long length() { + return len; + } + + @Override + public long position() throws IOException { + return stm.getPos(); + } + + @Override + public void close() throws IOException { + stm.close(); + } + + @Override + public boolean eof() throws IOException { + return stm.getPos() == length(); + } + + @Override + public void seek(long pos) throws IOException { + stm.seek(pos); + } + + @Override + public int read() throws IOException { + return stm.read(); + } + + @Override + public int read(byte[] buf, int offset, int len) throws IOException { + return stm.read(buf, offset, len); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/BAMTestUtil.java b/src/test/java/org/seqdoop/hadoop_bam/BAMTestUtil.java index dda53a1..a179325 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/BAMTestUtil.java +++ b/src/test/java/org/seqdoop/hadoop_bam/BAMTestUtil.java @@ -13,6 +13,7 @@ import java.io.IOException; class BAMTestUtil { + public static File writeBamFile(int numPairs, SAMFileHeader.SortOrder sortOrder) throws IOException { // file will be both queryname and coordinate sorted, so use one or the other @@ -22,31 +23,26 @@ public static File writeBamFile(int numPairs, SAMFileHeader.SortOrder sortOrder) int start1 = (i + 1) * 1000; int start2 = start1 + 100; if (i == 5) { // add two unmapped fragments instead of a mapped pair - samRecordSetBuilder.addFrag(String.format("test-read-%03d-1", i), chr, start1, - false, true, null, - null, - -1, false); - samRecordSetBuilder.addFrag(String.format("test-read-%03d-2", i), 
chr, start2, - false, true, null, - null, - -1, false); + samRecordSetBuilder.addFrag( + String.format("test-read-%03d-1", i), chr, start1, false, true, null, null, -1, false); + samRecordSetBuilder.addFrag( + String.format("test-read-%03d-2", i), chr, start2, false, true, null, null, -1, false); } else { - samRecordSetBuilder.addPair(String.format("test-read-%03d", i), chr, start1, - start2); + samRecordSetBuilder.addPair(String.format("test-read-%03d", i), chr, start1, start2); } } if (numPairs > 0) { // add two unplaced unmapped fragments if non-empty - samRecordSetBuilder.addUnmappedFragment(String.format - ("test-read-%03d-unplaced-unmapped", numPairs++)); - samRecordSetBuilder.addUnmappedFragment(String.format - ("test-read-%03d-unplaced-unmapped", numPairs++)); + samRecordSetBuilder.addUnmappedFragment( + String.format("test-read-%03d-unplaced-unmapped", numPairs++)); + samRecordSetBuilder.addUnmappedFragment( + String.format("test-read-%03d-unplaced-unmapped", numPairs++)); } final File bamFile = File.createTempFile("test", ".bam"); bamFile.deleteOnExit(); SAMFileHeader samHeader = samRecordSetBuilder.getHeader(); - final SAMFileWriter bamWriter = new SAMFileWriterFactory() - .makeSAMOrBAMWriter(samHeader, true, bamFile); + final SAMFileWriter bamWriter = + new SAMFileWriterFactory().makeSAMOrBAMWriter(samHeader, true, bamFile); for (final SAMRecord rec : samRecordSetBuilder.getRecords()) { bamWriter.addAlignment(rec); } @@ -54,11 +50,13 @@ public static File writeBamFile(int numPairs, SAMFileHeader.SortOrder sortOrder) // create BAM index if (sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) { - SamReader samReader = SamReaderFactory.makeDefault() - .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS) - .open(bamFile); - BAMIndexer.createIndex(samReader, new File(bamFile.getAbsolutePath() - .replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix))); + SamReader samReader = + SamReaderFactory.makeDefault() + .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS) + .open(bamFile); + BAMIndexer.createIndex( + samReader, + new File(bamFile.getAbsolutePath().replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix))); } return bamFile; @@ -71,8 +69,7 @@ public static File writeBamFileWithLargeHeader() throws IOException { int chr = 20; int start1 = (i + 1) * 1000; int start2 = start1 + 100; - samRecordSetBuilder.addPair(String.format("test-read-%03d", i), chr, start1, - start2); + samRecordSetBuilder.addPair(String.format("test-read-%03d", i), chr, start1, start2); } final File bamFile = File.createTempFile("test", ".bam"); @@ -80,11 +77,12 @@ public static File writeBamFileWithLargeHeader() throws IOException { SAMFileHeader samHeader = samRecordSetBuilder.getHeader(); StringBuffer sb = new StringBuffer(); for (int i = 0; i < 1000000; i++) { - sb.append("0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789"); + sb.append( + "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789"); } samHeader.addComment(sb.toString()); - final SAMFileWriter bamWriter = new SAMFileWriterFactory() - .makeSAMOrBAMWriter(samHeader, true, bamFile); + final SAMFileWriter bamWriter = + new SAMFileWriterFactory().makeSAMOrBAMWriter(samHeader, true, bamFile); for (final SAMRecord rec : samRecordSetBuilder.getRecords()) { bamWriter.addAlignment(rec); } diff --git a/src/test/java/org/seqdoop/hadoop_bam/IntervalUtilTest.java b/src/test/java/org/seqdoop/hadoop_bam/IntervalUtilTest.java index 52da0c6..59baa37 100644 --- 
a/src/test/java/org/seqdoop/hadoop_bam/IntervalUtilTest.java +++ b/src/test/java/org/seqdoop/hadoop_bam/IntervalUtilTest.java @@ -1,75 +1,74 @@ package org.seqdoop.hadoop_bam; import htsjdk.samtools.util.Interval; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.hadoop.conf.Configuration; import org.junit.Assert; import org.junit.Test; import org.seqdoop.hadoop_bam.util.IntervalUtil; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -/** - * Unit tests for {@link IntervalUtil}. - */ +/** Unit tests for {@link IntervalUtil}. */ public class IntervalUtilTest { - @Test - public void testInvalidIntervals() { - final String[] invalidIntervals = { - "chr1", // full sequence interval are not allowed. - "chr1:12", // single position omitting stop is not allowed. - "chr1,chr2:121-123", // , are not allowed anywhere - "chr20:1,100-3,400", // , " " - "MT:35+", // , until end of contig + is not allowed. - "MT:13-31-1112", // too many positions. - "MT:-2112", // forgot the start position! - " MT : 113 - 1245" // blanks are not allowed either. - }; - for (final String interval : invalidIntervals) { - final Configuration conf = new Configuration(); - conf.set("prop-name", interval); - try { - IntervalUtil.getIntervals(conf, "prop-name"); - Assert.fail("expected an exception when dealing with '" + interval + "'"); - } catch (final FormatException ex) { - // fine. - } - } + @Test + public void testInvalidIntervals() { + final String[] invalidIntervals = { + "chr1", // full sequence interval are not allowed. + "chr1:12", // single position omitting stop is not allowed. + "chr1,chr2:121-123", // , are not allowed anywhere + "chr20:1,100-3,400", // , " " + "MT:35+", // , until end of contig + is not allowed. + "MT:13-31-1112", // too many positions. + "MT:-2112", // forgot the start position! + " MT : 113 - 1245" // blanks are not allowed either. + }; + for (final String interval : invalidIntervals) { + final Configuration conf = new Configuration(); + conf.set("prop-name", interval); + try { + IntervalUtil.getIntervals(conf, "prop-name"); + Assert.fail("expected an exception when dealing with '" + interval + "'"); + } catch (final FormatException ex) { + // fine. + } } + } - @Test - public void testValidIntervals() { - final Object[][] validIntervals = { - {"chr1:1-343", "chr1", 1, 343}, // standard 'chr' starting contig interval. - {"chr20_Un:31-145", "chr20_Un", 31, 145}, // standard chromosome name containing underscore. - {"X:31-145", "X", 31, 145}, // standard 'X' chromosome interval. - {"10:45000012-678901123", "10", 45000012, 678901123}, // standard number starting chromosome name interval. - {"HLA-DQA1*01:01:02:134-14151", "HLA-DQA1*01:01:02", 134, 14151}}; // example of a Hg38 assembly - // HLA contigs including - and : in their names. + @Test + public void testValidIntervals() { + final Object[][] validIntervals = { + {"chr1:1-343", "chr1", 1, 343}, // standard 'chr' starting contig interval. + {"chr20_Un:31-145", "chr20_Un", 31, 145}, // standard chromosome name containing underscore. + {"X:31-145", "X", 31, 145}, // standard 'X' chromosome interval. + {"10:45000012-678901123", "10", 45000012, 678901123}, + // standard number starting chromosome name interval. + {"HLA-DQA1*01:01:02:134-14151", "HLA-DQA1*01:01:02", 134, 14151} + }; // example of a Hg38 assembly + // HLA contigs including - and : in their names. 
- final Configuration conf = new Configuration(); + final Configuration conf = new Configuration(); - Assert.assertNull(IntervalUtil.getIntervals(conf, "prop-name")); + Assert.assertNull(IntervalUtil.getIntervals(conf, "prop-name")); - conf.set("prop-name", ""); + conf.set("prop-name", ""); - Assert.assertNotNull(IntervalUtil.getIntervals(conf, "prop-name")); - Assert.assertTrue(IntervalUtil.getIntervals(conf, "prop-name").isEmpty()); + Assert.assertNotNull(IntervalUtil.getIntervals(conf, "prop-name")); + Assert.assertTrue(IntervalUtil.getIntervals(conf, "prop-name").isEmpty()); - conf.set("prop-name", Stream.of(validIntervals) - .map(o -> (String) o[0]).collect(Collectors.joining(","))); + conf.set( + "prop-name", + Stream.of(validIntervals).map(o -> (String) o[0]).collect(Collectors.joining(","))); - final List allIntervals = IntervalUtil.getIntervals(conf, "prop-name"); - Assert.assertNotNull(allIntervals); - Assert.assertEquals(allIntervals.size(), validIntervals.length); - for (int i = 0; i < validIntervals.length; i++) { - Assert.assertNotNull(allIntervals.get(i)); - Assert.assertEquals(allIntervals.get(i).getContig(), validIntervals[i][1]); - Assert.assertEquals(allIntervals.get(i).getStart(), validIntervals[i][2]); - Assert.assertEquals(allIntervals.get(i).getEnd(), validIntervals[i][3]); - } + final List allIntervals = IntervalUtil.getIntervals(conf, "prop-name"); + Assert.assertNotNull(allIntervals); + Assert.assertEquals(allIntervals.size(), validIntervals.length); + for (int i = 0; i < validIntervals.length; i++) { + Assert.assertNotNull(allIntervals.get(i)); + Assert.assertEquals(allIntervals.get(i).getContig(), validIntervals[i][1]); + Assert.assertEquals(allIntervals.get(i).getStart(), validIntervals[i][2]); + Assert.assertEquals(allIntervals.get(i).getEnd(), validIntervals[i][3]); } - + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestAnySAMInputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestAnySAMInputFormat.java index b3b220a..7fb4221 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestAnySAMInputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestAnySAMInputFormat.java @@ -1,33 +1,34 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathNotFoundException; import org.junit.Test; -import static org.junit.Assert.assertEquals; - public class TestAnySAMInputFormat { - @Test - public void testHeaderlessSamFormat() throws PathNotFoundException { - final SAMFormat result = getSamFormat(new Configuration(), "test_headerless.sam"); - assertEquals(SAMFormat.SAM, result); - } + @Test + public void testHeaderlessSamFormat() throws PathNotFoundException { + final SAMFormat result = getSamFormat(new Configuration(), "test_headerless.sam"); + assertEquals(SAMFormat.SAM, result); + } - @Test - public void testTrustExtensionsIsHonored() throws PathNotFoundException { - final Configuration conf = new Configuration(); - //default to trusting exceptions - assertEquals(SAMFormat.SAM, getSamFormat(conf, "misnamedBam.sam")); + @Test + public void testTrustExtensionsIsHonored() throws PathNotFoundException { + final Configuration conf = new Configuration(); + // default to trusting exceptions + assertEquals(SAMFormat.SAM, getSamFormat(conf, "misnamedBam.sam")); - conf.set(AnySAMInputFormat.TRUST_EXTS_PROPERTY, "false"); - final SAMFormat result = getSamFormat(conf, "misnamedBam.sam"); - assertEquals(SAMFormat.BAM, result); - } + 
conf.set(AnySAMInputFormat.TRUST_EXTS_PROPERTY, "false"); + final SAMFormat result = getSamFormat(conf, "misnamedBam.sam"); + assertEquals(SAMFormat.BAM, result); + } - private SAMFormat getSamFormat(final Configuration conf, final String file) throws PathNotFoundException { - final String filePath = getClass().getClassLoader().getResource(file).getFile(); - return new AnySAMInputFormat(conf).getFormat(new Path(filePath)); - } + private SAMFormat getSamFormat(final Configuration conf, final String file) + throws PathNotFoundException { + final String filePath = getClass().getClassLoader().getResource(file).getFile(); + return new AnySAMInputFormat(conf).getFormat(new Path(filePath)); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestBAMInputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestBAMInputFormat.java index ed7f570..f79cceb 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestBAMInputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestBAMInputFormat.java @@ -1,5 +1,9 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; + import htsjdk.samtools.BAMIndex; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; @@ -19,11 +23,8 @@ import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; - public class TestBAMInputFormat { + private String input; private TaskAttemptContext taskAttemptContext; private JobContext jobContext; @@ -36,13 +37,13 @@ private void completeSetupWithIntervals(List intervals) { completeSetupWithBoundedTraversal(intervals, false); } - private void completeSetupWithBoundedTraversal(List intervals, boolean - traverseUnplacedUnmapped) { + private void completeSetupWithBoundedTraversal( + List intervals, boolean traverseUnplacedUnmapped) { completeSetup(true, intervals, traverseUnplacedUnmapped); } - private void completeSetup(boolean boundedTraversal, List intervals, boolean - traverseUnplacedUnmapped) { + private void completeSetup( + boolean boundedTraversal, List intervals, boolean traverseUnplacedUnmapped) { Configuration conf = new Configuration(); conf.set("mapred.input.dir", "file://" + input); if (boundedTraversal) { @@ -63,8 +64,7 @@ public void testNoReadsInFirstSplitBug() throws Exception { @Test public void testMultipleSplits() throws Exception { - input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.queryname) - .getAbsolutePath(); + input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.queryname).getAbsolutePath(); completeSetup(); jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000); BAMInputFormat inputFormat = new BAMInputFormat(); @@ -83,8 +83,7 @@ public void testMultipleSplits() throws Exception { @Test public void testMultipleSplitsBaiEnabled() throws Exception { - input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate) - .getAbsolutePath(); + input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate).getAbsolutePath(); completeSetup(); BAMInputFormat.setEnableBAISplitCalculator(jobContext.getConfiguration(), true); jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000); @@ -101,8 +100,7 @@ public void testMultipleSplitsBaiEnabled() throws Exception { @Test public void testMultipleSplitsBaiEnabledSuffixPath() throws Exception { - input = 
BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate) - .getAbsolutePath(); + input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate).getAbsolutePath(); File index = new File(input.replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix)); index.renameTo(new File(input + BAMIndex.BAMIndexSuffix)); completeSetup(); @@ -121,8 +119,7 @@ public void testMultipleSplitsBaiEnabledSuffixPath() throws Exception { @Test public void testMultipleSplitsBaiEnabledNoIndex() throws Exception { - input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.queryname) - .getAbsolutePath(); + input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.queryname).getAbsolutePath(); completeSetup(); BAMInputFormat.setEnableBAISplitCalculator(jobContext.getConfiguration(), true); jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000); @@ -142,8 +139,7 @@ public void testMultipleSplitsBaiEnabledNoIndex() throws Exception { @Test public void testIntervals() throws Exception { - input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate) - .getAbsolutePath(); + input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate).getAbsolutePath(); List intervals = new ArrayList(); intervals.add(new Interval("chr21", 5000, 9999)); // includes two unpaired fragments intervals.add(new Interval("chr21", 20000, 22999)); @@ -160,8 +156,7 @@ public void testIntervals() throws Exception { @Test public void testIntervalCoveringWholeChromosome() throws Exception { - input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate) - .getAbsolutePath(); + input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate).getAbsolutePath(); List intervals = new ArrayList(); intervals.add(new Interval("chr21", 1, 1000135)); @@ -179,8 +174,7 @@ public void testIntervalCoveringWholeChromosome() throws Exception { @Test public void testIntervalsAndUnmapped() throws Exception { - input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate) - .getAbsolutePath(); + input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate).getAbsolutePath(); List intervals = new ArrayList(); intervals.add(new Interval("chr21", 5000, 9999)); // includes two unpaired fragments intervals.add(new Interval("chr21", 20000, 22999)); @@ -199,8 +193,7 @@ public void testIntervalsAndUnmapped() throws Exception { @Test public void testUnmapped() throws Exception { - input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate) - .getAbsolutePath(); + input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate).getAbsolutePath(); completeSetupWithBoundedTraversal(null, true); @@ -214,17 +207,16 @@ public void testUnmapped() throws Exception { @Test(expected = IllegalArgumentException.class) public void testMappedOnly() throws Exception { - input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate) - .getAbsolutePath(); + input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate).getAbsolutePath(); // Mapped only (-XL unmapped) is currently unsupported and throws an exception. 
completeSetupWithBoundedTraversal(null, false); } - private List getSAMRecordsFromSplit(BAMInputFormat inputFormat, - InputSplit split) throws Exception { - RecordReader reader = inputFormat - .createRecordReader(split, taskAttemptContext); + private List getSAMRecordsFromSplit(BAMInputFormat inputFormat, InputSplit split) + throws Exception { + RecordReader reader = + inputFormat.createRecordReader(split, taskAttemptContext); reader.initialize(split, taskAttemptContext); List records = new ArrayList(); diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestBAMOutputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestBAMOutputFormat.java index 357cec2..8251660 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestBAMOutputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestBAMOutputFormat.java @@ -1,11 +1,17 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; + import htsjdk.samtools.*; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.BlockCompressedStreamConstants; +import java.io.*; import java.nio.file.Files; import java.util.ArrayList; import java.util.Arrays; +import java.util.Iterator; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -23,317 +29,303 @@ import org.junit.Test; import org.seqdoop.hadoop_bam.util.SAMFileMerger; import org.seqdoop.hadoop_bam.util.SAMHeaderReader; - -import java.io.*; -import java.util.Iterator; import org.seqdoop.hadoop_bam.util.SAMOutputPreparer; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; - public class TestBAMOutputFormat { - private String testBAMFileName; - - private int expectedRecordCount; - private SAMFileHeader samFileHeader; - - private TaskAttemptContext taskAttemptContext; - static private Configuration conf; - - // BAM output class that writes a header before records - static class BAMTestWithHeaderOutputFormat - extends KeyIgnoringBAMOutputFormat { - public final static String READ_HEADER_FROM_FILE = "TestBAM.header"; - - @Override - public RecordWriter getRecordWriter( - TaskAttemptContext ctx, - Path outputPath) throws IOException { - readSAMHeaderFrom(new Path(conf.get(READ_HEADER_FROM_FILE)), conf); - setWriteHeader(true); - return super.getRecordWriter(ctx, outputPath); - } - } - // BAM output class that doesn't write a header before records - static class BAMTestNoHeaderOutputFormat - extends KeyIgnoringBAMOutputFormat { - public final static String READ_HEADER_FROM_FILE = "TestBAM.header"; - - @Override - public RecordWriter getRecordWriter( - TaskAttemptContext ctx, - Path outputPath) throws IOException { - // the writers require a header in order to create a codec, even if - // the header isn't being written out - readSAMHeaderFrom(new Path(conf.get(READ_HEADER_FROM_FILE)), conf); - setWriteHeader(false); - return super.getRecordWriter(ctx, outputPath); - } - } + private static Configuration conf; + private String testBAMFileName; + private int expectedRecordCount; + private SAMFileHeader samFileHeader; + private TaskAttemptContext taskAttemptContext; - @Before - public void setup() throws Exception { - conf = new Configuration(); + @Before + public void setup() throws Exception { + conf = new Configuration(); - testBAMFileName = ClassLoader.getSystemClassLoader() - .getResource("test.bam").getFile(); + testBAMFileName = 
ClassLoader.getSystemClassLoader().getResource("test.bam").getFile(); - conf.set("mapred.input.dir", "file://" + testBAMFileName); + conf.set("mapred.input.dir", "file://" + testBAMFileName); - // fetch the SAMFile header from the original input to get the expected count - expectedRecordCount = getBAMRecordCount(new File(testBAMFileName)); - samFileHeader = SAMHeaderReader.readSAMHeaderFrom(new Path(testBAMFileName), conf); + // fetch the SAMFile header from the original input to get the expected count + expectedRecordCount = getBAMRecordCount(new File(testBAMFileName)); + samFileHeader = SAMHeaderReader.readSAMHeaderFrom(new Path(testBAMFileName), conf); - taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); - } + taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); + } - @Test - public void testBAMRecordWriterNoHeader() throws Exception { - final File outFile = File.createTempFile("testBAMWriter", ".bam"); - outFile.deleteOnExit(); - final Path outPath = new Path(outFile.toURI()); - - final BAMTestNoHeaderOutputFormat bamOut = new BAMTestNoHeaderOutputFormat(); - conf.set(BAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, testBAMFileName); - bamOut.setWriteHeader(false); - - RecordWriter rw = - bamOut.getRecordWriter(taskAttemptContext, outPath); - - final SamReader samReader = SamReaderFactory.makeDefault() - .open(new File(testBAMFileName)); - - for (final SAMRecord r : samReader) { - final SAMRecordWritable samRW = new SAMRecordWritable(); - samRW.set(r); - rw.write(null, samRW); - } - samReader.close(); - rw.close(taskAttemptContext); - - // now verify the output - final int actualCount = getBAMRecordCount(outFile, samFileHeader); - assertEquals(expectedRecordCount, actualCount); - } + @Test + public void testBAMRecordWriterNoHeader() throws Exception { + final File outFile = File.createTempFile("testBAMWriter", ".bam"); + outFile.deleteOnExit(); + final Path outPath = new Path(outFile.toURI()); - @Test - public void testBAMRecordWriterWithHeader() throws Exception { - final File outFile = File.createTempFile("testBAMWriter", ".bam"); - outFile.deleteOnExit(); - final Path outPath = new Path(outFile.toURI()); - - final BAMTestWithHeaderOutputFormat bamOut = new BAMTestWithHeaderOutputFormat(); - conf.set(BAMTestWithHeaderOutputFormat.READ_HEADER_FROM_FILE, testBAMFileName); - bamOut.setWriteHeader(false); - - RecordWriter rw = - bamOut.getRecordWriter(taskAttemptContext, outPath); - - final SamReader samReader = SamReaderFactory.makeDefault() - .open(new File(testBAMFileName)); - - for (final SAMRecord r : samReader) { - final SAMRecordWritable samRW = new SAMRecordWritable(); - samRW.set(r); - rw.write(null, samRW); - } - samReader.close(); - rw.close(taskAttemptContext); - - // now verify the output - final int actualCount = getBAMRecordCount(outFile); - assertEquals(expectedRecordCount, actualCount); - } + final BAMTestNoHeaderOutputFormat bamOut = new BAMTestNoHeaderOutputFormat(); + conf.set(BAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, testBAMFileName); + bamOut.setWriteHeader(false); - @Test - public void testBAMOutput() throws Exception { - final Path outputPath = doMapReduce(testBAMFileName); - final File outFile = File.createTempFile("testBAMWriter", ".bam"); - outFile.deleteOnExit(); - SAMFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(), - SAMFormat.BAM, samFileHeader); - final int actualCount = getBAMRecordCount(outFile); - assertEquals(expectedRecordCount, actualCount); - } + 
RecordWriter rw = + bamOut.getRecordWriter(taskAttemptContext, outPath); - @Test - public void testEmptyBAM() throws Exception { - String bam = BAMTestUtil.writeBamFile(0, - SAMFileHeader.SortOrder.coordinate).toURI().toString(); - conf.setBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, true); - final Path outputPath = doMapReduce(bam); - final File outFile = File.createTempFile("testBAMWriter", ".bam"); - outFile.deleteOnExit(); - SAMFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(), - SAMFormat.BAM, new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate).getHeader()); - final int actualCount = getBAMRecordCount(outFile); - assertEquals(0, actualCount); - } + final SamReader samReader = SamReaderFactory.makeDefault().open(new File(testBAMFileName)); - @Test - public void testBAMWithSplittingBai() throws Exception { - int numPairs = 20000; - // create a large BAM with lots of index points - String bam = BAMTestUtil.writeBamFile(20000, - SAMFileHeader.SortOrder.coordinate).toURI().toString(); - conf.setInt(FileInputFormat.SPLIT_MAXSIZE, 800000); // force multiple parts - conf.setBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, true); - final Path outputPath = doMapReduce(bam); - - List recordsAtSplits = new ArrayList<>(); - File[] splittingIndexes = new File(outputPath.toUri()).listFiles(pathname -> { - return pathname.getName().endsWith(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); - }); - Arrays.sort(splittingIndexes); // ensure files are sorted by name - for (File file : splittingIndexes) { - File bamFile = new File(file.getParentFile(), - file.getName().replace(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION, "")); - SplittingBAMIndex index = new SplittingBAMIndex(file); - recordsAtSplits.addAll(getRecordsAtSplits(bamFile, index)); - } - - final File outFile = File.createTempFile("testBAMWriter", ".bam"); - //outFile.deleteOnExit(); - SAMFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(), - SAMFormat.BAM, - new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate).getHeader()); - - final int actualCount = getBAMRecordCount(outFile); - assertEquals(numPairs * 2 + 2, actualCount); // 2 unmapped reads - - File splittingBai = new File(outFile.getParentFile(), outFile.getName() + - SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); - SplittingBAMIndex splittingBAMIndex = new SplittingBAMIndex(splittingBai); - - assertEquals(recordsAtSplits, getRecordsAtSplits(outFile, splittingBAMIndex)); + for (final SAMRecord r : samReader) { + final SAMRecordWritable samRW = new SAMRecordWritable(); + samRW.set(r); + rw.write(null, samRW); } + samReader.close(); + rw.close(taskAttemptContext); - private List getRecordsAtSplits(File bam, SplittingBAMIndex index) throws IOException { - List records = new ArrayList<>(); - BAMRecordCodec codec = new BAMRecordCodec(samFileHeader); - BlockCompressedInputStream bci = new BlockCompressedInputStream(bam); - codec.setInputStream(bci); - for (Long offset : index.getVirtualOffsets()) { - bci.seek(offset); - SAMRecord record = codec.decode(); - if (record != null) { - records.add(record); - } - } - return records; - } + // now verify the output + final int actualCount = getBAMRecordCount(outFile, samFileHeader); + assertEquals(expectedRecordCount, actualCount); + } + + @Test + public void testBAMRecordWriterWithHeader() throws Exception { + final File outFile = File.createTempFile("testBAMWriter", ".bam"); + outFile.deleteOnExit(); + final Path outPath = new Path(outFile.toURI()); - @Test - public void 
testBAMRoundTrip() throws Exception { - // run a m/r job to write out a bam file - Path outputPath = doMapReduce(testBAMFileName); + final BAMTestWithHeaderOutputFormat bamOut = new BAMTestWithHeaderOutputFormat(); + conf.set(BAMTestWithHeaderOutputFormat.READ_HEADER_FROM_FILE, testBAMFileName); + bamOut.setWriteHeader(false); - // merge the parts, and write to a temp file - final File outFile = File.createTempFile("testBAMWriter", ".bam"); - outFile.deleteOnExit(); - SAMFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(), - SAMFormat.BAM, samFileHeader); + RecordWriter rw = + bamOut.getRecordWriter(taskAttemptContext, outPath); - // now use the assembled output as m/r input - outputPath = doMapReduce(outFile.getAbsolutePath()); + final SamReader samReader = SamReaderFactory.makeDefault().open(new File(testBAMFileName)); - // merge the parts again - SAMFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(), - SAMFormat.BAM, samFileHeader); + for (final SAMRecord r : samReader) { + final SAMRecordWritable samRW = new SAMRecordWritable(); + samRW.set(r); + rw.write(null, samRW); + } + samReader.close(); + rw.close(taskAttemptContext); + + // now verify the output + final int actualCount = getBAMRecordCount(outFile); + assertEquals(expectedRecordCount, actualCount); + } + + @Test + public void testBAMOutput() throws Exception { + final Path outputPath = doMapReduce(testBAMFileName); + final File outFile = File.createTempFile("testBAMWriter", ".bam"); + outFile.deleteOnExit(); + SAMFileMerger.mergeParts( + outputPath.toUri().toString(), outFile.toURI().toString(), SAMFormat.BAM, samFileHeader); + final int actualCount = getBAMRecordCount(outFile); + assertEquals(expectedRecordCount, actualCount); + } + + @Test + public void testEmptyBAM() throws Exception { + String bam = BAMTestUtil.writeBamFile(0, SAMFileHeader.SortOrder.coordinate).toURI().toString(); + conf.setBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, true); + final Path outputPath = doMapReduce(bam); + final File outFile = File.createTempFile("testBAMWriter", ".bam"); + outFile.deleteOnExit(); + SAMFileMerger.mergeParts( + outputPath.toUri().toString(), + outFile.toURI().toString(), + SAMFormat.BAM, + new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate).getHeader()); + final int actualCount = getBAMRecordCount(outFile); + assertEquals(0, actualCount); + } + + @Test + public void testBAMWithSplittingBai() throws Exception { + int numPairs = 20000; + // create a large BAM with lots of index points + String bam = + BAMTestUtil.writeBamFile(20000, SAMFileHeader.SortOrder.coordinate).toURI().toString(); + conf.setInt(FileInputFormat.SPLIT_MAXSIZE, 800000); // force multiple parts + conf.setBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, true); + final Path outputPath = doMapReduce(bam); + + List recordsAtSplits = new ArrayList<>(); + File[] splittingIndexes = + new File(outputPath.toUri()) + .listFiles( + pathname -> { + return pathname.getName().endsWith(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); + }); + Arrays.sort(splittingIndexes); // ensure files are sorted by name + for (File file : splittingIndexes) { + File bamFile = + new File( + file.getParentFile(), + file.getName().replace(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION, "")); + SplittingBAMIndex index = new SplittingBAMIndex(file); + recordsAtSplits.addAll(getRecordsAtSplits(bamFile, index)); + } - // verify the final output - final int actualCount = getBAMRecordCount(outFile); - assertEquals(expectedRecordCount, 
actualCount); + final File outFile = File.createTempFile("testBAMWriter", ".bam"); + // outFile.deleteOnExit(); + SAMFileMerger.mergeParts( + outputPath.toUri().toString(), + outFile.toURI().toString(), + SAMFormat.BAM, + new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate).getHeader()); + + final int actualCount = getBAMRecordCount(outFile); + assertEquals(numPairs * 2 + 2, actualCount); // 2 unmapped reads + + File splittingBai = + new File( + outFile.getParentFile(), outFile.getName() + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); + SplittingBAMIndex splittingBAMIndex = new SplittingBAMIndex(splittingBai); + + assertEquals(recordsAtSplits, getRecordsAtSplits(outFile, splittingBAMIndex)); + } + + private List getRecordsAtSplits(File bam, SplittingBAMIndex index) throws IOException { + List records = new ArrayList<>(); + BAMRecordCodec codec = new BAMRecordCodec(samFileHeader); + BlockCompressedInputStream bci = new BlockCompressedInputStream(bam); + codec.setInputStream(bci); + for (Long offset : index.getVirtualOffsets()) { + bci.seek(offset); + SAMRecord record = codec.decode(); + if (record != null) { + records.add(record); + } + } + return records; + } + + @Test + public void testBAMRoundTrip() throws Exception { + // run a m/r job to write out a bam file + Path outputPath = doMapReduce(testBAMFileName); + + // merge the parts, and write to a temp file + final File outFile = File.createTempFile("testBAMWriter", ".bam"); + outFile.deleteOnExit(); + SAMFileMerger.mergeParts( + outputPath.toUri().toString(), outFile.toURI().toString(), SAMFormat.BAM, samFileHeader); + + // now use the assembled output as m/r input + outputPath = doMapReduce(outFile.getAbsolutePath()); + + // merge the parts again + SAMFileMerger.mergeParts( + outputPath.toUri().toString(), outFile.toURI().toString(), SAMFormat.BAM, samFileHeader); + + // verify the final output + final int actualCount = getBAMRecordCount(outFile); + assertEquals(expectedRecordCount, actualCount); + } + + private Path doMapReduce(final String inputFile) throws Exception { + final FileSystem fileSystem = FileSystem.get(conf); + final Path inputPath = new Path(inputFile); + final Path outputPath = fileSystem.makeQualified(new Path("target/out")); + fileSystem.delete(outputPath, true); + + final Job job = Job.getInstance(conf); + FileInputFormat.setInputPaths(job, inputPath); + + conf.set(BAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, inputFile); + job.setInputFormatClass(BAMInputFormat.class); + job.setMapOutputKeyClass(LongWritable.class); + job.setMapOutputValueClass(SAMRecordWritable.class); + + job.setOutputFormatClass(BAMTestNoHeaderOutputFormat.class); + job.setOutputKeyClass(LongWritable.class); + job.setOutputValueClass(SAMRecordWritable.class); + + job.setNumReduceTasks(0); + FileOutputFormat.setOutputPath(job, outputPath); + + final boolean success = job.waitForCompletion(true); + assertTrue(success); + + return outputPath; + } + + private int getBAMRecordCount(final File bamFile) throws IOException { + final SamReader bamReader = SamReaderFactory.makeDefault().open(SamInputResource.of(bamFile)); + final Iterator it = bamReader.iterator(); + int recCount = 0; + while (it.hasNext()) { + it.next(); + recCount++; + } + bamReader.close(); + return recCount; + } + + private int getBAMRecordCount(final File blockStreamFile, final SAMFileHeader header) + throws IOException { + // assemble a proper BAM file from the block stream shard(s) in + // order to verify the contents + final ByteArrayInputStream mergedStream = 
mergeBAMBlockStream(blockStreamFile, header); + + // now we can verify that we can read everything back in + final SamReader resultBAMReader = + SamReaderFactory.makeDefault().open(SamInputResource.of(mergedStream)); + final Iterator it = resultBAMReader.iterator(); + int actualCount = 0; + while (it.hasNext()) { + it.next(); + actualCount++; } + return actualCount; + } - private Path doMapReduce(final String inputFile) throws Exception { - final FileSystem fileSystem = FileSystem.get(conf); - final Path inputPath = new Path(inputFile); - final Path outputPath = fileSystem.makeQualified(new Path("target/out")); - fileSystem.delete(outputPath, true); + private ByteArrayInputStream mergeBAMBlockStream( + final File blockStreamFile, final SAMFileHeader header) throws IOException { + // assemble a proper BAM file from the block stream shard(s) in + // order to verify the contents + final ByteArrayOutputStream bamOutputStream = new ByteArrayOutputStream(); - final Job job = Job.getInstance(conf); - FileInputFormat.setInputPaths(job, inputPath); + // write out the bam file header + new SAMOutputPreparer().prepareForRecords(bamOutputStream, SAMFormat.BAM, header); - conf.set(BAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, inputFile); - job.setInputFormatClass(BAMInputFormat.class); - job.setMapOutputKeyClass(LongWritable.class); - job.setMapOutputValueClass(SAMRecordWritable.class); + // copy the contents of the block shard(s) written out by the M/R job + final ByteArrayOutputStream blockOutputStream = new ByteArrayOutputStream(); + Files.copy(blockStreamFile.toPath(), blockOutputStream); + blockOutputStream.writeTo(bamOutputStream); - job.setOutputFormatClass(BAMTestNoHeaderOutputFormat.class); - job.setOutputKeyClass(LongWritable.class); - job.setOutputValueClass(SAMRecordWritable.class); + // add the BGZF terminator + bamOutputStream.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); + bamOutputStream.close(); - job.setNumReduceTasks(0); - FileOutputFormat.setOutputPath(job, outputPath); + return new ByteArrayInputStream(bamOutputStream.toByteArray()); + } - final boolean success = job.waitForCompletion(true); - assertTrue(success); + // BAM output class that writes a header before records + static class BAMTestWithHeaderOutputFormat extends KeyIgnoringBAMOutputFormat { - return outputPath; - } + public static final String READ_HEADER_FROM_FILE = "TestBAM.header"; - private int getBAMRecordCount(final File bamFile) throws IOException { - final SamReader bamReader = SamReaderFactory.makeDefault() - .open(SamInputResource.of(bamFile)); - final Iterator it = bamReader.iterator(); - int recCount = 0; - while (it.hasNext()) { - it.next(); - recCount++; - } - bamReader.close(); - return recCount; + @Override + public RecordWriter getRecordWriter( + TaskAttemptContext ctx, Path outputPath) throws IOException { + readSAMHeaderFrom(new Path(conf.get(READ_HEADER_FROM_FILE)), conf); + setWriteHeader(true); + return super.getRecordWriter(ctx, outputPath); } + } - private int getBAMRecordCount( - final File blockStreamFile, - final SAMFileHeader header) throws IOException - { - // assemble a proper BAM file from the block stream shard(s) in - // order to verify the contents - final ByteArrayInputStream mergedStream = mergeBAMBlockStream ( - blockStreamFile, - header - ); - - // now we can verify that we can read everything back in - final SamReader resultBAMReader = SamReaderFactory.makeDefault() - .open(SamInputResource.of(mergedStream)); - final Iterator it = resultBAMReader.iterator(); - int 
actualCount = 0; - while (it.hasNext()) { - it.next(); - actualCount++; - } - return actualCount; - } + // BAM output class that doesn't write a header before records + static class BAMTestNoHeaderOutputFormat extends KeyIgnoringBAMOutputFormat { + + public static final String READ_HEADER_FROM_FILE = "TestBAM.header"; - private ByteArrayInputStream mergeBAMBlockStream( - final File blockStreamFile, - final SAMFileHeader header) throws IOException - { - // assemble a proper BAM file from the block stream shard(s) in - // order to verify the contents - final ByteArrayOutputStream bamOutputStream = new ByteArrayOutputStream(); - - // write out the bam file header - new SAMOutputPreparer().prepareForRecords( - bamOutputStream, - SAMFormat.BAM, - header); - - // copy the contents of the block shard(s) written out by the M/R job - final ByteArrayOutputStream blockOutputStream = new ByteArrayOutputStream(); - Files.copy(blockStreamFile.toPath(), blockOutputStream); - blockOutputStream.writeTo(bamOutputStream); - - // add the BGZF terminator - bamOutputStream.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); - bamOutputStream.close(); - - return new ByteArrayInputStream(bamOutputStream.toByteArray()); + @Override + public RecordWriter getRecordWriter( + TaskAttemptContext ctx, Path outputPath) throws IOException { + // the writers require a header in order to create a codec, even if + // the header isn't being written out + readSAMHeaderFrom(new Path(conf.get(READ_HEADER_FROM_FILE)), conf); + setWriteHeader(false); + return super.getRecordWriter(ctx, outputPath); } + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestBAMSplitGuesser.java b/src/test/java/org/seqdoop/hadoop_bam/TestBAMSplitGuesser.java index 6becbad..e077542 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestBAMSplitGuesser.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestBAMSplitGuesser.java @@ -1,5 +1,7 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; + import htsjdk.samtools.SAMUtils; import htsjdk.samtools.seekablestream.SeekableStream; import java.io.File; @@ -8,8 +10,6 @@ import org.junit.Test; import org.seqdoop.hadoop_bam.util.WrapSeekable; -import static org.junit.Assert.assertEquals; - public class TestBAMSplitGuesser { @Test diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestBGZFSplitGuesser.java b/src/test/java/org/seqdoop/hadoop_bam/TestBGZFSplitGuesser.java index d500cdd..f168d05 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestBGZFSplitGuesser.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestBGZFSplitGuesser.java @@ -1,5 +1,7 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; + import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.BlockCompressedStreamConstants; import java.io.File; @@ -15,8 +17,6 @@ import org.junit.runners.Parameterized; import org.seqdoop.hadoop_bam.util.BGZFSplitGuesser; -import static org.junit.Assert.assertEquals; - @RunWith(Parameterized.class) public class TestBGZFSplitGuesser { @@ -32,9 +32,10 @@ public TestBGZFSplitGuesser(String filename, long firstSplit, long lastSplit) { @Parameterized.Parameters public static Collection data() { - return Arrays.asList(new Object[][] { - {"test.vcf.bgzf.gz", 821, 821}, {"HiSeq.10000.vcf.bgzf.gz", 16688, 509222} - }); + return Arrays.asList( + new Object[][] { + {"test.vcf.bgzf.gz", 821, 821}, {"HiSeq.10000.vcf.bgzf.gz", 16688, 509222} + }); } @Test @@ -58,14 +59,14 @@ public void test() throws IOException { assertEquals(firstSplit, 
(long) boundaries.getFirst()); assertEquals(lastSplit, (long) boundaries.getLast()); - assertEquals("Last block start is terminator gzip block", + assertEquals( + "Last block start is terminator gzip block", file.length() - BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length, (long) boundaries.get(boundaries.size() - 1)); } private void canReadFromBlockStart(long blockStart) throws IOException { - BlockCompressedInputStream blockCompressedInputStream = new - BlockCompressedInputStream(file); + BlockCompressedInputStream blockCompressedInputStream = new BlockCompressedInputStream(file); blockCompressedInputStream.setCheckCrcs(true); blockCompressedInputStream.seek(blockStart << 16); byte[] b = new byte[100]; diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormat.java index d0ec14e..0564575 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormat.java @@ -1,5 +1,9 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; + import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; @@ -28,11 +32,8 @@ import org.junit.Before; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; - public class TestCRAMInputFormat { + private String input; private String reference; private TaskAttemptContext taskAttemptContext; @@ -53,8 +54,10 @@ public void setup() throws Exception { @Test public void testReader() throws Exception { int expectedCount = 0; - SamReader samReader = SamReaderFactory.makeDefault() - .referenceSequence(new File(URI.create(reference))).open(new File(input)); + SamReader samReader = + SamReaderFactory.makeDefault() + .referenceSequence(new File(URI.create(reference))) + .open(new File(input)); for (SAMRecord r : samReader) { expectedCount++; } @@ -62,8 +65,8 @@ public void testReader() throws Exception { AnySAMInputFormat inputFormat = new AnySAMInputFormat(); List splits = inputFormat.getSplits(jobContext); assertEquals(1, splits.size()); - RecordReader reader = inputFormat - .createRecordReader(splits.get(0), taskAttemptContext); + RecordReader reader = + inputFormat.createRecordReader(splits.get(0), taskAttemptContext); reader.initialize(splits.get(0), taskAttemptContext); int actualCount = 0; @@ -135,8 +138,10 @@ public void testMapReduceJob() throws Exception { assertTrue(success); List samStrings = new ArrayList(); - SamReader samReader = SamReaderFactory.makeDefault() - .referenceSequence(new File(URI.create(reference))).open(new File(input)); + SamReader samReader = + SamReaderFactory.makeDefault() + .referenceSequence(new File(URI.create(reference))) + .open(new File(input)); for (SAMRecord r : samReader) { samStrings.add(r.getSAMString().trim()); } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormatOnHDFS.java b/src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormatOnHDFS.java index 034c078..1efce70 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormatOnHDFS.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestCRAMInputFormatOnHDFS.java @@ -1,5 +1,8 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.mockito.Mockito.mock; + import htsjdk.samtools.SAMRecord; import 
htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; @@ -26,20 +29,15 @@ import org.junit.BeforeClass; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; - public class TestCRAMInputFormatOnHDFS { + + private static MiniDFSCluster cluster; + private static URI clusterUri; private String input; private String reference; private TaskAttemptContext taskAttemptContext; private JobContext jobContext; - - private static MiniDFSCluster cluster; - private static URI clusterUri; - @BeforeClass public static void setUpBeforeClass() throws Exception { cluster = startMini(TestCRAMInputFormatOnHDFS.class.getName()); @@ -48,20 +46,52 @@ public static void setUpBeforeClass() throws Exception { @AfterClass public static void teardownClass() throws Exception { - if (cluster != null) - { + if (cluster != null) { cluster.shutdown(); } } + private static MiniDFSCluster startMini(String testName) throws IOException { + File baseDir = new File("./target/hdfs/" + testName).getAbsoluteFile(); + FileUtil.fullyDelete(baseDir); + Configuration conf = new Configuration(); + conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath()); + MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf); + MiniDFSCluster hdfsCluster = builder.clusterId(testName).build(); + hdfsCluster.waitActive(); + return hdfsCluster; + } + + protected static URI formalizeClusterURI(URI clusterUri) throws URISyntaxException { + if (clusterUri.getPath() == null) { + return new URI( + clusterUri.getScheme(), + null, + clusterUri.getHost(), + clusterUri.getPort(), + "/", + null, + null); + } else if (clusterUri.getPath().trim() == "") { + return new URI( + clusterUri.getScheme(), + null, + clusterUri.getHost(), + clusterUri.getPort(), + "/", + null, + null); + } + return clusterUri; + } @Before public void setup() throws Exception { Configuration conf = new Configuration(); input = ClassLoader.getSystemClassLoader().getResource("test.cram").getFile(); reference = ClassLoader.getSystemClassLoader().getResource("auxf.fa").toURI().toString(); - String referenceIndex = ClassLoader.getSystemClassLoader().getResource("auxf.fa.fai") - .toURI().toString(); + String referenceIndex = + ClassLoader.getSystemClassLoader().getResource("auxf.fa.fai").toURI().toString(); conf.set("mapred.input.dir", "file://" + input); URI hdfsRef = clusterUri.resolve("/tmp/auxf.fa"); @@ -71,41 +101,17 @@ public void setup() throws Exception { conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, hdfsRef.toString()); - taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); jobContext = new JobContextImpl(conf, taskAttemptContext.getJobID()); - - } - - private static MiniDFSCluster startMini(String testName) throws IOException { - File baseDir = new File("./target/hdfs/" + testName).getAbsoluteFile(); - FileUtil.fullyDelete(baseDir); - Configuration conf = new Configuration(); - conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath()); - MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf); - MiniDFSCluster hdfsCluster = builder.clusterId(testName).build(); - hdfsCluster.waitActive(); - return hdfsCluster; - } - - protected static URI formalizeClusterURI(URI clusterUri) throws URISyntaxException { - if (clusterUri.getPath()==null) { - return new URI(clusterUri.getScheme(), null, - clusterUri.getHost(), clusterUri.getPort(), - "/", null, null); - } else if 
(clusterUri.getPath().trim()=="") { - return new URI(clusterUri.getScheme(), null, - clusterUri.getHost(), clusterUri.getPort(), - "/", null, null); - } - return clusterUri; } @Test public void testReader() throws Exception { int expectedCount = 0; - SamReader samReader = SamReaderFactory.makeDefault() - .referenceSequence(new File(URI.create(reference))).open(new File(input)); + SamReader samReader = + SamReaderFactory.makeDefault() + .referenceSequence(new File(URI.create(reference))) + .open(new File(input)); for (SAMRecord r : samReader) { expectedCount++; } @@ -113,8 +119,8 @@ public void testReader() throws Exception { CRAMInputFormat inputFormat = new CRAMInputFormat(); List splits = inputFormat.getSplits(jobContext); assertEquals(1, splits.size()); - RecordReader reader = inputFormat - .createRecordReader(splits.get(0), taskAttemptContext); + RecordReader reader = + inputFormat.createRecordReader(splits.get(0), taskAttemptContext); reader.initialize(splits.get(0), taskAttemptContext); int actualCount = 0; @@ -124,5 +130,4 @@ public void testReader() throws Exception { assertEquals(expectedCount, actualCount); } - } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestCRAMOutputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestCRAMOutputFormat.java index 171013b..6d12a36 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestCRAMOutputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestCRAMOutputFormat.java @@ -1,9 +1,16 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; + import htsjdk.samtools.*; import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.seekablestream.SeekableStream; +import java.io.*; import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Iterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -17,285 +24,254 @@ import org.junit.Test; import org.seqdoop.hadoop_bam.util.SAMFileMerger; import org.seqdoop.hadoop_bam.util.SAMHeaderReader; - -import java.io.*; -import java.nio.file.Paths; -import java.util.Iterator; import org.seqdoop.hadoop_bam.util.SAMOutputPreparer; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; - public class TestCRAMOutputFormat { - private String testCRAMFileName; - private String testReferenceFileName; - private ReferenceSource testReferenceSource; - - private int expectedRecordCount; - private SAMFileHeader samFileHeader; - - private TaskAttemptContext taskAttemptContext; - private static Configuration conf; - - // CRAM output class that writes a header before records - static class CRAMTestWithHeaderOutputFormat - extends KeyIgnoringCRAMOutputFormat { - public final static String READ_HEADER_FROM_FILE = "TestBAM.header"; - - @Override - public RecordWriter getRecordWriter( - TaskAttemptContext ctx, - Path outputPath) throws IOException { - readSAMHeaderFrom(new Path(conf.get(READ_HEADER_FROM_FILE)), conf); - setWriteHeader(true); - return super.getRecordWriter(ctx, outputPath); - } - } - - // CRAM Output class that doesn't write a header out before records - static class CRAMTestNoHeaderOutputFormat - extends KeyIgnoringCRAMOutputFormat { - public final static String READ_HEADER_FROM_FILE = "TestBAM.header"; - - @Override - public RecordWriter getRecordWriter( - TaskAttemptContext ctx, - Path outputPath) throws IOException { - 
// the writers require a header in order to create a codec, even if - // the header isn't being written out - readSAMHeaderFrom(new Path(conf.get(READ_HEADER_FROM_FILE)), conf); - setWriteHeader(false); - return super.getRecordWriter(ctx, outputPath); - } - } - - @Before - public void setup() throws Exception { - conf = new Configuration(); - - testCRAMFileName = ClassLoader.getSystemClassLoader() - .getResource("test.cram").getFile(); - testReferenceFileName = ClassLoader.getSystemClassLoader() - .getResource("auxf.fa").getFile(); - testReferenceSource = new ReferenceSource(Paths.get(testReferenceFileName)); - - conf.set("mapred.input.dir", "file://" + testCRAMFileName); - conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, - "file://" + testReferenceFileName); - - // fetch the SAMFile header from the original input to get the - // expected count - expectedRecordCount = getCRAMRecordCount(new File(testCRAMFileName)); - samFileHeader = SAMHeaderReader.readSAMHeaderFrom( - new Path(testCRAMFileName), conf); - - taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); - } - - @Test - public void testCRAMRecordWriterNoHeader() throws Exception { - final File outFile = File.createTempFile("testCRAMWriter", ".cram"); - outFile.deleteOnExit(); - final Path outPath = new Path(outFile.toURI()); - - final CRAMTestNoHeaderOutputFormat cramOut = new CRAMTestNoHeaderOutputFormat(); - conf.set(CRAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, testCRAMFileName); - - RecordWriter rw = - cramOut.getRecordWriter(taskAttemptContext, outPath); - - final SamReader samReader = SamReaderFactory.makeDefault() - .referenceSequence(new File(testReferenceFileName)) - .open(new File(testCRAMFileName)); - - for (final SAMRecord r : samReader) { - final SAMRecordWritable samRW = new SAMRecordWritable(); - samRW.set(r); - rw.write(null, samRW); - } - samReader.close(); - rw.close(taskAttemptContext); - - // now verify the container stream - final int actualCount = getCRAMRecordCount(outFile, samFileHeader, - testReferenceSource); - assertEquals(expectedRecordCount, actualCount); - } - @Test - public void testCRAMRecordWriterWithHeader() throws Exception { - final File outFile = File.createTempFile("testCRAMWriter", ".cram"); - outFile.deleteOnExit(); - final Path outPath = new Path(outFile.toURI()); - - final CRAMTestWithHeaderOutputFormat cramOut = new CRAMTestWithHeaderOutputFormat(); - conf.set(CRAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, testCRAMFileName); - - RecordWriter rw = - cramOut.getRecordWriter(taskAttemptContext, outPath); - - final SamReader samReader = SamReaderFactory.makeDefault() - .referenceSequence(new File(testReferenceFileName)) - .open(new File(testCRAMFileName)); - - for (final SAMRecord r : samReader) { - final SAMRecordWritable samRW = new SAMRecordWritable(); - samRW.set(r); - rw.write(null, samRW); - } - samReader.close(); - rw.close(taskAttemptContext); - - // now verify the container stream - final int actualCount = getCRAMRecordCount(outFile); - assertEquals(expectedRecordCount, actualCount); + private static Configuration conf; + private String testCRAMFileName; + private String testReferenceFileName; + private ReferenceSource testReferenceSource; + private int expectedRecordCount; + private SAMFileHeader samFileHeader; + private TaskAttemptContext taskAttemptContext; + + @Before + public void setup() throws Exception { + conf = new Configuration(); + + testCRAMFileName = ClassLoader.getSystemClassLoader().getResource("test.cram").getFile(); + 
testReferenceFileName = ClassLoader.getSystemClassLoader().getResource("auxf.fa").getFile(); + testReferenceSource = new ReferenceSource(Paths.get(testReferenceFileName)); + + conf.set("mapred.input.dir", "file://" + testCRAMFileName); + conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, "file://" + testReferenceFileName); + + // fetch the SAMFile header from the original input to get the + // expected count + expectedRecordCount = getCRAMRecordCount(new File(testCRAMFileName)); + samFileHeader = SAMHeaderReader.readSAMHeaderFrom(new Path(testCRAMFileName), conf); + + taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); + } + + @Test + public void testCRAMRecordWriterNoHeader() throws Exception { + final File outFile = File.createTempFile("testCRAMWriter", ".cram"); + outFile.deleteOnExit(); + final Path outPath = new Path(outFile.toURI()); + + final CRAMTestNoHeaderOutputFormat cramOut = new CRAMTestNoHeaderOutputFormat(); + conf.set(CRAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, testCRAMFileName); + + RecordWriter rw = + cramOut.getRecordWriter(taskAttemptContext, outPath); + + final SamReader samReader = + SamReaderFactory.makeDefault() + .referenceSequence(new File(testReferenceFileName)) + .open(new File(testCRAMFileName)); + + for (final SAMRecord r : samReader) { + final SAMRecordWritable samRW = new SAMRecordWritable(); + samRW.set(r); + rw.write(null, samRW); } - - @Test - public void testCRAMOutput() throws Exception { - final Path outputPath = doMapReduce(testCRAMFileName); - final File outFile = File.createTempFile("testCRAMWriter", ".cram"); - outFile.deleteOnExit(); - SAMFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(), - SAMFormat.CRAM, samFileHeader); - final File containerStreamFile = - new File(new File(outputPath.toUri()), "part-m-00000"); - final int actualCount = getCRAMRecordCount(outFile); - assertEquals(expectedRecordCount, actualCount); + samReader.close(); + rw.close(taskAttemptContext); + + // now verify the container stream + final int actualCount = getCRAMRecordCount(outFile, samFileHeader, testReferenceSource); + assertEquals(expectedRecordCount, actualCount); + } + + @Test + public void testCRAMRecordWriterWithHeader() throws Exception { + final File outFile = File.createTempFile("testCRAMWriter", ".cram"); + outFile.deleteOnExit(); + final Path outPath = new Path(outFile.toURI()); + + final CRAMTestWithHeaderOutputFormat cramOut = new CRAMTestWithHeaderOutputFormat(); + conf.set(CRAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, testCRAMFileName); + + RecordWriter rw = + cramOut.getRecordWriter(taskAttemptContext, outPath); + + final SamReader samReader = + SamReaderFactory.makeDefault() + .referenceSequence(new File(testReferenceFileName)) + .open(new File(testCRAMFileName)); + + for (final SAMRecord r : samReader) { + final SAMRecordWritable samRW = new SAMRecordWritable(); + samRW.set(r); + rw.write(null, samRW); } - - @Test - public void testCRAMRoundTrip() throws Exception { - // run a m/r job to write out a cram file - Path outputPath = doMapReduce(testCRAMFileName); - - // merge the parts, and write to a temp file - final File outFile = File.createTempFile("testCRAMWriter", ".cram"); - outFile.deleteOnExit(); - SAMFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(), - SAMFormat.CRAM, samFileHeader); - - // now use the assembled output as m/r input - outputPath = doMapReduce(outFile.getAbsolutePath()); - - // merge the parts again - 
SAMFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(), - SAMFormat.CRAM, samFileHeader); - - // verify the final output - final int actualCount = getCRAMRecordCount(outFile); - assertEquals(expectedRecordCount, actualCount); + samReader.close(); + rw.close(taskAttemptContext); + + // now verify the container stream + final int actualCount = getCRAMRecordCount(outFile); + assertEquals(expectedRecordCount, actualCount); + } + + @Test + public void testCRAMOutput() throws Exception { + final Path outputPath = doMapReduce(testCRAMFileName); + final File outFile = File.createTempFile("testCRAMWriter", ".cram"); + outFile.deleteOnExit(); + SAMFileMerger.mergeParts( + outputPath.toUri().toString(), outFile.toURI().toString(), SAMFormat.CRAM, samFileHeader); + final File containerStreamFile = new File(new File(outputPath.toUri()), "part-m-00000"); + final int actualCount = getCRAMRecordCount(outFile); + assertEquals(expectedRecordCount, actualCount); + } + + @Test + public void testCRAMRoundTrip() throws Exception { + // run a m/r job to write out a cram file + Path outputPath = doMapReduce(testCRAMFileName); + + // merge the parts, and write to a temp file + final File outFile = File.createTempFile("testCRAMWriter", ".cram"); + outFile.deleteOnExit(); + SAMFileMerger.mergeParts( + outputPath.toUri().toString(), outFile.toURI().toString(), SAMFormat.CRAM, samFileHeader); + + // now use the assembled output as m/r input + outputPath = doMapReduce(outFile.getAbsolutePath()); + + // merge the parts again + SAMFileMerger.mergeParts( + outputPath.toUri().toString(), outFile.toURI().toString(), SAMFormat.CRAM, samFileHeader); + + // verify the final output + final int actualCount = getCRAMRecordCount(outFile); + assertEquals(expectedRecordCount, actualCount); + } + + private Path doMapReduce(final String inputFile) throws Exception { + final FileSystem fileSystem = FileSystem.get(conf); + final Path inputPath = new Path(inputFile); + final Path outputPath = fileSystem.makeQualified(new Path("target/out")); + fileSystem.delete(outputPath, true); + + final Job job = Job.getInstance(conf); + FileInputFormat.setInputPaths(job, inputPath); + + job.setInputFormatClass(CRAMInputFormat.class); + job.setMapOutputKeyClass(LongWritable.class); + job.setMapOutputValueClass(SAMRecordWritable.class); + + conf.set(CRAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, inputFile); + job.setOutputFormatClass(CRAMTestNoHeaderOutputFormat.class); + job.setOutputKeyClass(LongWritable.class); + job.setOutputValueClass(SAMRecordWritable.class); + + job.setNumReduceTasks(0); + FileOutputFormat.setOutputPath(job, outputPath); + + final boolean success = job.waitForCompletion(true); + assertTrue(success); + + return outputPath; + } + + private int getCRAMRecordCount(final File cramFile) { + final CRAMFileReader cramReader = + new CRAMFileReader(cramFile, (File) null, testReferenceSource); + final Iterator it = cramReader.getIterator(); + int recCount = 0; + while (it.hasNext()) { + it.next(); + recCount++; } - - private Path doMapReduce(final String inputFile) throws Exception { - final FileSystem fileSystem = FileSystem.get(conf); - final Path inputPath = new Path(inputFile); - final Path outputPath = fileSystem.makeQualified(new Path("target/out")); - fileSystem.delete(outputPath, true); - - final Job job = Job.getInstance(conf); - FileInputFormat.setInputPaths(job, inputPath); - - job.setInputFormatClass(CRAMInputFormat.class); - job.setMapOutputKeyClass(LongWritable.class); - 
job.setMapOutputValueClass(SAMRecordWritable.class); - - conf.set(CRAMTestNoHeaderOutputFormat.READ_HEADER_FROM_FILE, inputFile); - job.setOutputFormatClass(CRAMTestNoHeaderOutputFormat.class); - job.setOutputKeyClass(LongWritable.class); - job.setOutputValueClass(SAMRecordWritable.class); - - job.setNumReduceTasks(0); - FileOutputFormat.setOutputPath(job, outputPath); - - final boolean success = job.waitForCompletion(true); - assertTrue(success); - - return outputPath; - } - - private int getCRAMRecordCount(final File cramFile) { - final CRAMFileReader cramReader = - new CRAMFileReader(cramFile, - (File)null, - testReferenceSource); - final Iterator it = cramReader.getIterator(); - int recCount = 0; - while (it.hasNext()) { - it.next(); - recCount++; - } - cramReader.close(); - return recCount; - } - - private int getCRAMRecordCount( - final File containerStreamFile, - final SAMFileHeader header, - final ReferenceSource refSource) throws IOException - { - // assemble a proper CRAM file from the container stream shard(s) in - // order to verify the contents - final ByteArrayInputStream mergedStream = mergeCRAMContainerStream ( - containerStreamFile, - header, - refSource - ); - - // now we can verify that we can read everything back in - final CRAMFileReader resultCRAMReader = new CRAMFileReader( + cramReader.close(); + return recCount; + } + + private int getCRAMRecordCount( + final File containerStreamFile, final SAMFileHeader header, final ReferenceSource refSource) + throws IOException { + // assemble a proper CRAM file from the container stream shard(s) in + // order to verify the contents + final ByteArrayInputStream mergedStream = + mergeCRAMContainerStream(containerStreamFile, header, refSource); + + // now we can verify that we can read everything back in + final CRAMFileReader resultCRAMReader = + new CRAMFileReader( mergedStream, (SeekableStream) null, refSource, ValidationStringency.DEFAULT_STRINGENCY); - final Iterator it = resultCRAMReader.getIterator(); - int actualCount = 0; - while (it.hasNext()) { - it.next(); - actualCount++; - } - return actualCount; + final Iterator it = resultCRAMReader.getIterator(); + int actualCount = 0; + while (it.hasNext()) { + it.next(); + actualCount++; + } + return actualCount; + } + + // TODO: SAMOutputPreparer knows how to prepare the beginning of a stream, + // but not how to populate or terminate it (which for CRAM requires a special + // terminating EOF container). 
For now we'll use SAMPreparer here so we get + // some test coverage, and then manually populate and terminate, but we + // should consolidate/refactor the knowledge of how to do this aggregation + // for each output type in one place in a separate PR + // https://github.com/HadoopGenomics/Hadoop-BAM/issues/61 + private ByteArrayInputStream mergeCRAMContainerStream( + final File containerStreamFile, final SAMFileHeader header, final ReferenceSource refSource) + throws IOException { + // assemble a proper CRAM file from the container stream shard(s) in + // order to verify the contents + final ByteArrayOutputStream cramOutputStream = new ByteArrayOutputStream(); + // write out the cram file header + new SAMOutputPreparer().prepareForRecords(cramOutputStream, SAMFormat.CRAM, header); + // now copy the contents of the container stream shard(s) written out by + // the M/R job + final ByteArrayOutputStream containerOutputStream = new ByteArrayOutputStream(); + Files.copy(containerStreamFile.toPath(), containerOutputStream); + containerOutputStream.writeTo(cramOutputStream); + + // use containerStreamWriter directly to properly terminate the output + // stream with an EOF container + final CRAMContainerStreamWriter containerStreamWriter = + new CRAMContainerStreamWriter(cramOutputStream, null, refSource, header, "CRAMTest"); + containerStreamWriter.finish(true); // close and write an EOF container + cramOutputStream.close(); + + return new ByteArrayInputStream(cramOutputStream.toByteArray()); + } + + // CRAM output class that writes a header before records + static class CRAMTestWithHeaderOutputFormat extends KeyIgnoringCRAMOutputFormat { + + public static final String READ_HEADER_FROM_FILE = "TestBAM.header"; + + @Override + public RecordWriter getRecordWriter( + TaskAttemptContext ctx, Path outputPath) throws IOException { + readSAMHeaderFrom(new Path(conf.get(READ_HEADER_FROM_FILE)), conf); + setWriteHeader(true); + return super.getRecordWriter(ctx, outputPath); } + } + + // CRAM Output class that doesn't write a header out before records + static class CRAMTestNoHeaderOutputFormat extends KeyIgnoringCRAMOutputFormat { + + public static final String READ_HEADER_FROM_FILE = "TestBAM.header"; - // TODO: SAMOutputPreparer knows how to prepare the beginning of a stream, - // but not how to populate or terminate it (which for CRAM requires a special - // terminating EOF container). 
For now we'll use SAMPreparer here so we get - // some test coverage, and then manually populate and terminate, but we - // should consolidate/refactor the knowledge of how to do this aggregation - // for each output type in one place in a separate PR - // https://github.com/HadoopGenomics/Hadoop-BAM/issues/61 - private ByteArrayInputStream mergeCRAMContainerStream( - final File containerStreamFile, - final SAMFileHeader header, - final ReferenceSource refSource) throws IOException - { - // assemble a proper CRAM file from the container stream shard(s) in - // order to verify the contents - final ByteArrayOutputStream cramOutputStream = new ByteArrayOutputStream(); - // write out the cram file header - new SAMOutputPreparer().prepareForRecords( - cramOutputStream, - SAMFormat.CRAM, - header); - // now copy the contents of the container stream shard(s) written out by - // the M/R job - final ByteArrayOutputStream containerOutputStream = new ByteArrayOutputStream(); - Files.copy(containerStreamFile.toPath(), containerOutputStream); - containerOutputStream.writeTo(cramOutputStream); - - // use containerStreamWriter directly to properly terminate the output - // stream with an EOF container - final CRAMContainerStreamWriter containerStreamWriter = - new CRAMContainerStreamWriter( - cramOutputStream, - null, - refSource, - header, - "CRAMTest"); - containerStreamWriter.finish(true); // close and write an EOF container - cramOutputStream.close(); - - return new ByteArrayInputStream(cramOutputStream.toByteArray()); + @Override + public RecordWriter getRecordWriter( + TaskAttemptContext ctx, Path outputPath) throws IOException { + // the writers require a header in order to create a codec, even if + // the header isn't being written out + readSAMHeaderFrom(new Path(conf.get(READ_HEADER_FROM_FILE)), conf); + setWriteHeader(false); + return super.getRecordWriter(ctx, outputPath); } + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestConfHelper.java b/src/test/java/org/seqdoop/hadoop_bam/TestConfHelper.java index 1866943..e0d2beb 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestConfHelper.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestConfHelper.java @@ -22,69 +22,62 @@ package org.seqdoop.hadoop_bam; -import org.seqdoop.hadoop_bam.util.ConfHelper; - -import org.junit.*; import static org.junit.Assert.*; import org.apache.hadoop.conf.Configuration; +import org.junit.*; +import org.seqdoop.hadoop_bam.util.ConfHelper; -public class TestConfHelper -{ - @Test - public void testParseBooleanValidValues() - { - assertTrue(ConfHelper.parseBoolean("true", false)); - assertTrue(ConfHelper.parseBoolean("tRuE", false)); - assertTrue(ConfHelper.parseBoolean("TRUE", false)); - assertTrue(ConfHelper.parseBoolean("t", false)); - assertTrue(ConfHelper.parseBoolean("yes", false)); - assertTrue(ConfHelper.parseBoolean("y", false)); - assertTrue(ConfHelper.parseBoolean("Y", false)); - assertTrue(ConfHelper.parseBoolean("1", false)); +public class TestConfHelper { - assertFalse(ConfHelper.parseBoolean("false", true)); - assertFalse(ConfHelper.parseBoolean("faLse", true)); - assertFalse(ConfHelper.parseBoolean("FALSE", true)); - assertFalse(ConfHelper.parseBoolean("f", true)); - assertFalse(ConfHelper.parseBoolean("no", true)); - assertFalse(ConfHelper.parseBoolean("n", true)); - assertFalse(ConfHelper.parseBoolean("N", true)); - assertFalse(ConfHelper.parseBoolean("0", true)); - } + public static void main(String args[]) { + org.junit.runner.JUnitCore.main(TestConfHelper.class.getName()); + } - @Test - 
public void testParseBooleanNull() - { - assertTrue(ConfHelper.parseBoolean(null, true)); - assertFalse(ConfHelper.parseBoolean(null, false)); - } + @Test + public void testParseBooleanValidValues() { + assertTrue(ConfHelper.parseBoolean("true", false)); + assertTrue(ConfHelper.parseBoolean("tRuE", false)); + assertTrue(ConfHelper.parseBoolean("TRUE", false)); + assertTrue(ConfHelper.parseBoolean("t", false)); + assertTrue(ConfHelper.parseBoolean("yes", false)); + assertTrue(ConfHelper.parseBoolean("y", false)); + assertTrue(ConfHelper.parseBoolean("Y", false)); + assertTrue(ConfHelper.parseBoolean("1", false)); - @Test(expected=IllegalArgumentException.class) - public void testParseBooleanInvalidValue() - { - ConfHelper.parseBoolean("dodo", true); - } + assertFalse(ConfHelper.parseBoolean("false", true)); + assertFalse(ConfHelper.parseBoolean("faLse", true)); + assertFalse(ConfHelper.parseBoolean("FALSE", true)); + assertFalse(ConfHelper.parseBoolean("f", true)); + assertFalse(ConfHelper.parseBoolean("no", true)); + assertFalse(ConfHelper.parseBoolean("n", true)); + assertFalse(ConfHelper.parseBoolean("N", true)); + assertFalse(ConfHelper.parseBoolean("0", true)); + } - @Test - public void testParseBooleanFromConfValue() - { - final String propName = "my.property"; - Configuration conf = new Configuration(); - conf.set(propName, "t"); - assertTrue(ConfHelper.parseBoolean(conf, propName, false)); - } + @Test + public void testParseBooleanNull() { + assertTrue(ConfHelper.parseBoolean(null, true)); + assertFalse(ConfHelper.parseBoolean(null, false)); + } - @Test - public void testParseBooleanFromConfNull() - { - Configuration conf = new Configuration(); - assertTrue(ConfHelper.parseBoolean(conf, "my.property", true)); - assertFalse(ConfHelper.parseBoolean(conf, "my.property", false)); - } + @Test(expected = IllegalArgumentException.class) + public void testParseBooleanInvalidValue() { + ConfHelper.parseBoolean("dodo", true); + } + @Test + public void testParseBooleanFromConfValue() { + final String propName = "my.property"; + Configuration conf = new Configuration(); + conf.set(propName, "t"); + assertTrue(ConfHelper.parseBoolean(conf, propName, false)); + } - public static void main(String args[]) { - org.junit.runner.JUnitCore.main(TestConfHelper.class.getName()); - } + @Test + public void testParseBooleanFromConfNull() { + Configuration conf = new Configuration(); + assertTrue(ConfHelper.parseBoolean(conf, "my.property", true)); + assertFalse(ConfHelper.parseBoolean(conf, "my.property", false)); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestFastaInputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestFastaInputFormat.java index b484b36..3d744f5 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestFastaInputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestFastaInputFormat.java @@ -1,5 +1,10 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; + import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; @@ -14,12 +19,8 @@ import org.junit.Before; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; - public class TestFastaInputFormat { + private String input; private TaskAttemptContext taskAttemptContext; private 
JobContext jobContext; @@ -42,29 +43,53 @@ public void testReader() throws Exception { FastaInputFormat inputFormat = new FastaInputFormat(); List splits = inputFormat.getSplits(jobContext); assertEquals(2, splits.size()); - RecordReader reader = inputFormat - .createRecordReader(splits.get(0), taskAttemptContext); + RecordReader reader = + inputFormat.createRecordReader(splits.get(0), taskAttemptContext); reader.initialize(splits.get(0), taskAttemptContext); assertTrue(reader.nextKeyValue()); - assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:11"), reader.getCurrentKey()); - assertEquals(new Text("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA"), reader.getCurrentValue().getSequence()); + assertEquals( + new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:11"), reader.getCurrentKey()); + assertEquals( + new Text( + "TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA"), + reader.getCurrentValue().getSequence()); assertTrue(reader.nextKeyValue()); - assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:182"), reader.getCurrentKey()); - assertEquals(new Text("ACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAAC"), reader.getCurrentValue().getSequence()); + assertEquals( + new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:182"), + reader.getCurrentKey()); + assertEquals( + new Text( + "ACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAAC"), + reader.getCurrentValue().getSequence()); assertTrue(reader.nextKeyValue()); - assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1163"), reader.getCurrentKey()); - assertEquals(new Text("CCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCC"), reader.getCurrentValue().getSequence()); + assertEquals( + new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1163"), + reader.getCurrentKey()); + assertEquals( + new Text( + "CCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCC"), + reader.getCurrentValue().getSequence()); assertTrue(reader.nextKeyValue()); - assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1244"), reader.getCurrentKey()); - assertEquals(new Text("TAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCC"), reader.getCurrentValue().getSequence()); + assertEquals( + new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1244"), + reader.getCurrentKey()); + assertEquals( + new Text( + "TAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCC"), + reader.getCurrentValue().getSequence()); assertTrue(reader.nextKeyValue()); - assertEquals(new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1325"), reader.getCurrentKey()); - assertEquals(new Text("CAACCCTAACCCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCC"), reader.getCurrentValue().getSequence()); + assertEquals( + new Text("chr1 dna:chromosome chromosome:GRCh37:1:1:249250621:1325"), + reader.getCurrentKey()); + assertEquals( + new Text( + "CAACCCTAACCCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCC"), + reader.getCurrentValue().getSequence()); assertFalse(reader.nextKeyValue()); @@ -72,12 +97,15 @@ public void testReader() throws Exception { reader.initialize(splits.get(1), taskAttemptContext); assertTrue(reader.nextKeyValue()); - assertEquals(new Text("chr2 dna:chromosome 
chromosome:GRCh37:2:1:243199373:11"), reader.getCurrentKey()); - assertEquals(new Text("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTC"), reader.getCurrentValue().getSequence()); + assertEquals( + new Text("chr2 dna:chromosome chromosome:GRCh37:2:1:243199373:11"), reader.getCurrentKey()); + assertEquals( + new Text( + "TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTC"), + reader.getCurrentValue().getSequence()); assertFalse(reader.nextKeyValue()); reader.close(); } - } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestFastqInputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestFastqInputFormat.java index 2574495..d4d4f53 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestFastqInputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestFastqInputFormat.java @@ -22,534 +22,560 @@ package org.seqdoop.hadoop_bam; -import org.seqdoop.hadoop_bam.FastqInputFormat.FastqRecordReader; +import static org.junit.Assert.*; +import java.io.BufferedOutputStream; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileOutputStream; +import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; -import java.io.BufferedWriter; -import java.io.FileWriter; -import java.io.FileOutputStream; -import java.io.BufferedOutputStream; - -import org.junit.*; -import static org.junit.Assert.*; - -import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.junit.*; +import org.seqdoop.hadoop_bam.FastqInputFormat.FastqRecordReader; -public class TestFastqInputFormat -{ - public static final String oneFastq = - "@ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+\n" + - "###########################################################################################"; - - public static final String twoFastq = - "@ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+\n" + - "###########################################################################################\n" + - - "@ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1\n" + - "TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG\n" + - "+\n" + - "BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################"; - - public static final String illuminaFastq = - "@EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+\n" + - "##########################################################################################~"; - - public static final String illuminaFastqWithPhred64Quality = - "@EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+\n" + - "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; - - public static final String oneFastqWithoutRead = - "@ERR020229.10880 HWI-ST168_161:1:1:1373:2042\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" 
+ - "+\n" + - "###########################################################################################"; - - public static final String fastqWithIdTwice = - "@ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + - "###########################################################################################"; - - public static final String fastqWithAmpersandQuality = - "+lousy.id HWI-ST168_161:1:1:1373:2042/1\n" + - "@##########################################################################################\n" + - "@ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + - "###########################################################################################"; - - public static final String illuminaFastqNoFlowCellID = - "@EAS139:136::2:5:1000:12850 1:Y:18:ATCACG\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+\n" + - "###########################################################################################"; - - public static final String illuminaFastqNegativeXYPos = - "@EAS139:136:FC706VJ:2:5:-1000:-12850 1:Y:18:ATCACG\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+\n" + - "###########################################################################################"; - - public static final String illuminaFastqNoIndex = - "@EAS139:136::2:5:1000:12850 1:Y:18:\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+\n" + - "###########################################################################################"; - - public static final String twoFastqWithIllumina = - "@EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+\n" + - "###########################################################################################\n" + - - "@EAS139:136:FC706VJ:2:5:1000:12850 2:N:18:ATCACG\n" + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + - "+\n" + - "###########################################################################################\n" + - - "@EAS139:136:FC706VJ:2:5:1000:12850 3:N:18:ATCACG\n" + - "TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG\n" + - "+\n" + - "BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################"; - - private JobConf conf; - private FileSplit split; - private File tempFastq; - private File tempGz; - - private Text key; - private SequencedFragment fragment; - - @Before - public void setup() throws IOException - { - tempFastq = File.createTempFile("test_fastq_input_format", "fastq"); - tempGz = File.createTempFile("test_fastq_input_format", ".gz"); - conf = new JobConf(); - key = new Text(); - fragment = new SequencedFragment(); - } - - @After - public void tearDown() - { - tempFastq.delete(); - tempGz.delete(); - split = null; - } - - private void writeToTempFastq(String s) throws IOException - { - PrintWriter fastqOut = new PrintWriter( new BufferedWriter( new FileWriter(tempFastq) ) ); - fastqOut.write(s); - fastqOut.close(); - } - - private 
FastqRecordReader createReaderForOneFastq() throws IOException - { - writeToTempFastq(oneFastq); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, oneFastq.length(), null); - - return new FastqRecordReader(conf, split); - } - - @Test - public void testReadFromStart() throws IOException - { - FastqRecordReader reader = createReaderForOneFastq(); - - assertEquals(0, reader.getPos()); - assertEquals(0.0, reader.getProgress(), 0.01); - - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); - assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); - assertEquals("###########################################################################################", fragment.getQuality().toString()); - - assertEquals(oneFastq.length(), reader.getPos()); - assertEquals(1.0, reader.getProgress(), 0.01); - - retval = reader.next(key, fragment); - assertFalse(retval); - } - - @Test - public void testReadStartInMiddle() throws IOException - { - writeToTempFastq(twoFastq); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 10, twoFastq.length() - 10, null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - - assertEquals(oneFastq.length() + 1, reader.getPos()); // The start of the second record. We +1 for the \n that is not in oneFastq - assertEquals(0.0, reader.getProgress(), 0.01); - - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString()); - assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); - assertEquals("BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################", fragment.getQuality().toString()); - - assertEquals(twoFastq.length(), reader.getPos()); // now should be at the end of the data - assertEquals(1.0, reader.getProgress(), 0.01); - - retval = reader.next(key, fragment); - assertFalse(retval); - } - - @Test - public void testSliceEndsBeforeEndOfFile() throws IOException - { - writeToTempFastq(twoFastq); - // slice ends at position 10--i.e. somewhere in the first record. The second record should not be read. 
- split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, 10, null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); - - assertFalse("FastqRecordReader is reading a record that starts after the end of the slice", reader.next(key, fragment)); - } - - @Test - public void testGetReadNumFromName() throws IOException - { - FastqRecordReader reader = createReaderForOneFastq(); - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals(1, fragment.getRead().intValue()); - } - - @Test - public void testNameWithoutReadNum() throws IOException - { - writeToTempFastq(oneFastqWithoutRead); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, oneFastqWithoutRead.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertNull("Read is not null", fragment.getRead()); - } - - @Test - public void testIlluminaMetaInfo() throws IOException - { - writeToTempFastq(illuminaFastq); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastq.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - boolean found = reader.next(key, fragment); - assertTrue(found); - - assertEquals("EAS139", fragment.getInstrument()); - assertEquals(136, fragment.getRunNumber().intValue()); - assertEquals("FC706VJ", fragment.getFlowcellId()); - assertEquals(2, fragment.getLane().intValue()); - assertEquals(5, fragment.getTile().intValue()); - assertEquals(1000, fragment.getXpos().intValue()); - assertEquals(12850, fragment.getYpos().intValue()); - assertEquals(1, fragment.getRead().intValue()); - assertEquals(false, fragment.getFilterPassed().booleanValue()); - assertEquals(18, fragment.getControlNumber().intValue()); - assertEquals("ATCACG", fragment.getIndexSequence()); - } - - @Test - public void testIlluminaMetaInfoNullFC() throws IOException - { - writeToTempFastq(illuminaFastqNoFlowCellID); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastqNoFlowCellID.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - boolean found = reader.next(key, fragment); - assertTrue(found); - - assertEquals("EAS139", fragment.getInstrument()); - assertEquals(136, fragment.getRunNumber().intValue()); - assertEquals("", fragment.getFlowcellId()); - assertEquals(2, fragment.getLane().intValue()); - assertEquals(5, fragment.getTile().intValue()); - assertEquals(1000, fragment.getXpos().intValue()); - assertEquals(12850, fragment.getYpos().intValue()); - assertEquals(1, fragment.getRead().intValue()); - assertEquals(false, fragment.getFilterPassed().booleanValue()); - assertEquals(18, fragment.getControlNumber().intValue()); - assertEquals("ATCACG", fragment.getIndexSequence()); - } - - @Test - public void testIlluminaMetaInfoNegativeXYpos() throws IOException - { - writeToTempFastq(illuminaFastqNegativeXYPos); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastqNegativeXYPos.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - boolean found = reader.next(key, fragment); - assertTrue(found); - - assertEquals("EAS139", fragment.getInstrument()); - assertEquals(136, fragment.getRunNumber().intValue()); - assertEquals("FC706VJ", fragment.getFlowcellId()); - assertEquals(2, 
fragment.getLane().intValue()); - assertEquals(5, fragment.getTile().intValue()); - assertEquals(-1000, fragment.getXpos().intValue()); - assertEquals(-12850, fragment.getYpos().intValue()); - assertEquals(1, fragment.getRead().intValue()); - assertEquals(false, fragment.getFilterPassed().booleanValue()); - assertEquals(18, fragment.getControlNumber().intValue()); - assertEquals("ATCACG", fragment.getIndexSequence()); - } - - @Test - public void testOneIlluminaThenNot() throws IOException - { - writeToTempFastq(illuminaFastq + "\n" + oneFastq); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastq.length() + oneFastq.length() + 1, null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - - assertTrue(reader.next(key, fragment)); - assertEquals("EAS139", fragment.getInstrument()); - - assertTrue(reader.next(key, fragment)); - assertNull(fragment.getInstrument()); - - assertFalse(reader.next(key, fragment)); - } - - @Test - public void testOneNotThenIllumina() throws IOException - { - writeToTempFastq(oneFastq + "\n" + illuminaFastq); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastq.length() + oneFastq.length() + 1, null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - - assertTrue(reader.next(key, fragment)); - assertNull(fragment.getInstrument()); - - assertTrue(reader.next(key, fragment)); - assertNull(fragment.getInstrument()); - - assertFalse(reader.next(key, fragment)); - } - - @Test - public void testProgress() throws IOException - { - writeToTempFastq(twoFastq); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, twoFastq.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - assertEquals(0.0, reader.getProgress(), 0.01); - - reader.next(key, fragment); - assertEquals(0.5, reader.getProgress(), 0.01); - - reader.next(key, fragment); - assertEquals(1.0, reader.getProgress(), 0.01); - } - - @Test - public void testCreateKey() throws IOException - { - FastqRecordReader reader = createReaderForOneFastq(); - assertTrue(reader.createKey() instanceof Text); - } - - @Test - public void testCreateValue() throws IOException - { - FastqRecordReader reader = createReaderForOneFastq(); - assertTrue(reader.createValue() instanceof SequencedFragment); - } - - @Test - public void testClose() throws IOException - { - FastqRecordReader reader = createReaderForOneFastq(); - // doesn't really do anything but exercise the code - reader.close(); - } - - @Test - public void testReadFastqWithIdTwice() throws IOException - { - writeToTempFastq(fastqWithIdTwice); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, fastqWithIdTwice.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); - assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); - assertEquals("###########################################################################################", fragment.getQuality().toString()); - - retval = reader.next(key, fragment); - assertFalse(retval); - } - - @Test - public void testReadFastqWithAmpersandQuality() throws IOException - { - writeToTempFastq(fastqWithAmpersandQuality); - // split doesn't start at 0, forcing reader to advance looking for first complete record - split = new 
FileSplit(new Path(tempFastq.toURI().toString()), 3, fastqWithAmpersandQuality.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); - assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); - assertEquals("###########################################################################################", fragment.getQuality().toString()); - - retval = reader.next(key, fragment); - assertFalse(retval); - } - - @Test - public void testMakePositionMessage() throws IOException - { - writeToTempFastq(fastqWithIdTwice); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, fastqWithIdTwice.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - assertNotNull(reader.makePositionMessage()); - } - - @Test - public void testFastqWithIlluminaEncoding() throws IOException - { - conf.set("hbam.fastq-input.base-quality-encoding", "illumina"); - verifyInputQualityConfig(); - } - - @Test - public void testFastqWithIlluminaEncodingAndGenericInputConfig() throws IOException - { - conf.set("hbam.input.base-quality-encoding", "illumina"); - verifyInputQualityConfig(); - } - - private void verifyInputQualityConfig() throws IOException - { - writeToTempFastq(illuminaFastqWithPhred64Quality); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastqWithPhred64Quality.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - boolean found = reader.next(key, fragment); - assertTrue(found); - assertEquals("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", fragment.getQuality().toString()); - } - - @Test - public void testGzCompressedInput() throws IOException - { - // write gzip-compressed data - GzipCodec codec = new GzipCodec(); - PrintWriter fastqOut = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) ); - fastqOut.write(twoFastq); - fastqOut.close(); - - // now try to read it - split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoFastq.length(), null); - FastqRecordReader reader = new FastqRecordReader(conf, split); - - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); - assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); - - retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString()); - assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); - } - - @Test(expected=RuntimeException.class) - public void testCompressedSplit() throws IOException - { - // write gzip-compressed data - GzipCodec codec = new GzipCodec(); - PrintWriter fastqOut = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) ); - fastqOut.write(twoFastq); - fastqOut.close(); - - // now try to read it starting from the middle - split = new FileSplit(new Path(tempGz.toURI().toString()), 10, twoFastq.length(), null); - FastqRecordReader reader = new FastqRecordReader(conf, split); - } - - 
@Test - public void testIlluminaNoIndex() throws IOException - { - writeToTempFastq(illuminaFastqNoIndex); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastqNoIndex.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - boolean found = reader.next(key, fragment); - assertTrue(found); - - // ensure all meta-data was picked up - assertEquals("EAS139", fragment.getInstrument()); - assertEquals(136, fragment.getRunNumber().intValue()); - // now verify the index - assertEquals("", fragment.getIndexSequence()); - } - - @Test - public void testSkipFailedQC() throws IOException - { - conf.set("hbam.fastq-input.filter-failed-qc", "true"); - verifySkipFailedQC(); - } - - @Test - public void testSkipFailedQCGenericConfig() throws IOException - { - conf.set("hbam.input.filter-failed-qc", "true"); - verifySkipFailedQC(); - } - - private void verifySkipFailedQC() throws IOException - { - writeToTempFastq(twoFastqWithIllumina); - split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, twoFastqWithIllumina.length(), null); - - FastqRecordReader reader = new FastqRecordReader(conf, split); - boolean found = reader.next(key, fragment); - assertTrue(found); - assertEquals(2, (int)fragment.getRead()); - - found = reader.next(key, fragment); - assertTrue(found); - assertEquals(3, (int)fragment.getRead()); - - found = reader.next(key, fragment); - assertFalse(found); - } - - public static void main(String args[]) { - org.junit.runner.JUnitCore.main(TestFastqInputFormat.class.getName()); - } +public class TestFastqInputFormat { + + public static final String oneFastq = + "@ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+\n" + + "###########################################################################################"; + + public static final String twoFastq = + "@ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+\n" + + "###########################################################################################\n" + + "@ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1\n" + + "TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG\n" + + "+\n" + + "BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################"; + + public static final String illuminaFastq = + "@EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+\n" + + "##########################################################################################~"; + + public static final String illuminaFastqWithPhred64Quality = + "@EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+\n" + + "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"; + + public static final String oneFastqWithoutRead = + "@ERR020229.10880 HWI-ST168_161:1:1:1373:2042\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+\n" + + "###########################################################################################"; + + public static final String fastqWithIdTwice = + "@ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + + 
"TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + + "###########################################################################################"; + + public static final String fastqWithAmpersandQuality = + "+lousy.id HWI-ST168_161:1:1:1373:2042/1\n" + + "@##########################################################################################\n" + + "@ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1\n" + + "###########################################################################################"; + + public static final String illuminaFastqNoFlowCellID = + "@EAS139:136::2:5:1000:12850 1:Y:18:ATCACG\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+\n" + + "###########################################################################################"; + + public static final String illuminaFastqNegativeXYPos = + "@EAS139:136:FC706VJ:2:5:-1000:-12850 1:Y:18:ATCACG\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+\n" + + "###########################################################################################"; + + public static final String illuminaFastqNoIndex = + "@EAS139:136::2:5:1000:12850 1:Y:18:\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+\n" + + "###########################################################################################"; + + public static final String twoFastqWithIllumina = + "@EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+\n" + + "###########################################################################################\n" + + "@EAS139:136:FC706VJ:2:5:1000:12850 2:N:18:ATCACG\n" + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\n" + + "+\n" + + "###########################################################################################\n" + + "@EAS139:136:FC706VJ:2:5:1000:12850 3:N:18:ATCACG\n" + + "TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG\n" + + "+\n" + + "BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################"; + + private JobConf conf; + private FileSplit split; + private File tempFastq; + private File tempGz; + + private Text key; + private SequencedFragment fragment; + + public static void main(String args[]) { + org.junit.runner.JUnitCore.main(TestFastqInputFormat.class.getName()); + } + + @Before + public void setup() throws IOException { + tempFastq = File.createTempFile("test_fastq_input_format", "fastq"); + tempGz = File.createTempFile("test_fastq_input_format", ".gz"); + conf = new JobConf(); + key = new Text(); + fragment = new SequencedFragment(); + } + + @After + public void tearDown() { + tempFastq.delete(); + tempGz.delete(); + split = null; + } + + private void writeToTempFastq(String s) throws IOException { + PrintWriter fastqOut = new PrintWriter(new BufferedWriter(new FileWriter(tempFastq))); + fastqOut.write(s); + fastqOut.close(); + } + + private FastqRecordReader createReaderForOneFastq() throws IOException { + writeToTempFastq(oneFastq); + split = new 
FileSplit(new Path(tempFastq.toURI().toString()), 0, oneFastq.length(), null); + + return new FastqRecordReader(conf, split); + } + + @Test + public void testReadFromStart() throws IOException { + FastqRecordReader reader = createReaderForOneFastq(); + + assertEquals(0, reader.getPos()); + assertEquals(0.0, reader.getProgress(), 0.01); + + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); + assertEquals( + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", + fragment.getSequence().toString()); + assertEquals( + "###########################################################################################", + fragment.getQuality().toString()); + + assertEquals(oneFastq.length(), reader.getPos()); + assertEquals(1.0, reader.getProgress(), 0.01); + + retval = reader.next(key, fragment); + assertFalse(retval); + } + + @Test + public void testReadStartInMiddle() throws IOException { + writeToTempFastq(twoFastq); + split = new FileSplit(new Path(tempFastq.toURI().toString()), 10, twoFastq.length() - 10, null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + + assertEquals( + oneFastq.length() + 1, + reader + .getPos()); // The start of the second record. We +1 for the \n that is not in oneFastq + assertEquals(0.0, reader.getProgress(), 0.01); + + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString()); + assertEquals( + "TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", + fragment.getSequence().toString()); + assertEquals( + "BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################", + fragment.getQuality().toString()); + + assertEquals(twoFastq.length(), reader.getPos()); // now should be at the end of the data + assertEquals(1.0, reader.getProgress(), 0.01); + + retval = reader.next(key, fragment); + assertFalse(retval); + } + + @Test + public void testSliceEndsBeforeEndOfFile() throws IOException { + writeToTempFastq(twoFastq); + // slice ends at position 10--i.e. somewhere in the first record. The second record should not + // be read. 
+ split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, 10, null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); + + assertFalse( + "FastqRecordReader is reading a record that starts after the end of the slice", + reader.next(key, fragment)); + } + + @Test + public void testGetReadNumFromName() throws IOException { + FastqRecordReader reader = createReaderForOneFastq(); + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals(1, fragment.getRead().intValue()); + } + + @Test + public void testNameWithoutReadNum() throws IOException { + writeToTempFastq(oneFastqWithoutRead); + split = + new FileSplit( + new Path(tempFastq.toURI().toString()), 0, oneFastqWithoutRead.length(), null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertNull("Read is not null", fragment.getRead()); + } + + @Test + public void testIlluminaMetaInfo() throws IOException { + writeToTempFastq(illuminaFastq); + split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastq.length(), null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + boolean found = reader.next(key, fragment); + assertTrue(found); + + assertEquals("EAS139", fragment.getInstrument()); + assertEquals(136, fragment.getRunNumber().intValue()); + assertEquals("FC706VJ", fragment.getFlowcellId()); + assertEquals(2, fragment.getLane().intValue()); + assertEquals(5, fragment.getTile().intValue()); + assertEquals(1000, fragment.getXpos().intValue()); + assertEquals(12850, fragment.getYpos().intValue()); + assertEquals(1, fragment.getRead().intValue()); + assertEquals(false, fragment.getFilterPassed().booleanValue()); + assertEquals(18, fragment.getControlNumber().intValue()); + assertEquals("ATCACG", fragment.getIndexSequence()); + } + + @Test + public void testIlluminaMetaInfoNullFC() throws IOException { + writeToTempFastq(illuminaFastqNoFlowCellID); + split = + new FileSplit( + new Path(tempFastq.toURI().toString()), 0, illuminaFastqNoFlowCellID.length(), null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + boolean found = reader.next(key, fragment); + assertTrue(found); + + assertEquals("EAS139", fragment.getInstrument()); + assertEquals(136, fragment.getRunNumber().intValue()); + assertEquals("", fragment.getFlowcellId()); + assertEquals(2, fragment.getLane().intValue()); + assertEquals(5, fragment.getTile().intValue()); + assertEquals(1000, fragment.getXpos().intValue()); + assertEquals(12850, fragment.getYpos().intValue()); + assertEquals(1, fragment.getRead().intValue()); + assertEquals(false, fragment.getFilterPassed().booleanValue()); + assertEquals(18, fragment.getControlNumber().intValue()); + assertEquals("ATCACG", fragment.getIndexSequence()); + } + + @Test + public void testIlluminaMetaInfoNegativeXYpos() throws IOException { + writeToTempFastq(illuminaFastqNegativeXYPos); + split = + new FileSplit( + new Path(tempFastq.toURI().toString()), 0, illuminaFastqNegativeXYPos.length(), null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + boolean found = reader.next(key, fragment); + assertTrue(found); + + assertEquals("EAS139", fragment.getInstrument()); + assertEquals(136, fragment.getRunNumber().intValue()); + assertEquals("FC706VJ", fragment.getFlowcellId()); + 
assertEquals(2, fragment.getLane().intValue()); + assertEquals(5, fragment.getTile().intValue()); + assertEquals(-1000, fragment.getXpos().intValue()); + assertEquals(-12850, fragment.getYpos().intValue()); + assertEquals(1, fragment.getRead().intValue()); + assertEquals(false, fragment.getFilterPassed().booleanValue()); + assertEquals(18, fragment.getControlNumber().intValue()); + assertEquals("ATCACG", fragment.getIndexSequence()); + } + + @Test + public void testOneIlluminaThenNot() throws IOException { + writeToTempFastq(illuminaFastq + "\n" + oneFastq); + split = + new FileSplit( + new Path(tempFastq.toURI().toString()), + 0, + illuminaFastq.length() + oneFastq.length() + 1, + null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + + assertTrue(reader.next(key, fragment)); + assertEquals("EAS139", fragment.getInstrument()); + + assertTrue(reader.next(key, fragment)); + assertNull(fragment.getInstrument()); + + assertFalse(reader.next(key, fragment)); + } + + @Test + public void testOneNotThenIllumina() throws IOException { + writeToTempFastq(oneFastq + "\n" + illuminaFastq); + split = + new FileSplit( + new Path(tempFastq.toURI().toString()), + 0, + illuminaFastq.length() + oneFastq.length() + 1, + null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + + assertTrue(reader.next(key, fragment)); + assertNull(fragment.getInstrument()); + + assertTrue(reader.next(key, fragment)); + assertNull(fragment.getInstrument()); + + assertFalse(reader.next(key, fragment)); + } + + @Test + public void testProgress() throws IOException { + writeToTempFastq(twoFastq); + split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, twoFastq.length(), null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + assertEquals(0.0, reader.getProgress(), 0.01); + + reader.next(key, fragment); + assertEquals(0.5, reader.getProgress(), 0.01); + + reader.next(key, fragment); + assertEquals(1.0, reader.getProgress(), 0.01); + } + + @Test + public void testCreateKey() throws IOException { + FastqRecordReader reader = createReaderForOneFastq(); + assertTrue(reader.createKey() instanceof Text); + } + + @Test + public void testCreateValue() throws IOException { + FastqRecordReader reader = createReaderForOneFastq(); + assertTrue(reader.createValue() instanceof SequencedFragment); + } + + @Test + public void testClose() throws IOException { + FastqRecordReader reader = createReaderForOneFastq(); + // doesn't really do anything but exercise the code + reader.close(); + } + + @Test + public void testReadFastqWithIdTwice() throws IOException { + writeToTempFastq(fastqWithIdTwice); + split = + new FileSplit(new Path(tempFastq.toURI().toString()), 0, fastqWithIdTwice.length(), null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); + assertEquals( + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", + fragment.getSequence().toString()); + assertEquals( + "###########################################################################################", + fragment.getQuality().toString()); + + retval = reader.next(key, fragment); + assertFalse(retval); + } + + @Test + public void testReadFastqWithAmpersandQuality() throws IOException { + writeToTempFastq(fastqWithAmpersandQuality); + // split doesn't start at 0, forcing reader to advance looking for first 
complete record + split = + new FileSplit( + new Path(tempFastq.toURI().toString()), 3, fastqWithAmpersandQuality.length(), null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); + assertEquals( + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", + fragment.getSequence().toString()); + assertEquals( + "###########################################################################################", + fragment.getQuality().toString()); + + retval = reader.next(key, fragment); + assertFalse(retval); + } + + @Test + public void testMakePositionMessage() throws IOException { + writeToTempFastq(fastqWithIdTwice); + split = + new FileSplit(new Path(tempFastq.toURI().toString()), 0, fastqWithIdTwice.length(), null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + assertNotNull(reader.makePositionMessage()); + } + + @Test + public void testFastqWithIlluminaEncoding() throws IOException { + conf.set("hbam.fastq-input.base-quality-encoding", "illumina"); + verifyInputQualityConfig(); + } + + @Test + public void testFastqWithIlluminaEncodingAndGenericInputConfig() throws IOException { + conf.set("hbam.input.base-quality-encoding", "illumina"); + verifyInputQualityConfig(); + } + + private void verifyInputQualityConfig() throws IOException { + writeToTempFastq(illuminaFastqWithPhred64Quality); + split = + new FileSplit( + new Path(tempFastq.toURI().toString()), + 0, + illuminaFastqWithPhred64Quality.length(), + null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + boolean found = reader.next(key, fragment); + assertTrue(found); + assertEquals( + "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC", + fragment.getQuality().toString()); + } + + @Test + public void testGzCompressedInput() throws IOException { + // write gzip-compressed data + GzipCodec codec = new GzipCodec(); + PrintWriter fastqOut = + new PrintWriter( + new BufferedOutputStream(codec.createOutputStream(new FileOutputStream(tempGz)))); + fastqOut.write(twoFastq); + fastqOut.close(); + + // now try to read it + split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoFastq.length(), null); + FastqRecordReader reader = new FastqRecordReader(conf, split); + + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); + assertEquals( + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", + fragment.getSequence().toString()); + + retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString()); + assertEquals( + "TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", + fragment.getSequence().toString()); + } + + @Test(expected = RuntimeException.class) + public void testCompressedSplit() throws IOException { + // write gzip-compressed data + GzipCodec codec = new GzipCodec(); + PrintWriter fastqOut = + new PrintWriter( + new BufferedOutputStream(codec.createOutputStream(new FileOutputStream(tempGz)))); + fastqOut.write(twoFastq); + fastqOut.close(); + + // now try to read it starting from the middle + split = new FileSplit(new Path(tempGz.toURI().toString()), 10, twoFastq.length(), null); + 
FastqRecordReader reader = new FastqRecordReader(conf, split); + } + + @Test + public void testIlluminaNoIndex() throws IOException { + writeToTempFastq(illuminaFastqNoIndex); + split = + new FileSplit( + new Path(tempFastq.toURI().toString()), 0, illuminaFastqNoIndex.length(), null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + boolean found = reader.next(key, fragment); + assertTrue(found); + + // ensure all meta-data was picked up + assertEquals("EAS139", fragment.getInstrument()); + assertEquals(136, fragment.getRunNumber().intValue()); + // now verify the index + assertEquals("", fragment.getIndexSequence()); + } + + @Test + public void testSkipFailedQC() throws IOException { + conf.set("hbam.fastq-input.filter-failed-qc", "true"); + verifySkipFailedQC(); + } + + @Test + public void testSkipFailedQCGenericConfig() throws IOException { + conf.set("hbam.input.filter-failed-qc", "true"); + verifySkipFailedQC(); + } + + private void verifySkipFailedQC() throws IOException { + writeToTempFastq(twoFastqWithIllumina); + split = + new FileSplit( + new Path(tempFastq.toURI().toString()), 0, twoFastqWithIllumina.length(), null); + + FastqRecordReader reader = new FastqRecordReader(conf, split); + boolean found = reader.next(key, fragment); + assertTrue(found); + assertEquals(2, (int) fragment.getRead()); + + found = reader.next(key, fragment); + assertTrue(found); + assertEquals(3, (int) fragment.getRead()); + + found = reader.next(key, fragment); + assertFalse(found); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestFastqOutputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestFastqOutputFormat.java index c57085d..ed6543e 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestFastqOutputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestFastqOutputFormat.java @@ -22,185 +22,170 @@ package org.seqdoop.hadoop_bam; -import org.seqdoop.hadoop_bam.FastqOutputFormat.FastqRecordWriter; -import org.seqdoop.hadoop_bam.SequencedFragment; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; - -import java.io.DataOutputStream; -import java.io.ByteArrayOutputStream; - -import org.junit.*; import static org.junit.Assert.*; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; +import org.junit.*; +import org.seqdoop.hadoop_bam.FastqOutputFormat.FastqRecordWriter; -public class TestFastqOutputFormat -{ - private SequencedFragment fragment; - - private ByteArrayOutputStream outputBuffer; - private DataOutputStream dataOutput; - private FastqRecordWriter writer; - - @Before - public void setup() throws IOException - { - fragment = new SequencedFragment(); - fragment.setInstrument("instrument"); - fragment.setRunNumber(1); - fragment.setFlowcellId("xyz"); - fragment.setLane(2); - fragment.setTile(1001); - fragment.setXpos(10000); - fragment.setYpos(9999); - fragment.setRead(1); - fragment.setFilterPassed(true); - fragment.setControlNumber(33); - fragment.setIndexSequence("CATCAT"); - fragment.setSequence(new Text("AAAAAAAAAA")); - fragment.setQuality(new Text("##########")); - - outputBuffer = new ByteArrayOutputStream(); - dataOutput = new DataOutputStream(outputBuffer); - writer = new FastqRecordWriter(new Configuration(), dataOutput); - } - - @Test - public void testSimple() throws IOException - { - writer.write(null, fragment); - writer.close(null); - - String[] lines = new String(outputBuffer.toByteArray(), 
"US-ASCII").split("\n"); - assertEquals(4, lines.length); - - String idLine = lines[0]; - assertTrue(idLine.startsWith("@")); - - compareMetadata(fragment, idLine); - - assertEquals(fragment.getSequence().toString(), lines[1]); - assertEquals("+", lines[2]); - assertEquals(fragment.getQuality().toString(), lines[3]); - } - - @Test - public void testNullControlNumber() throws IOException - { - fragment.setControlNumber(null); - writer.write(null, fragment); - writer.close(null); - - String[] lines = new String(outputBuffer.toByteArray(), "US-ASCII").split("\n"); - assertEquals(4, lines.length); - - String idLine = lines[0]; - assertTrue(idLine.startsWith("@")); - - fragment.setControlNumber(0); // when null 0 should be written - compareMetadata(fragment, idLine); - } - - @Test - public void testNullFilter() throws IOException - { - fragment.setFilterPassed(null); - writer.write(null, fragment); - writer.close(null); - - String[] lines = new String(outputBuffer.toByteArray(), "US-ASCII").split("\n"); - assertEquals(4, lines.length); - - String idLine = lines[0]; - assertTrue(idLine.startsWith("@")); - - fragment.setFilterPassed(true); // when filter not available then it passes - compareMetadata(fragment, idLine); - } - - @Test - public void testCustomId() throws IOException - { - String customKey = "hello"; - writer.write(new Text(customKey), fragment); - writer.close(null); - - String[] lines = new String(outputBuffer.toByteArray(), "US-ASCII").split("\n"); - assertEquals(4, lines.length); - - String idLine = lines[0]; - assertTrue(idLine.startsWith("@")); - assertEquals(customKey, idLine.substring(1)); - } - - @Test - public void testBaseQualitiesInIllumina() throws IOException - { - Configuration conf = new Configuration(); - conf.set("hbam.fastq-output.base-quality-encoding", "illumina"); - writer.setConf(conf); - - // ensure sanger qualities are converted to illumina - String seq = "AAAAAAAAAA"; - String qual = "##########"; - - fragment.setSequence(new Text(seq)); - fragment.setQuality(new Text(qual)); - - writer.write(null, fragment); - writer.close(null); - - String[] lines = new String(outputBuffer.toByteArray(), "US-ASCII").split("\n"); - assertEquals(qual.replace("#", "B"), lines[3]); - } - - @Test - public void testConfigureOutputInSanger() throws IOException - { - Configuration conf = new Configuration(); - conf.set("hbam.fastq-output.base-quality-encoding", "sanger"); - writer.setConf(conf); - testSimple(); - } - - @Test(expected=RuntimeException.class) - public void testBadConfig() throws IOException - { - Configuration conf = new Configuration(); - conf.set("hbam.fastq-output.base-quality-encoding", "blalbal"); - writer.setConf(conf); - } - - @Test - public void testClose() throws IOException - { - // doesn't really do anything but exercise the code - writer.close(null); - } - - private static void compareMetadata(SequencedFragment fragment, String idLine) - { - idLine = idLine.substring(1); - String[] pieces = idLine.split(" ")[0].split(":"); // first part: location on flowcell - assertEquals(fragment.getInstrument(), pieces[0]); - assertEquals(fragment.getRunNumber().toString(), pieces[1]); - assertEquals(fragment.getFlowcellId(), pieces[2]); - assertEquals(fragment.getLane().toString(), pieces[3]); - assertEquals(fragment.getTile().toString(), pieces[4]); - assertEquals(fragment.getXpos().toString(), pieces[5]); - assertEquals(fragment.getYpos().toString(), pieces[6]); - - pieces = idLine.split(" ")[1].split(":"); // second part - 
assertEquals(fragment.getRead().toString(), pieces[0]); - assertEquals(fragment.getFilterPassed() ? "N" : "Y", pieces[1]); - assertEquals(fragment.getControlNumber().toString(), pieces[2]); - assertEquals(fragment.getIndexSequence().toString(), pieces[3]); - } - - public static void main(String args[]) { - org.junit.runner.JUnitCore.main(TestFastqOutputFormat.class.getName()); - } +public class TestFastqOutputFormat { + + private SequencedFragment fragment; + + private ByteArrayOutputStream outputBuffer; + private DataOutputStream dataOutput; + private FastqRecordWriter writer; + + private static void compareMetadata(SequencedFragment fragment, String idLine) { + idLine = idLine.substring(1); + String[] pieces = idLine.split(" ")[0].split(":"); // first part: location on flowcell + assertEquals(fragment.getInstrument(), pieces[0]); + assertEquals(fragment.getRunNumber().toString(), pieces[1]); + assertEquals(fragment.getFlowcellId(), pieces[2]); + assertEquals(fragment.getLane().toString(), pieces[3]); + assertEquals(fragment.getTile().toString(), pieces[4]); + assertEquals(fragment.getXpos().toString(), pieces[5]); + assertEquals(fragment.getYpos().toString(), pieces[6]); + + pieces = idLine.split(" ")[1].split(":"); // second part + assertEquals(fragment.getRead().toString(), pieces[0]); + assertEquals(fragment.getFilterPassed() ? "N" : "Y", pieces[1]); + assertEquals(fragment.getControlNumber().toString(), pieces[2]); + assertEquals(fragment.getIndexSequence().toString(), pieces[3]); + } + + public static void main(String args[]) { + org.junit.runner.JUnitCore.main(TestFastqOutputFormat.class.getName()); + } + + @Before + public void setup() throws IOException { + fragment = new SequencedFragment(); + fragment.setInstrument("instrument"); + fragment.setRunNumber(1); + fragment.setFlowcellId("xyz"); + fragment.setLane(2); + fragment.setTile(1001); + fragment.setXpos(10000); + fragment.setYpos(9999); + fragment.setRead(1); + fragment.setFilterPassed(true); + fragment.setControlNumber(33); + fragment.setIndexSequence("CATCAT"); + fragment.setSequence(new Text("AAAAAAAAAA")); + fragment.setQuality(new Text("##########")); + + outputBuffer = new ByteArrayOutputStream(); + dataOutput = new DataOutputStream(outputBuffer); + writer = new FastqRecordWriter(new Configuration(), dataOutput); + } + + @Test + public void testSimple() throws IOException { + writer.write(null, fragment); + writer.close(null); + + String[] lines = new String(outputBuffer.toByteArray(), "US-ASCII").split("\n"); + assertEquals(4, lines.length); + + String idLine = lines[0]; + assertTrue(idLine.startsWith("@")); + + compareMetadata(fragment, idLine); + + assertEquals(fragment.getSequence().toString(), lines[1]); + assertEquals("+", lines[2]); + assertEquals(fragment.getQuality().toString(), lines[3]); + } + + @Test + public void testNullControlNumber() throws IOException { + fragment.setControlNumber(null); + writer.write(null, fragment); + writer.close(null); + + String[] lines = new String(outputBuffer.toByteArray(), "US-ASCII").split("\n"); + assertEquals(4, lines.length); + + String idLine = lines[0]; + assertTrue(idLine.startsWith("@")); + + fragment.setControlNumber(0); // when null 0 should be written + compareMetadata(fragment, idLine); + } + + @Test + public void testNullFilter() throws IOException { + fragment.setFilterPassed(null); + writer.write(null, fragment); + writer.close(null); + + String[] lines = new String(outputBuffer.toByteArray(), "US-ASCII").split("\n"); + assertEquals(4, lines.length); + + String 
idLine = lines[0]; + assertTrue(idLine.startsWith("@")); + + fragment.setFilterPassed(true); // when filter not available then it passes + compareMetadata(fragment, idLine); + } + + @Test + public void testCustomId() throws IOException { + String customKey = "hello"; + writer.write(new Text(customKey), fragment); + writer.close(null); + + String[] lines = new String(outputBuffer.toByteArray(), "US-ASCII").split("\n"); + assertEquals(4, lines.length); + + String idLine = lines[0]; + assertTrue(idLine.startsWith("@")); + assertEquals(customKey, idLine.substring(1)); + } + + @Test + public void testBaseQualitiesInIllumina() throws IOException { + Configuration conf = new Configuration(); + conf.set("hbam.fastq-output.base-quality-encoding", "illumina"); + writer.setConf(conf); + + // ensure sanger qualities are converted to illumina + String seq = "AAAAAAAAAA"; + String qual = "##########"; + + fragment.setSequence(new Text(seq)); + fragment.setQuality(new Text(qual)); + + writer.write(null, fragment); + writer.close(null); + + String[] lines = new String(outputBuffer.toByteArray(), "US-ASCII").split("\n"); + assertEquals(qual.replace("#", "B"), lines[3]); + } + + @Test + public void testConfigureOutputInSanger() throws IOException { + Configuration conf = new Configuration(); + conf.set("hbam.fastq-output.base-quality-encoding", "sanger"); + writer.setConf(conf); + testSimple(); + } + + @Test(expected = RuntimeException.class) + public void testBadConfig() throws IOException { + Configuration conf = new Configuration(); + conf.set("hbam.fastq-output.base-quality-encoding", "blalbal"); + writer.setConf(conf); + } + + @Test + public void testClose() throws IOException { + // doesn't really do anything but exercise the code + writer.close(null); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestLineReader.java b/src/test/java/org/seqdoop/hadoop_bam/TestLineReader.java index 6924991..6e94a83 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestLineReader.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestLineReader.java @@ -22,69 +22,60 @@ package org.seqdoop.hadoop_bam; -import org.junit.*; import static org.junit.Assert.*; -import org.seqdoop.hadoop_bam.LineReader; - -import org.apache.hadoop.io.Text; - import java.io.ByteArrayInputStream; import java.io.IOException; +import org.apache.hadoop.io.Text; +import org.junit.*; -public class TestLineReader -{ - public static final String input10 = "0123456789"; - public static final String input22 = "0123456789\n0987654321\n"; +public class TestLineReader { - private LineReader reader; - private Text dest = new Text(); + public static final String input10 = "0123456789"; + public static final String input22 = "0123456789\n0987654321\n"; - @Test - public void testReadBufferedLine() throws IOException - { - reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 22); - reader.readLine(dest); - assertEquals("0123456789", dest.toString()); - } + private LineReader reader; + private Text dest = new Text(); - @Test - public void testSkipOnBufferedLine() throws IOException - { - reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 22); - long skipped = reader.skip(1); - assertEquals(1, skipped); - reader.readLine(dest); - assertEquals("123456789", dest.toString()); - } + @Test + public void testReadBufferedLine() throws IOException { + reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 22); + reader.readLine(dest); + assertEquals("0123456789", dest.toString()); + } - @Test - public void 
testReadBeyondBuffer() throws IOException - { - reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 5); - reader.readLine(dest); - assertEquals("0123456789", dest.toString()); - } + @Test + public void testSkipOnBufferedLine() throws IOException { + reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 22); + long skipped = reader.skip(1); + assertEquals(1, skipped); + reader.readLine(dest); + assertEquals("123456789", dest.toString()); + } - @Test - public void testSkipBeyondBuffer() throws IOException - { - reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 5); - long skipped = reader.skip(11); - assertEquals(11, skipped); - reader.readLine(dest); - assertEquals("0987654321", dest.toString()); - } + @Test + public void testReadBeyondBuffer() throws IOException { + reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 5); + reader.readLine(dest); + assertEquals("0123456789", dest.toString()); + } - @Test - public void testSkipBeyondInput() throws IOException - { - reader = new LineReader(new ByteArrayInputStream(input10.getBytes()), 5); - long skipped = reader.skip(11); - assertEquals(10, skipped); + @Test + public void testSkipBeyondBuffer() throws IOException { + reader = new LineReader(new ByteArrayInputStream(input22.getBytes()), 5); + long skipped = reader.skip(11); + assertEquals(11, skipped); + reader.readLine(dest); + assertEquals("0987654321", dest.toString()); + } - skipped = reader.skip(11); - assertEquals(0, skipped); - } + @Test + public void testSkipBeyondInput() throws IOException { + reader = new LineReader(new ByteArrayInputStream(input10.getBytes()), 5); + long skipped = reader.skip(11); + assertEquals(10, skipped); + skipped = reader.skip(11); + assertEquals(0, skipped); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestQseqInputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestQseqInputFormat.java index 359a1e2..50ed517 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestQseqInputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestQseqInputFormat.java @@ -22,363 +22,359 @@ package org.seqdoop.hadoop_bam; -import org.seqdoop.hadoop_bam.QseqInputFormat.QseqRecordReader; -import org.seqdoop.hadoop_bam.SequencedFragment; -import org.seqdoop.hadoop_bam.FormatException; +import static org.junit.Assert.*; +import java.io.BufferedOutputStream; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileOutputStream; +import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; -import java.io.BufferedWriter; -import java.io.FileWriter; -import java.io.FileOutputStream; -import java.io.BufferedOutputStream; - -import org.junit.*; -import static org.junit.Assert.*; - -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.junit.*; +import org.seqdoop.hadoop_bam.QseqInputFormat.QseqRecordReader; -public class TestQseqInputFormat -{ - public static final String oneQseq = - "ERR020229 10880 1 1 1373 2042 0 1 " + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\t" + - "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 1"; - - 
public static final String twoQseq = - "ERR020229 10880 1 1 1373 2042 0 1 " + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\t" + - "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 0\n" + - "ERR020229 10883 1 1 1796 2044 0 2 " + - "TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG\t" + - "DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD 1"; - - public static final String illuminaQseq = - "EAS139 136 2 5 1000 12850 ATCACG 1 " + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\t" + - "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 0"; - - public static final String nQseq = - "ERR020229 10880 1 1 1373 2042 0 1 " + - "...........................................................................................\t" + - "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 0"; - - - public static final String sangerQseq = - "EAS139 136 2 5 1000 12850 ATCACG 1 " + - "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\t" + - "########################################################################################### 0"; - - public static final String indexWithUnknown = - "EAS139 136 2 5 1000 12850 ATC..G 1 " + - "TTGGATGATAGGGATTATTTGACTCGAATAT\t" + - "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB\t0"; - - private JobConf conf; - private FileSplit split; - private File tempQseq; - private File tempGz; - - private Text key; - private SequencedFragment fragment; - - @Before - public void setup() throws IOException - { - tempQseq = File.createTempFile("test_qseq_input_format", "qseq"); - tempGz = File.createTempFile("test_qseq_input_format", ".gz"); - conf = new JobConf(); - key = new Text(); - fragment = new SequencedFragment(); - } - - @After - public void tearDown() - { - tempQseq.delete(); - tempGz.delete(); - split = null; - } - - private void writeToTempQseq(String s) throws IOException - { - PrintWriter qseqOut = new PrintWriter( new BufferedWriter( new FileWriter(tempQseq) ) ); - qseqOut.write(s); - qseqOut.close(); - } - - private QseqRecordReader createReaderForOneQseq() throws IOException - { - writeToTempQseq(oneQseq); - split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, oneQseq.length(), null); - - return new QseqRecordReader(conf, split); - } - - @Test - public void testReadFromStart() throws IOException - { - QseqRecordReader reader = createReaderForOneQseq(); - - assertEquals(0, reader.getPos()); - assertEquals(0.0, reader.getProgress(), 0.01); - - boolean retval = reader.next(key, fragment); - assertTrue(retval); -//System.err.println("in testReadFromStart quality: " + fragment.getQuality().toString()); - assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString()); - assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); - assertEquals("###########################################################################################", fragment.getQuality().toString()); - - assertEquals(oneQseq.length(), reader.getPos()); - assertEquals(1.0, reader.getProgress(), 0.01); - - retval = reader.next(key, fragment); - assertFalse(retval); - } - - @Test - public void testReadStartInMiddle() throws IOException - { - writeToTempQseq(twoQseq); - split = new FileSplit(new 
Path(tempQseq.toURI().toString()), 10, twoQseq.length() - 10, null); - - QseqRecordReader reader = new QseqRecordReader(conf, split); - - assertEquals(oneQseq.length() + 1, reader.getPos()); // The start of the second record. We +1 for the \n that is not in oneQseq - assertEquals(0.0, reader.getProgress(), 0.01); - - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString()); - assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); - assertEquals("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%", fragment.getQuality().toString()); - - assertEquals(twoQseq.length(), reader.getPos()); // now should be at the end of the data - assertEquals(1.0, reader.getProgress(), 0.01); - - retval = reader.next(key, fragment); - assertFalse(retval); - } - - @Test - public void testSliceEndsBeforeEndOfFile() throws IOException - { - writeToTempQseq(twoQseq); - // slice ends at position 10--i.e. somewhere in the first record. The second record should not be read. - split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, 10, null); - - QseqRecordReader reader = new QseqRecordReader(conf, split); - - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString()); - - assertFalse("QseqRecordReader is reading a record that starts after the end of the slice", reader.next(key, fragment)); - } - - @Test - public void testIlluminaMetaInfo() throws IOException - { - writeToTempQseq(illuminaQseq); - split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, illuminaQseq.length(), null); - - QseqRecordReader reader = new QseqRecordReader(conf, split); - boolean found = reader.next(key, fragment); - assertTrue(found); - - assertEquals("EAS139", fragment.getInstrument()); - assertEquals(136, fragment.getRunNumber().intValue()); - assertNull("flowcell id not null", fragment.getFlowcellId()); - assertEquals(2, fragment.getLane().intValue()); - assertEquals(5, fragment.getTile().intValue()); - assertEquals(1000, fragment.getXpos().intValue()); - assertEquals(12850, fragment.getYpos().intValue()); - assertEquals(1, fragment.getRead().intValue()); - assertEquals(false, fragment.getFilterPassed().booleanValue()); - assertNull("control number not null", fragment.getControlNumber()); - assertEquals("ATCACG", fragment.getIndexSequence()); - } - - @Test - public void testNs() throws IOException - { - writeToTempQseq(nQseq); - split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, nQseq.length(), null); - - QseqRecordReader reader = new QseqRecordReader(conf, split); - boolean found = reader.next(key, fragment); - assertTrue(found); - assertEquals("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", fragment.getSequence().toString()); - } - - @Test - public void testConvertDotInIndexSequence() throws IOException - { - writeToTempQseq(indexWithUnknown); - split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, indexWithUnknown.length(), null); - - QseqRecordReader reader = new QseqRecordReader(conf, split); - boolean found = reader.next(key, fragment); - assertTrue(found); - assertEquals("ATCNNG", fragment.getIndexSequence()); - } - - @Test(expected=FormatException.class) - public void testSangerQualities() throws IOException - { - writeToTempQseq(sangerQseq); - split = new FileSplit(new 
Path(tempQseq.toURI().toString()), 0, sangerQseq.length(), null); - - QseqRecordReader reader = new QseqRecordReader(conf, split); - reader.next(key, fragment); - } - - @Test - public void testConfigureForSangerQualities() throws IOException - { - conf.set("hbam.qseq-input.base-quality-encoding", "sanger"); - qualityConfigTest(); - } - - @Test - public void testGenericInputConfigureForSangerQualities() throws IOException - { - conf.set("hbam.input.base-quality-encoding", "sanger"); - qualityConfigTest(); - } - - private void qualityConfigTest() throws IOException - { - writeToTempQseq(sangerQseq); - split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, sangerQseq.length(), null); - - QseqRecordReader reader = new QseqRecordReader(conf, split); - assertTrue(reader.next(key, fragment)); - assertEquals("###########################################################################################", fragment.getQuality().toString()); - } - - @Test - public void testProgress() throws IOException - { - writeToTempQseq(twoQseq); - split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, twoQseq.length(), null); - - QseqRecordReader reader = new QseqRecordReader(conf, split); - assertEquals(0.0, reader.getProgress(), 0.01); - - reader.next(key, fragment); - assertEquals(0.5, reader.getProgress(), 0.01); - - reader.next(key, fragment); - assertEquals(1.0, reader.getProgress(), 0.01); - } - - @Test - public void testCreateKey() throws IOException - { - QseqRecordReader reader = createReaderForOneQseq(); - assertTrue(reader.createKey() instanceof Text); - } - - @Test - public void testCreateValue() throws IOException - { - QseqRecordReader reader = createReaderForOneQseq(); - assertTrue(reader.createValue() instanceof SequencedFragment); - } - - @Test - public void testClose() throws IOException - { - QseqRecordReader reader = createReaderForOneQseq(); - // doesn't really do anything but exercise the code - reader.close(); - } - - @Test - public void testMakePositionMessage() throws IOException - { - writeToTempQseq(twoQseq); - split = new FileSplit(new Path(tempQseq.toURI().toString()), 10, twoQseq.length() - 10, null); - - QseqRecordReader reader = new QseqRecordReader(conf, split); - assertNotNull(reader.makePositionMessage()); - } - - @Test - public void testGzCompressedInput() throws IOException - { - // write gzip-compressed data - GzipCodec codec = new GzipCodec(); - PrintWriter qseqOut = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) ); - qseqOut.write(twoQseq); - qseqOut.close(); - - // now try to read it - split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoQseq.length(), null); - QseqRecordReader reader = new QseqRecordReader(conf, split); - - boolean retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString()); - assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); - - retval = reader.next(key, fragment); - assertTrue(retval); - assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString()); - assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); - } - - @Test(expected=RuntimeException.class) - public void testCompressedSplit() throws IOException - { - // write gzip-compressed data - GzipCodec codec = new GzipCodec(); - PrintWriter qseqOut = new PrintWriter( new 
BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) ); - qseqOut.write(twoQseq); - qseqOut.close(); - - // now try to read it starting from the middle - split = new FileSplit(new Path(tempGz.toURI().toString()), 10, twoQseq.length(), null); - QseqRecordReader reader = new QseqRecordReader(conf, split); - } - @Test - public void testSkipFailedQC() throws IOException - { - conf.set("hbam.qseq-input.filter-failed-qc", "t"); - verifySkipFailedQC(); - } - - @Test - public void testSkipFailedQCGenericConfig() throws IOException - { - conf.set("hbam.input.filter-failed-qc", "t"); - verifySkipFailedQC(); - } - - private void verifySkipFailedQC() throws IOException - { - writeToTempQseq(twoQseq); - split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, twoQseq.length(), null); - QseqRecordReader reader = new QseqRecordReader(conf, split); - - boolean found = reader.next(key, fragment); - assertTrue(found); - assertEquals(2, (int)fragment.getRead()); - - found = reader.next(key, fragment); - assertFalse(found); - } - - public static void main(String args[]) { - org.junit.runner.JUnitCore.main(TestQseqInputFormat.class.getName()); - } +public class TestQseqInputFormat { + + public static final String oneQseq = + "ERR020229 10880 1 1 1373 2042 0 1 " + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\t" + + "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 1"; + + public static final String twoQseq = + "ERR020229 10880 1 1 1373 2042 0 1 " + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\t" + + "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 0\n" + + "ERR020229 10883 1 1 1796 2044 0 2 " + + "TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG\t" + + "DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD 1"; + + public static final String illuminaQseq = + "EAS139 136 2 5 1000 12850 ATCACG 1 " + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\t" + + "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 0"; + + public static final String nQseq = + "ERR020229 10880 1 1 1373 2042 0 1 " + + "...........................................................................................\t" + + "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB 0"; + + public static final String sangerQseq = + "EAS139 136 2 5 1000 12850 ATCACG 1 " + + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT\t" + + "########################################################################################### 0"; + + public static final String indexWithUnknown = + "EAS139 136 2 5 1000 12850 ATC..G 1 " + + "TTGGATGATAGGGATTATTTGACTCGAATAT\t" + + "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB\t0"; + + private JobConf conf; + private FileSplit split; + private File tempQseq; + private File tempGz; + + private Text key; + private SequencedFragment fragment; + + public static void main(String args[]) { + org.junit.runner.JUnitCore.main(TestQseqInputFormat.class.getName()); + } + + @Before + public void setup() throws IOException { + tempQseq = File.createTempFile("test_qseq_input_format", "qseq"); + tempGz = File.createTempFile("test_qseq_input_format", ".gz"); + conf = new JobConf(); + key = new 
Text(); + fragment = new SequencedFragment(); + } + + @After + public void tearDown() { + tempQseq.delete(); + tempGz.delete(); + split = null; + } + + private void writeToTempQseq(String s) throws IOException { + PrintWriter qseqOut = new PrintWriter(new BufferedWriter(new FileWriter(tempQseq))); + qseqOut.write(s); + qseqOut.close(); + } + + private QseqRecordReader createReaderForOneQseq() throws IOException { + writeToTempQseq(oneQseq); + split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, oneQseq.length(), null); + + return new QseqRecordReader(conf, split); + } + + @Test + public void testReadFromStart() throws IOException { + QseqRecordReader reader = createReaderForOneQseq(); + + assertEquals(0, reader.getPos()); + assertEquals(0.0, reader.getProgress(), 0.01); + + boolean retval = reader.next(key, fragment); + assertTrue(retval); + // System.err.println("in testReadFromStart quality: " + fragment.getQuality().toString()); + assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString()); + assertEquals( + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", + fragment.getSequence().toString()); + assertEquals( + "###########################################################################################", + fragment.getQuality().toString()); + + assertEquals(oneQseq.length(), reader.getPos()); + assertEquals(1.0, reader.getProgress(), 0.01); + + retval = reader.next(key, fragment); + assertFalse(retval); + } + + @Test + public void testReadStartInMiddle() throws IOException { + writeToTempQseq(twoQseq); + split = new FileSplit(new Path(tempQseq.toURI().toString()), 10, twoQseq.length() - 10, null); + + QseqRecordReader reader = new QseqRecordReader(conf, split); + + assertEquals( + oneQseq.length() + 1, + reader.getPos()); // The start of the second record. We +1 for the \n that is not in oneQseq + assertEquals(0.0, reader.getProgress(), 0.01); + + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString()); + assertEquals( + "TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", + fragment.getSequence().toString()); + assertEquals( + "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%", + fragment.getQuality().toString()); + + assertEquals(twoQseq.length(), reader.getPos()); // now should be at the end of the data + assertEquals(1.0, reader.getProgress(), 0.01); + + retval = reader.next(key, fragment); + assertFalse(retval); + } + + @Test + public void testSliceEndsBeforeEndOfFile() throws IOException { + writeToTempQseq(twoQseq); + // slice ends at position 10--i.e. somewhere in the first record. The second record should not + // be read. 
+ split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, 10, null); + + QseqRecordReader reader = new QseqRecordReader(conf, split); + + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString()); + + assertFalse( + "QseqRecordReader is reading a record that starts after the end of the slice", + reader.next(key, fragment)); + } + + @Test + public void testIlluminaMetaInfo() throws IOException { + writeToTempQseq(illuminaQseq); + split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, illuminaQseq.length(), null); + + QseqRecordReader reader = new QseqRecordReader(conf, split); + boolean found = reader.next(key, fragment); + assertTrue(found); + + assertEquals("EAS139", fragment.getInstrument()); + assertEquals(136, fragment.getRunNumber().intValue()); + assertNull("flowcell id not null", fragment.getFlowcellId()); + assertEquals(2, fragment.getLane().intValue()); + assertEquals(5, fragment.getTile().intValue()); + assertEquals(1000, fragment.getXpos().intValue()); + assertEquals(12850, fragment.getYpos().intValue()); + assertEquals(1, fragment.getRead().intValue()); + assertEquals(false, fragment.getFilterPassed().booleanValue()); + assertNull("control number not null", fragment.getControlNumber()); + assertEquals("ATCACG", fragment.getIndexSequence()); + } + + @Test + public void testNs() throws IOException { + writeToTempQseq(nQseq); + split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, nQseq.length(), null); + + QseqRecordReader reader = new QseqRecordReader(conf, split); + boolean found = reader.next(key, fragment); + assertTrue(found); + assertEquals( + "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", + fragment.getSequence().toString()); + } + + @Test + public void testConvertDotInIndexSequence() throws IOException { + writeToTempQseq(indexWithUnknown); + split = + new FileSplit(new Path(tempQseq.toURI().toString()), 0, indexWithUnknown.length(), null); + + QseqRecordReader reader = new QseqRecordReader(conf, split); + boolean found = reader.next(key, fragment); + assertTrue(found); + assertEquals("ATCNNG", fragment.getIndexSequence()); + } + + @Test(expected = FormatException.class) + public void testSangerQualities() throws IOException { + writeToTempQseq(sangerQseq); + split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, sangerQseq.length(), null); + + QseqRecordReader reader = new QseqRecordReader(conf, split); + reader.next(key, fragment); + } + + @Test + public void testConfigureForSangerQualities() throws IOException { + conf.set("hbam.qseq-input.base-quality-encoding", "sanger"); + qualityConfigTest(); + } + + @Test + public void testGenericInputConfigureForSangerQualities() throws IOException { + conf.set("hbam.input.base-quality-encoding", "sanger"); + qualityConfigTest(); + } + + private void qualityConfigTest() throws IOException { + writeToTempQseq(sangerQseq); + split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, sangerQseq.length(), null); + + QseqRecordReader reader = new QseqRecordReader(conf, split); + assertTrue(reader.next(key, fragment)); + assertEquals( + "###########################################################################################", + fragment.getQuality().toString()); + } + + @Test + public void testProgress() throws IOException { + writeToTempQseq(twoQseq); + split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, twoQseq.length(), null); + + QseqRecordReader reader = 
new QseqRecordReader(conf, split); + assertEquals(0.0, reader.getProgress(), 0.01); + + reader.next(key, fragment); + assertEquals(0.5, reader.getProgress(), 0.01); + + reader.next(key, fragment); + assertEquals(1.0, reader.getProgress(), 0.01); + } + + @Test + public void testCreateKey() throws IOException { + QseqRecordReader reader = createReaderForOneQseq(); + assertTrue(reader.createKey() instanceof Text); + } + + @Test + public void testCreateValue() throws IOException { + QseqRecordReader reader = createReaderForOneQseq(); + assertTrue(reader.createValue() instanceof SequencedFragment); + } + + @Test + public void testClose() throws IOException { + QseqRecordReader reader = createReaderForOneQseq(); + // doesn't really do anything but exercise the code + reader.close(); + } + + @Test + public void testMakePositionMessage() throws IOException { + writeToTempQseq(twoQseq); + split = new FileSplit(new Path(tempQseq.toURI().toString()), 10, twoQseq.length() - 10, null); + + QseqRecordReader reader = new QseqRecordReader(conf, split); + assertNotNull(reader.makePositionMessage()); + } + + @Test + public void testGzCompressedInput() throws IOException { + // write gzip-compressed data + GzipCodec codec = new GzipCodec(); + PrintWriter qseqOut = + new PrintWriter( + new BufferedOutputStream(codec.createOutputStream(new FileOutputStream(tempGz)))); + qseqOut.write(twoQseq); + qseqOut.close(); + + // now try to read it + split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoQseq.length(), null); + QseqRecordReader reader = new QseqRecordReader(conf, split); + + boolean retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString()); + assertEquals( + "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", + fragment.getSequence().toString()); + + retval = reader.next(key, fragment); + assertTrue(retval); + assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString()); + assertEquals( + "TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", + fragment.getSequence().toString()); + } + + @Test(expected = RuntimeException.class) + public void testCompressedSplit() throws IOException { + // write gzip-compressed data + GzipCodec codec = new GzipCodec(); + PrintWriter qseqOut = + new PrintWriter( + new BufferedOutputStream(codec.createOutputStream(new FileOutputStream(tempGz)))); + qseqOut.write(twoQseq); + qseqOut.close(); + + // now try to read it starting from the middle + split = new FileSplit(new Path(tempGz.toURI().toString()), 10, twoQseq.length(), null); + QseqRecordReader reader = new QseqRecordReader(conf, split); + } + + @Test + public void testSkipFailedQC() throws IOException { + conf.set("hbam.qseq-input.filter-failed-qc", "t"); + verifySkipFailedQC(); + } + + @Test + public void testSkipFailedQCGenericConfig() throws IOException { + conf.set("hbam.input.filter-failed-qc", "t"); + verifySkipFailedQC(); + } + + private void verifySkipFailedQC() throws IOException { + writeToTempQseq(twoQseq); + split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, twoQseq.length(), null); + QseqRecordReader reader = new QseqRecordReader(conf, split); + + boolean found = reader.next(key, fragment); + assertTrue(found); + assertEquals(2, (int) fragment.getRead()); + + found = reader.next(key, fragment); + assertFalse(found); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestQseqOutputFormat.java 
b/src/test/java/org/seqdoop/hadoop_bam/TestQseqOutputFormat.java index 67325ab..15154bf 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestQseqOutputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestQseqOutputFormat.java @@ -22,151 +22,140 @@ package org.seqdoop.hadoop_bam; -import org.seqdoop.hadoop_bam.QseqOutputFormat.QseqRecordWriter; -import org.seqdoop.hadoop_bam.SequencedFragment; +import static org.junit.Assert.*; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; - -import java.io.DataOutputStream; -import java.io.ByteArrayOutputStream; - -import org.junit.*; -import static org.junit.Assert.*; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; +import org.junit.*; +import org.seqdoop.hadoop_bam.QseqOutputFormat.QseqRecordWriter; -public class TestQseqOutputFormat -{ - private SequencedFragment fragment; - - private ByteArrayOutputStream outputBuffer; - private DataOutputStream dataOutput; - private QseqRecordWriter writer; - - @Before - public void setup() throws IOException - { - fragment = new SequencedFragment(); - fragment.setInstrument("instrument"); - fragment.setRunNumber(1); - fragment.setFlowcellId("xyz"); - fragment.setLane(2); - fragment.setTile(1001); - fragment.setXpos(10000); - fragment.setYpos(9999); - fragment.setRead(1); - fragment.setFilterPassed(true); - fragment.setIndexSequence("CATCAT"); - fragment.setSequence(new Text("AAAAAAAAAA")); - fragment.setQuality(new Text("##########")); - - outputBuffer = new ByteArrayOutputStream(); - dataOutput = new DataOutputStream(outputBuffer); - writer = new QseqRecordWriter(new Configuration(), dataOutput); - } - - @Test - public void testSimple() throws IOException - { - writer.write(null, fragment); - writer.close(null); - - String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); - assertEquals(11, fields.length); - - assertEquals(fragment.getInstrument(), fields[0]); - assertEquals(fragment.getRunNumber().toString(), fields[1]); - assertEquals(fragment.getLane().toString(), fields[2]); - assertEquals(fragment.getTile().toString(), fields[3]); - assertEquals(fragment.getXpos().toString(), fields[4]); - assertEquals(fragment.getYpos().toString(), fields[5]); - assertEquals(fragment.getIndexSequence().toString(), fields[6]); - assertEquals(fragment.getRead().toString(), fields[7]); - assertEquals(fragment.getSequence().toString(), fields[8]); - assertEquals(fragment.getQuality().toString().replace('#', 'B'), fields[9]); - assertEquals(fragment.getFilterPassed() ? 
"1\n" : "0\n", fields[10]); - } - - @Test - public void testConvertUnknowns() throws IOException, UnsupportedEncodingException - { - String seq = "AAAAANNNNN"; - fragment.setSequence(new Text(seq)); - writer.write(null, fragment); - writer.close(null); - - String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); - assertEquals(seq.replace("N", "."), fields[8]); - } - - @Test - public void testConvertUnknownsInIndexSequence() throws IOException, UnsupportedEncodingException - { - String index = "CATNNN"; - fragment.setIndexSequence(index); - writer.write(null, fragment); - writer.close(null); - - String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); - assertEquals(index.replace("N", "."), fields[6]); - } - - @Test - public void testBaseQualities() throws IOException - { - // ensure sanger qualities are converted to illumina - String seq = "AAAAAAAAAA"; - String qual = "##########"; - - fragment.setSequence(new Text(seq)); - fragment.setQuality(new Text(qual)); - - writer.write(null, fragment); - writer.close(null); - - String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); - assertEquals(qual.replace("#", "B"), fields[9]); - } - - @Test - public void testConfigureOutputInSanger() throws IOException - { - String seq = "AAAAAAAAAA"; - String qual = "##########"; - - fragment.setSequence(new Text(seq)); - fragment.setQuality(new Text(qual)); - - Configuration conf = new Configuration(); - conf.set("hbam.qseq-output.base-quality-encoding", "sanger"); - writer.setConf(conf); - - writer.write(null, fragment); - writer.close(null); - - String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); - assertEquals(qual, fields[9]); - } - - @Test - public void testClose() throws IOException - { - // doesn't really do anything but exercise the code - writer.close(null); - } - - @Test - public void testNoIndex() throws IOException - { - fragment.setIndexSequence(null); - writer.write(null, fragment); - writer.close(null); - - String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); - assertEquals(11, fields.length); - - assertEquals("0", fields[6]); - } +public class TestQseqOutputFormat { + + private SequencedFragment fragment; + + private ByteArrayOutputStream outputBuffer; + private DataOutputStream dataOutput; + private QseqRecordWriter writer; + + @Before + public void setup() throws IOException { + fragment = new SequencedFragment(); + fragment.setInstrument("instrument"); + fragment.setRunNumber(1); + fragment.setFlowcellId("xyz"); + fragment.setLane(2); + fragment.setTile(1001); + fragment.setXpos(10000); + fragment.setYpos(9999); + fragment.setRead(1); + fragment.setFilterPassed(true); + fragment.setIndexSequence("CATCAT"); + fragment.setSequence(new Text("AAAAAAAAAA")); + fragment.setQuality(new Text("##########")); + + outputBuffer = new ByteArrayOutputStream(); + dataOutput = new DataOutputStream(outputBuffer); + writer = new QseqRecordWriter(new Configuration(), dataOutput); + } + + @Test + public void testSimple() throws IOException { + writer.write(null, fragment); + writer.close(null); + + String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); + assertEquals(11, fields.length); + + assertEquals(fragment.getInstrument(), fields[0]); + assertEquals(fragment.getRunNumber().toString(), fields[1]); + assertEquals(fragment.getLane().toString(), fields[2]); + assertEquals(fragment.getTile().toString(), fields[3]); + 
assertEquals(fragment.getXpos().toString(), fields[4]); + assertEquals(fragment.getYpos().toString(), fields[5]); + assertEquals(fragment.getIndexSequence().toString(), fields[6]); + assertEquals(fragment.getRead().toString(), fields[7]); + assertEquals(fragment.getSequence().toString(), fields[8]); + assertEquals(fragment.getQuality().toString().replace('#', 'B'), fields[9]); + assertEquals(fragment.getFilterPassed() ? "1\n" : "0\n", fields[10]); + } + + @Test + public void testConvertUnknowns() throws IOException, UnsupportedEncodingException { + String seq = "AAAAANNNNN"; + fragment.setSequence(new Text(seq)); + writer.write(null, fragment); + writer.close(null); + + String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); + assertEquals(seq.replace("N", "."), fields[8]); + } + + @Test + public void testConvertUnknownsInIndexSequence() + throws IOException, UnsupportedEncodingException { + String index = "CATNNN"; + fragment.setIndexSequence(index); + writer.write(null, fragment); + writer.close(null); + + String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); + assertEquals(index.replace("N", "."), fields[6]); + } + + @Test + public void testBaseQualities() throws IOException { + // ensure sanger qualities are converted to illumina + String seq = "AAAAAAAAAA"; + String qual = "##########"; + + fragment.setSequence(new Text(seq)); + fragment.setQuality(new Text(qual)); + + writer.write(null, fragment); + writer.close(null); + + String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); + assertEquals(qual.replace("#", "B"), fields[9]); + } + + @Test + public void testConfigureOutputInSanger() throws IOException { + String seq = "AAAAAAAAAA"; + String qual = "##########"; + + fragment.setSequence(new Text(seq)); + fragment.setQuality(new Text(qual)); + + Configuration conf = new Configuration(); + conf.set("hbam.qseq-output.base-quality-encoding", "sanger"); + writer.setConf(conf); + + writer.write(null, fragment); + writer.close(null); + + String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); + assertEquals(qual, fields[9]); + } + + @Test + public void testClose() throws IOException { + // doesn't really do anything but exercise the code + writer.close(null); + } + + @Test + public void testNoIndex() throws IOException { + fragment.setIndexSequence(null); + writer.write(null, fragment); + writer.close(null); + + String[] fields = new String(outputBuffer.toByteArray(), "US-ASCII").split("\t"); + assertEquals(11, fields.length); + + assertEquals("0", fields[6]); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestSAMFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestSAMFormat.java index 679b56f..405e716 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestSAMFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestSAMFormat.java @@ -1,12 +1,12 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + import java.io.IOException; import java.io.InputStream; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; - public class TestSAMFormat { @Test @@ -22,7 +22,7 @@ public void testInferFromData() throws IOException { assertEquals(SAMFormat.SAM, SAMFormat.inferFromData(stream("test.sam"))); assertEquals(SAMFormat.BAM, SAMFormat.inferFromData(stream("test.bam"))); assertEquals(SAMFormat.CRAM, SAMFormat.inferFromData(stream("test.cram"))); - assertNull( 
SAMFormat.inferFromData(stream("test.vcf"))); + assertNull(SAMFormat.inferFromData(stream("test.vcf"))); } private InputStream stream(String resource) throws IOException { diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestSAMHeaderReader.java b/src/test/java/org/seqdoop/hadoop_bam/TestSAMHeaderReader.java index 15623c3..f84012c 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestSAMHeaderReader.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestSAMHeaderReader.java @@ -1,64 +1,65 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; + import htsjdk.samtools.*; -import htsjdk.samtools.cram.CRAMException; +import java.io.InputStream; +import java.net.URI; import org.apache.hadoop.conf.Configuration; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; import org.seqdoop.hadoop_bam.util.SAMHeaderReader; -import java.io.InputStream; -import java.net.URI; - -import static org.junit.Assert.assertEquals; - public class TestSAMHeaderReader { - @Rule - public ExpectedException thrown= ExpectedException.none(); - @Test - public void testBAMHeaderReaderNoReference() throws Exception { + @Rule public ExpectedException thrown = ExpectedException.none(); - final Configuration conf = new Configuration(); + @Test + public void testBAMHeaderReaderNoReference() throws Exception { - InputStream inputStream = ClassLoader.getSystemClassLoader().getResourceAsStream("test.bam"); - final SamReader samReader = SamReaderFactory.makeDefault().open(SamInputResource.of(inputStream)); - int sequenceCount = samReader.getFileHeader().getSequenceDictionary().size(); - samReader.close(); + final Configuration conf = new Configuration(); - inputStream = ClassLoader.getSystemClassLoader().getResourceAsStream("test.bam"); - SAMFileHeader samHeader = SAMHeaderReader.readSAMHeaderFrom(inputStream, conf); - inputStream.close(); + InputStream inputStream = ClassLoader.getSystemClassLoader().getResourceAsStream("test.bam"); + final SamReader samReader = + SamReaderFactory.makeDefault().open(SamInputResource.of(inputStream)); + int sequenceCount = samReader.getFileHeader().getSequenceDictionary().size(); + samReader.close(); - assertEquals(samHeader.getSequenceDictionary().size(), sequenceCount); - } + inputStream = ClassLoader.getSystemClassLoader().getResourceAsStream("test.bam"); + SAMFileHeader samHeader = SAMHeaderReader.readSAMHeaderFrom(inputStream, conf); + inputStream.close(); - @Test - public void testCRAMHeaderReaderWithReference() throws Exception { - final Configuration conf = new Configuration(); + assertEquals(samHeader.getSequenceDictionary().size(), sequenceCount); + } - final InputStream inputStream = ClassLoader.getSystemClassLoader().getResourceAsStream("test.cram"); - final URI reference = ClassLoader.getSystemClassLoader().getResource("auxf.fa").toURI(); - conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, reference.toString()); + @Test + public void testCRAMHeaderReaderWithReference() throws Exception { + final Configuration conf = new Configuration(); - SAMFileHeader samHeader = SAMHeaderReader.readSAMHeaderFrom(inputStream, conf); - inputStream.close(); + final InputStream inputStream = + ClassLoader.getSystemClassLoader().getResourceAsStream("test.cram"); + final URI reference = ClassLoader.getSystemClassLoader().getResource("auxf.fa").toURI(); + conf.set(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY, reference.toString()); - assertEquals(samHeader.getSequenceDictionary().size(), 1); - } + SAMFileHeader samHeader = 
SAMHeaderReader.readSAMHeaderFrom(inputStream, conf); + inputStream.close(); - @Test - public void testCRAMHeaderReaderNoReference() throws Exception { + assertEquals(samHeader.getSequenceDictionary().size(), 1); + } - thrown.expect(IllegalStateException.class); // htsjdk throws on CRAM file with no reference provided + @Test + public void testCRAMHeaderReaderNoReference() throws Exception { - final Configuration conf = new Configuration(); - final InputStream inputStream = ClassLoader.getSystemClassLoader().getResourceAsStream("test.cram"); - SAMFileHeader samHeader = SAMHeaderReader.readSAMHeaderFrom(inputStream, conf); - inputStream.close(); + thrown.expect( + IllegalStateException.class); // htsjdk throws on CRAM file with no reference provided - assertEquals(samHeader.getSequenceDictionary().size(), 1); - } + final Configuration conf = new Configuration(); + final InputStream inputStream = + ClassLoader.getSystemClassLoader().getResourceAsStream("test.cram"); + SAMFileHeader samHeader = SAMHeaderReader.readSAMHeaderFrom(inputStream, conf); + inputStream.close(); + assertEquals(samHeader.getSequenceDictionary().size(), 1); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestSAMInputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestSAMInputFormat.java index f1ac677..d3bf93e 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestSAMInputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestSAMInputFormat.java @@ -1,5 +1,9 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; + import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; @@ -25,11 +29,8 @@ import org.junit.Before; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; - public class TestSAMInputFormat { + private String input; private TaskAttemptContext taskAttemptContext; private JobContext jobContext; @@ -56,8 +57,8 @@ public void testReader() throws Exception { AnySAMInputFormat inputFormat = new AnySAMInputFormat(); List splits = inputFormat.getSplits(jobContext); assertEquals(1, splits.size()); - RecordReader reader = inputFormat - .createRecordReader(splits.get(0), taskAttemptContext); + RecordReader reader = + inputFormat.createRecordReader(splits.get(0), taskAttemptContext); reader.initialize(splits.get(0), taskAttemptContext); int actualCount = 0; diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestSequencedFragment.java b/src/test/java/org/seqdoop/hadoop_bam/TestSequencedFragment.java index 8d049b0..bf85222 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestSequencedFragment.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestSequencedFragment.java @@ -22,341 +22,335 @@ package org.seqdoop.hadoop_bam; -import org.junit.*; import static org.junit.Assert.*; -import org.seqdoop.hadoop_bam.SequencedFragment; -import org.seqdoop.hadoop_bam.FormatConstants; -import org.seqdoop.hadoop_bam.FormatException; - -import java.io.IOException; -import java.io.DataInput; import java.io.ByteArrayInputStream; -import java.io.DataInputStream; - -import java.io.DataOutput; import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; import java.io.DataOutputStream; - +import java.io.IOException; import org.apache.hadoop.io.Text; +import org.junit.*; -public class TestSequencedFragment -{ - private SequencedFragment frag; - private 
SequencedFragment frag2; - - @Before - public void setup() - { - frag = new SequencedFragment(); - frag2 = new SequencedFragment(); - } - - @Test - public void testInitialState() - { - assertNotNull(frag.getSequence()); - assertNotNull(frag.getQuality()); - - assertNull(frag.getInstrument()); - assertNull(frag.getRunNumber()); - assertNull(frag.getFlowcellId()); - assertNull(frag.getLane()); - assertNull(frag.getTile()); - assertNull(frag.getXpos()); - assertNull(frag.getYpos()); - assertNull(frag.getRead()); - assertNull(frag.getFilterPassed()); - assertNull(frag.getControlNumber()); - assertNull(frag.getIndexSequence()); - - assertNotNull(frag.toString()); - } - - @Test(expected=IllegalArgumentException.class) - public void testNoNullSequence() - { - frag.setSequence(null); - } - - @Test(expected=IllegalArgumentException.class) - public void testNoNullQuality() - { - frag.setQuality(null); - } - - /////////////////////////////////////////////////////////////// - // equals - /////////////////////////////////////////////////////////////// - @Test - public void testEquals() - { - assertTrue(frag.equals(frag2)); - - frag.getSequence().append("AAAA".getBytes(), 0, 4); - assertFalse( frag.equals(frag2) ); - } - - @Test - public void testEqualsSequence() - { - frag.getSequence().append("AAAA".getBytes(), 0, 4); - assertFalse( frag.equals(frag2) ); - frag2.getSequence().append("AAAA".getBytes(), 0, 4); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsQuality() - { - frag.getQuality().append("AAAA".getBytes(), 0, 4); - assertFalse( frag.equals(frag2) ); - frag2.getQuality().append("AAAA".getBytes(), 0, 4); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsInstrument() - { - frag.setInstrument("instrument"); - assertFalse( frag.equals(frag2) ); - frag2.setInstrument("instrument"); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsRunNumber() - { - frag.setRunNumber(240); - assertFalse( frag.equals(frag2) ); - frag2.setRunNumber(240); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsFlowcellId() - { - frag.setFlowcellId("id"); - assertFalse( frag.equals(frag2) ); - frag2.setFlowcellId("id"); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsLane() - { - frag.setLane(2); - assertFalse( frag.equals(frag2) ); - frag2.setLane(2); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsTile() - { - frag.setTile(1000); - assertFalse( frag.equals(frag2) ); - frag2.setTile(1000); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsXpos() - { - frag.setXpos(1234); - assertFalse( frag.equals(frag2) ); - frag2.setXpos(1234); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsYpos() - { - frag.setYpos(1234); - assertFalse( frag.equals(frag2) ); - frag2.setYpos(1234); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsRead() - { - frag.setRead(2); - assertFalse( frag.equals(frag2) ); - frag2.setRead(2); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsFilterPassed() - { - frag.setFilterPassed(false); - assertFalse( frag.equals(frag2) ); - frag2.setFilterPassed(false); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsControlNumber() - { - frag.setControlNumber(314); - assertFalse( frag.equals(frag2) ); - frag2.setControlNumber(314); - assertTrue( frag.equals(frag2) ); - } - - @Test - public void testEqualsIndexSequence() - { - frag.setIndexSequence("ABC"); - 
assertFalse( frag.equals(frag2) ); - frag2.setIndexSequence("ABC"); - assertTrue( frag.equals(frag2) ); - } - - /////////////////////////////////////////////////////////////// - // serialization - /////////////////////////////////////////////////////////////// - private static SequencedFragment cloneBySerialization(SequencedFragment original) throws IOException - { - ByteArrayOutputStream outputBuffer = new ByteArrayOutputStream(); - DataOutputStream dataOutput = new DataOutputStream(outputBuffer); - original.write(dataOutput); - dataOutput.close(); - - SequencedFragment newFrag = new SequencedFragment(); - newFrag.readFields( new DataInputStream( new ByteArrayInputStream(outputBuffer.toByteArray()))); - - return newFrag; - } - - @Test - public void testSerializationEmpty() throws IOException - { - assertEquals(frag, cloneBySerialization(frag)); - } - - @Test - public void testSerializationWithSeq() throws IOException - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("##############################")); - assertEquals(frag, cloneBySerialization(frag)); - } - - @Test - public void testSerializationWithFields() throws IOException - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB")); - - frag.setInstrument("machine"); - frag.setLane(3); - frag.setRead(1); - frag.setIndexSequence("CAT"); - - assertEquals(frag, cloneBySerialization(frag)); - } - - @Test - public void testToString() - { - String seq = "AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT"; - String qual = "##############################"; - frag.setSequence(new Text(seq)); - frag.setQuality(new Text(qual)); - - frag.setInstrument("machine"); - frag.setRunNumber(123); - frag.setFlowcellId("flowcell"); - frag.setLane(3); - frag.setTile(1001); - frag.setXpos(1234); - frag.setYpos(4321); - frag.setIndexSequence("CAT"); - frag.setRead(1); - - assertEquals("machine\t123\tflowcell\t3\t1001\t1234\t4321\tCAT\t1\t" + seq + "\t" + qual + "\t1", frag.toString()); - } - - @Test - public void testVerifyQualitySangerOk() - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("##############################")); - assertEquals(-1, SequencedFragment.verifyQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger)); - } - - @Test - public void testVerifyQualityIlluminaOk() - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzz")); - assertEquals(-1, SequencedFragment.verifyQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Illumina)); - } - - @Test - public void testVerifyQualitySangerOutOfRange() - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("#############################" + Character.toString((char)127))); // over range - assertEquals(29, SequencedFragment.verifyQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger)); - - frag.setQuality(new Text("##### ########################")); // under range - assertEquals(5, SequencedFragment.verifyQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger)); - } - - @Test - public void testVerifyQualityIlluminaOutOfRange() - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("zzz=zzzzzzzzzzzzzzzzzzzzzzzzzz")); - assertEquals(3, SequencedFragment.verifyQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Illumina)); - } - - @Test - public void 
testConvertQualityIlluminaToSanger() - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzz")); - SequencedFragment.convertQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Illumina, FormatConstants.BaseQualityEncoding.Sanger); - assertEquals("[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[", frag.getQuality().toString()); - } - - @Test - public void testConvertQualitySangerToIllumina() - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[")); - SequencedFragment.convertQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger, FormatConstants.BaseQualityEncoding.Illumina); - assertEquals("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", frag.getQuality().toString()); - } - - @Test(expected=IllegalArgumentException.class) - public void testConvertQualityNoop() - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[")); - SequencedFragment.convertQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger, FormatConstants.BaseQualityEncoding.Sanger); - } - - @Test(expected=FormatException.class) - public void testConvertQualityIlluminaOutOfRange() - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("zzz=zzzzzzzzzzzzzzzzzzzzzzzzzz")); - SequencedFragment.convertQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Illumina, FormatConstants.BaseQualityEncoding.Sanger); - } - - @Test(expected=FormatException.class) - public void testConvertQualitySangerUnderRange() - { - frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); - frag.setQuality(new Text("### ##########################")); - SequencedFragment.convertQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger, FormatConstants.BaseQualityEncoding.Illumina); - } - - public static void main(String args[]) { - org.junit.runner.JUnitCore.main(TestSequencedFragment.class.getName()); - } +public class TestSequencedFragment { + + private SequencedFragment frag; + private SequencedFragment frag2; + + /////////////////////////////////////////////////////////////// + // serialization + /////////////////////////////////////////////////////////////// + private static SequencedFragment cloneBySerialization(SequencedFragment original) + throws IOException { + ByteArrayOutputStream outputBuffer = new ByteArrayOutputStream(); + DataOutputStream dataOutput = new DataOutputStream(outputBuffer); + original.write(dataOutput); + dataOutput.close(); + + SequencedFragment newFrag = new SequencedFragment(); + newFrag.readFields(new DataInputStream(new ByteArrayInputStream(outputBuffer.toByteArray()))); + + return newFrag; + } + + public static void main(String args[]) { + org.junit.runner.JUnitCore.main(TestSequencedFragment.class.getName()); + } + + @Before + public void setup() { + frag = new SequencedFragment(); + frag2 = new SequencedFragment(); + } + + @Test + public void testInitialState() { + assertNotNull(frag.getSequence()); + assertNotNull(frag.getQuality()); + + assertNull(frag.getInstrument()); + assertNull(frag.getRunNumber()); + assertNull(frag.getFlowcellId()); + assertNull(frag.getLane()); + assertNull(frag.getTile()); + assertNull(frag.getXpos()); + assertNull(frag.getYpos()); + assertNull(frag.getRead()); + assertNull(frag.getFilterPassed()); + assertNull(frag.getControlNumber()); + assertNull(frag.getIndexSequence()); + + assertNotNull(frag.toString()); + 
} + + @Test(expected = IllegalArgumentException.class) + public void testNoNullSequence() { + frag.setSequence(null); + } + + @Test(expected = IllegalArgumentException.class) + public void testNoNullQuality() { + frag.setQuality(null); + } + + /////////////////////////////////////////////////////////////// + // equals + /////////////////////////////////////////////////////////////// + @Test + public void testEquals() { + assertTrue(frag.equals(frag2)); + + frag.getSequence().append("AAAA".getBytes(), 0, 4); + assertFalse(frag.equals(frag2)); + } + + @Test + public void testEqualsSequence() { + frag.getSequence().append("AAAA".getBytes(), 0, 4); + assertFalse(frag.equals(frag2)); + frag2.getSequence().append("AAAA".getBytes(), 0, 4); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsQuality() { + frag.getQuality().append("AAAA".getBytes(), 0, 4); + assertFalse(frag.equals(frag2)); + frag2.getQuality().append("AAAA".getBytes(), 0, 4); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsInstrument() { + frag.setInstrument("instrument"); + assertFalse(frag.equals(frag2)); + frag2.setInstrument("instrument"); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsRunNumber() { + frag.setRunNumber(240); + assertFalse(frag.equals(frag2)); + frag2.setRunNumber(240); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsFlowcellId() { + frag.setFlowcellId("id"); + assertFalse(frag.equals(frag2)); + frag2.setFlowcellId("id"); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsLane() { + frag.setLane(2); + assertFalse(frag.equals(frag2)); + frag2.setLane(2); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsTile() { + frag.setTile(1000); + assertFalse(frag.equals(frag2)); + frag2.setTile(1000); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsXpos() { + frag.setXpos(1234); + assertFalse(frag.equals(frag2)); + frag2.setXpos(1234); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsYpos() { + frag.setYpos(1234); + assertFalse(frag.equals(frag2)); + frag2.setYpos(1234); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsRead() { + frag.setRead(2); + assertFalse(frag.equals(frag2)); + frag2.setRead(2); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsFilterPassed() { + frag.setFilterPassed(false); + assertFalse(frag.equals(frag2)); + frag2.setFilterPassed(false); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsControlNumber() { + frag.setControlNumber(314); + assertFalse(frag.equals(frag2)); + frag2.setControlNumber(314); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testEqualsIndexSequence() { + frag.setIndexSequence("ABC"); + assertFalse(frag.equals(frag2)); + frag2.setIndexSequence("ABC"); + assertTrue(frag.equals(frag2)); + } + + @Test + public void testSerializationEmpty() throws IOException { + assertEquals(frag, cloneBySerialization(frag)); + } + + @Test + public void testSerializationWithSeq() throws IOException { + frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality(new Text("##############################")); + assertEquals(frag, cloneBySerialization(frag)); + } + + @Test + public void testSerializationWithFields() throws IOException { + frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality(new Text("BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB")); + + frag.setInstrument("machine"); + frag.setLane(3); + 
frag.setRead(1); + frag.setIndexSequence("CAT"); + + assertEquals(frag, cloneBySerialization(frag)); + } + + @Test + public void testToString() { + String seq = "AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT"; + String qual = "##############################"; + frag.setSequence(new Text(seq)); + frag.setQuality(new Text(qual)); + + frag.setInstrument("machine"); + frag.setRunNumber(123); + frag.setFlowcellId("flowcell"); + frag.setLane(3); + frag.setTile(1001); + frag.setXpos(1234); + frag.setYpos(4321); + frag.setIndexSequence("CAT"); + frag.setRead(1); + + assertEquals( + "machine\t123\tflowcell\t3\t1001\t1234\t4321\tCAT\t1\t" + seq + "\t" + qual + "\t1", + frag.toString()); + } + + @Test + public void testVerifyQualitySangerOk() { + frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality(new Text("##############################")); + assertEquals( + -1, + SequencedFragment.verifyQuality( + frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger)); + } + + @Test + public void testVerifyQualityIlluminaOk() { + frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality(new Text("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzz")); + assertEquals( + -1, + SequencedFragment.verifyQuality( + frag.getQuality(), FormatConstants.BaseQualityEncoding.Illumina)); + } + + @Test + public void testVerifyQualitySangerOutOfRange() { + frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality( + new Text("#############################" + Character.toString((char) 127))); // over range + assertEquals( + 29, + SequencedFragment.verifyQuality( + frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger)); + + frag.setQuality(new Text("##### ########################")); // under range + assertEquals( + 5, + SequencedFragment.verifyQuality( + frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger)); + } + + @Test + public void testVerifyQualityIlluminaOutOfRange() { + frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality(new Text("zzz=zzzzzzzzzzzzzzzzzzzzzzzzzz")); + assertEquals( + 3, + SequencedFragment.verifyQuality( + frag.getQuality(), FormatConstants.BaseQualityEncoding.Illumina)); + } + + @Test + public void testConvertQualityIlluminaToSanger() { + frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality(new Text("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzz")); + SequencedFragment.convertQuality( + frag.getQuality(), + FormatConstants.BaseQualityEncoding.Illumina, + FormatConstants.BaseQualityEncoding.Sanger); + assertEquals("[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[", frag.getQuality().toString()); + } + + @Test + public void testConvertQualitySangerToIllumina() { + frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality(new Text("[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[")); + SequencedFragment.convertQuality( + frag.getQuality(), + FormatConstants.BaseQualityEncoding.Sanger, + FormatConstants.BaseQualityEncoding.Illumina); + assertEquals("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", frag.getQuality().toString()); + } + + @Test(expected = IllegalArgumentException.class) + public void testConvertQualityNoop() { + frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality(new Text("[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[")); + SequencedFragment.convertQuality( + frag.getQuality(), + FormatConstants.BaseQualityEncoding.Sanger, + FormatConstants.BaseQualityEncoding.Sanger); + } + + @Test(expected = FormatException.class) + public void testConvertQualityIlluminaOutOfRange() { + frag.setSequence(new 
Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality(new Text("zzz=zzzzzzzzzzzzzzzzzzzzzzzzzz")); + SequencedFragment.convertQuality( + frag.getQuality(), + FormatConstants.BaseQualityEncoding.Illumina, + FormatConstants.BaseQualityEncoding.Sanger); + } + + @Test(expected = FormatException.class) + public void testConvertQualitySangerUnderRange() { + frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); + frag.setQuality(new Text("### ##########################")); + SequencedFragment.convertQuality( + frag.getQuality(), + FormatConstants.BaseQualityEncoding.Sanger, + FormatConstants.BaseQualityEncoding.Illumina); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestSplittingBAMIndexer.java b/src/test/java/org/seqdoop/hadoop_bam/TestSplittingBAMIndexer.java index d5c801e..c43b459 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestSplittingBAMIndexer.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestSplittingBAMIndexer.java @@ -1,5 +1,8 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; @@ -10,10 +13,8 @@ import org.junit.Before; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - public class TestSplittingBAMIndexer { + private String input; @Before @@ -24,7 +25,7 @@ public void setup() throws Exception { @Test public void testIndexersProduceSameIndexes() throws Exception { long bamFileSize = new File(input).length(); - for (int g : new int[] { 2, 10, SplittingBAMIndexer.DEFAULT_GRANULARITY}) { + for (int g : new int[] {2, 10, SplittingBAMIndexer.DEFAULT_GRANULARITY}) { SplittingBAMIndex index1 = fromBAMFile(g); SplittingBAMIndex index2 = fromSAMRecords(g); assertEquals(index1, index2); @@ -33,8 +34,7 @@ public void testIndexersProduceSameIndexes() throws Exception { } } - private SplittingBAMIndex fromBAMFile(int granularity) throws - IOException { + private SplittingBAMIndex fromBAMFile(int granularity) throws IOException { Configuration conf = new Configuration(); conf.set("input", new File(input).toURI().toString()); conf.setInt("granularity", granularity); @@ -51,8 +51,10 @@ private SplittingBAMIndex fromSAMRecords(int granularity) throws IOException { File indexFile = new File(input + SplittingBAMIndexer.OUTPUT_FILE_EXTENSION); FileOutputStream out = new FileOutputStream(indexFile); SplittingBAMIndexer indexer = new SplittingBAMIndexer(out, granularity); - SamReader samReader = SamReaderFactory.makeDefault() - .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS).open(new File(input)); + SamReader samReader = + SamReaderFactory.makeDefault() + .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS) + .open(new File(input)); for (SAMRecord r : samReader) { indexer.processAlignment(r); } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestVCFFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestVCFFormat.java index 0f7441a..3d24931 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestVCFFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestVCFFormat.java @@ -1,14 +1,12 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + import java.io.IOException; import java.io.InputStream; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static 
org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; - public class TestVCFFormat { @Test diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormat.java index 12ca484..85cfae1 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormat.java @@ -20,17 +20,27 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; + import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterators; import htsjdk.samtools.util.Interval; +import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFileReader; import java.io.File; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; import java.util.Iterator; +import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapreduce.*; -import htsjdk.variant.variantcontext.VariantContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.task.JobContextImpl; import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; @@ -38,155 +48,159 @@ import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; - -import java.io.IOException; -import java.lang.reflect.InvocationTargetException; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; import org.seqdoop.hadoop_bam.util.BGZFCodec; import org.seqdoop.hadoop_bam.util.BGZFEnhancedGzipCodec; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.mock; - @RunWith(Parameterized.class) public class TestVCFInputFormat { - enum NUM_SPLITS { - ANY, EXACTLY_ONE, MORE_THAN_ONE - } - private String filename; - private NUM_SPLITS expectedSplits; - private Interval interval; - private VariantContextWritable writable; - private List> readers; - private TaskAttemptContext taskAttemptContext; - - public TestVCFInputFormat(String filename, NUM_SPLITS expectedSplits, Interval interval) { - this.filename = filename; - this.expectedSplits = expectedSplits; - this.interval = interval; - } - @Parameterized.Parameters - public static Collection data() { - return Arrays.asList(new Object[][] { - {"test.vcf", NUM_SPLITS.ANY, null}, - {"test.vcf.gz", NUM_SPLITS.EXACTLY_ONE, null}, - {"test.vcf.bgzf.gz", NUM_SPLITS.ANY, null}, - // BCF tests currently fail due to https://github.com/samtools/htsjdk/issues/507 -// {"test.uncompressed.bcf", NUM_SPLITS.ANY, null}, -// {"test.bgzf.bcf", NUM_SPLITS.ANY, null}, - {"HiSeq.10000.vcf", NUM_SPLITS.MORE_THAN_ONE, null}, - {"HiSeq.10000.vcf.gz", NUM_SPLITS.EXACTLY_ONE, null}, - {"HiSeq.10000.vcf.bgzf.gz", NUM_SPLITS.MORE_THAN_ONE, null}, - {"HiSeq.10000.vcf.bgzf.gz", NUM_SPLITS.EXACTLY_ONE, - new Interval("chr1", 2700000, 2800000)}, // chosen to fall in one split - {"HiSeq.10000.vcf.bgz", NUM_SPLITS.MORE_THAN_ONE, null}, - {"HiSeq.10000.vcf.bgz", NUM_SPLITS.EXACTLY_ONE, - new Interval("chr1", 2700000, 2800000)} // chosen to fall in one split + private String filename; + private NUM_SPLITS expectedSplits; + private Interval interval; 
+ private VariantContextWritable writable; + private List> readers; + private TaskAttemptContext taskAttemptContext; + + public TestVCFInputFormat(String filename, NUM_SPLITS expectedSplits, Interval interval) { + this.filename = filename; + this.expectedSplits = expectedSplits; + this.interval = interval; + } + + @Parameterized.Parameters + public static Collection data() { + return Arrays.asList( + new Object[][] { + {"test.vcf", NUM_SPLITS.ANY, null}, + {"test.vcf.gz", NUM_SPLITS.EXACTLY_ONE, null}, + {"test.vcf.bgzf.gz", NUM_SPLITS.ANY, null}, + // BCF tests currently fail due to https://github.com/samtools/htsjdk/issues/507 + // {"test.uncompressed.bcf", NUM_SPLITS.ANY, null}, + // {"test.bgzf.bcf", NUM_SPLITS.ANY, null}, + {"HiSeq.10000.vcf", NUM_SPLITS.MORE_THAN_ONE, null}, + {"HiSeq.10000.vcf.gz", NUM_SPLITS.EXACTLY_ONE, null}, + {"HiSeq.10000.vcf.bgzf.gz", NUM_SPLITS.MORE_THAN_ONE, null}, + { + "HiSeq.10000.vcf.bgzf.gz", + NUM_SPLITS.EXACTLY_ONE, + new Interval("chr1", 2700000, 2800000) + }, // chosen to fall in one split + {"HiSeq.10000.vcf.bgz", NUM_SPLITS.MORE_THAN_ONE, null}, + { + "HiSeq.10000.vcf.bgz", NUM_SPLITS.EXACTLY_ONE, new Interval("chr1", 2700000, 2800000) + } // chosen to fall in one split }); + } + + @Before + public void setup() + throws IOException, NoSuchMethodException, IllegalAccessException, InvocationTargetException, + InstantiationException, InterruptedException, NoSuchFieldException { + Configuration conf = new Configuration(); + String input_file = ClassLoader.getSystemClassLoader().getResource(filename).getFile(); + conf.set("hadoopbam.vcf.trust-exts", "true"); + conf.set("mapred.input.dir", "file://" + input_file); + conf.setStrings( + "io.compression.codecs", + BGZFEnhancedGzipCodec.class.getCanonicalName(), + BGZFCodec.class.getCanonicalName()); + conf.setInt(FileInputFormat.SPLIT_MAXSIZE, 100 * 1024); // 100K + + if (interval != null) { + VCFInputFormat.setIntervals(conf, ImmutableList.of(interval)); } - @Before - public void setup() throws IOException, NoSuchMethodException, IllegalAccessException, InvocationTargetException, InstantiationException, InterruptedException, NoSuchFieldException { - Configuration conf = new Configuration(); - String input_file = ClassLoader.getSystemClassLoader().getResource(filename).getFile(); - conf.set("hadoopbam.vcf.trust-exts", "true"); - conf.set("mapred.input.dir", "file://" + input_file); - conf.setStrings("io.compression.codecs", BGZFEnhancedGzipCodec.class.getCanonicalName(), - BGZFCodec.class.getCanonicalName()); - conf.setInt(FileInputFormat.SPLIT_MAXSIZE, 100 * 1024); // 100K - - if (interval != null) { - VCFInputFormat.setIntervals(conf, ImmutableList.of(interval)); - } - - taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); - JobContext ctx = new JobContextImpl(conf, taskAttemptContext.getJobID()); - - VCFInputFormat inputFormat = new VCFInputFormat(conf); - List splits = inputFormat.getSplits(ctx); - switch (expectedSplits) { - case EXACTLY_ONE: - assertEquals("Should be exactly one split", 1, splits.size()); - break; - case MORE_THAN_ONE: - assertTrue("Should be more than one split", splits.size() > 1); - break; - case ANY: - default: - break; - } - readers = new ArrayList<>(); - for (InputSplit split : splits) { - RecordReader reader = inputFormat.createRecordReader(split, taskAttemptContext); - reader.initialize(split, taskAttemptContext); - readers.add(reader); - } + taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); + JobContext ctx = new 
JobContextImpl(conf, taskAttemptContext.getJobID()); + + VCFInputFormat inputFormat = new VCFInputFormat(conf); + List splits = inputFormat.getSplits(ctx); + switch (expectedSplits) { + case EXACTLY_ONE: + assertEquals("Should be exactly one split", 1, splits.size()); + break; + case MORE_THAN_ONE: + assertTrue("Should be more than one split", splits.size() > 1); + break; + case ANY: + default: + break; } - - @Test - public void countEntries() throws Exception { - VCFFileReader vcfFileReader = - new VCFFileReader(new File("src/test/resources/" + filename), false); - Iterator variantIterator; - if (interval == null) { - variantIterator = vcfFileReader.iterator(); - } else { - variantIterator = vcfFileReader.query(interval.getContig(), - interval.getStart(), interval.getEnd()); - } - int expectedCount = Iterators.size(variantIterator); - - int counter = 0; - for (RecordReader reader : readers) { - while (reader.nextKeyValue()) { - writable = reader.getCurrentValue(); - assertNotNull(writable); - VariantContext vc = writable.get(); - assertNotNull(vc); - String value = vc.toString(); - assertNotNull(value); - counter++; - } - } - assertEquals(expectedCount, counter); + readers = new ArrayList<>(); + for (InputSplit split : splits) { + RecordReader reader = + inputFormat.createRecordReader(split, taskAttemptContext); + reader.initialize(split, taskAttemptContext); + readers.add(reader); } + } + + @Test + public void countEntries() throws Exception { + VCFFileReader vcfFileReader = + new VCFFileReader(new File("src/test/resources/" + filename), false); + Iterator variantIterator; + if (interval == null) { + variantIterator = vcfFileReader.iterator(); + } else { + variantIterator = + vcfFileReader.query(interval.getContig(), interval.getStart(), interval.getEnd()); + } + int expectedCount = Iterators.size(variantIterator); - @Test - public void testFirstSecond() throws Exception { - if (!filename.startsWith("test.")) { - return; - } - RecordReader reader = readers.get(0); - if (!reader.nextKeyValue()) - throw new Exception("could not read first VariantContext"); - + int counter = 0; + for (RecordReader reader : readers) { + while (reader.nextKeyValue()) { writable = reader.getCurrentValue(); assertNotNull(writable); VariantContext vc = writable.get(); assertNotNull(vc); + String value = vc.toString(); + assertNotNull(value); + counter++; + } + } + assertEquals(expectedCount, counter); + } - assertEquals("20", vc.getContig()); - assertEquals(14370, vc.getStart()); - assertEquals(14370, vc.getEnd()); - assertEquals("G", vc.getReference().getBaseString()); - assertEquals("A", vc.getAlternateAllele(0).getBaseString()); - - assertTrue("second VariantContext", reader.nextKeyValue()); - - writable = reader.getCurrentValue(); - assertNotNull(writable); - vc = writable.get(); - assertNotNull(vc); - - assertEquals("20", vc.getContig()); - assertEquals(17330, vc.getStart()); - assertEquals(17330, vc.getEnd()); - assertEquals("T", vc.getReference().getBaseString()); - assertEquals("A", vc.getAlternateAllele(0).getBaseString()); + @Test + public void testFirstSecond() throws Exception { + if (!filename.startsWith("test.")) { + return; } + RecordReader reader = readers.get(0); + if (!reader.nextKeyValue()) { + throw new Exception("could not read first VariantContext"); + } + + writable = reader.getCurrentValue(); + assertNotNull(writable); + VariantContext vc = writable.get(); + assertNotNull(vc); + + assertEquals("20", vc.getContig()); + assertEquals(14370, vc.getStart()); + assertEquals(14370, vc.getEnd()); 
+ assertEquals("G", vc.getReference().getBaseString()); + assertEquals("A", vc.getAlternateAllele(0).getBaseString()); + + assertTrue("second VariantContext", reader.nextKeyValue()); + + writable = reader.getCurrentValue(); + assertNotNull(writable); + vc = writable.get(); + assertNotNull(vc); + + assertEquals("20", vc.getContig()); + assertEquals(17330, vc.getStart()); + assertEquals(17330, vc.getEnd()); + assertEquals("T", vc.getReference().getBaseString()); + assertEquals("A", vc.getAlternateAllele(0).getBaseString()); + } + + enum NUM_SPLITS { + ANY, + EXACTLY_ONE, + MORE_THAN_ONE + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormatStringency.java b/src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormatStringency.java index a2f381c..7578191 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormatStringency.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestVCFInputFormatStringency.java @@ -20,6 +20,10 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mockito.Mockito.mock; + import htsjdk.samtools.ValidationStringency; import htsjdk.tribble.TribbleException; import htsjdk.variant.variantcontext.VariantContext; @@ -35,65 +39,62 @@ import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.mockito.Mockito.mock; - public class TestVCFInputFormatStringency { - public void checkReading(ValidationStringency validationStringency) throws Exception { - String filename = "invalid_info_field.vcf"; - Configuration conf = new Configuration(); - String input_file = ClassLoader.getSystemClassLoader().getResource(filename).getFile(); - conf.set("mapred.input.dir", "file://" + input_file); + public void checkReading(ValidationStringency validationStringency) throws Exception { + String filename = "invalid_info_field.vcf"; + Configuration conf = new Configuration(); + String input_file = ClassLoader.getSystemClassLoader().getResource(filename).getFile(); + conf.set("mapred.input.dir", "file://" + input_file); - if (validationStringency != null) { - VCFRecordReader.setValidationStringency(conf, validationStringency); - } + if (validationStringency != null) { + VCFRecordReader.setValidationStringency(conf, validationStringency); + } - TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); - JobContext ctx = new JobContextImpl(conf, taskAttemptContext.getJobID()); + TaskAttemptContext taskAttemptContext = + new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); + JobContext ctx = new JobContextImpl(conf, taskAttemptContext.getJobID()); - VCFInputFormat inputFormat = new VCFInputFormat(conf); - List splits = inputFormat.getSplits(ctx); - assertEquals(1, splits.size()); - RecordReader reader = - inputFormat.createRecordReader(splits.get(0), taskAttemptContext); - int counter = 0; - while (reader.nextKeyValue()) { - VariantContextWritable writable = reader.getCurrentValue(); - assertNotNull(writable); - VariantContext vc = writable.get(); - assertNotNull(vc); - String value = vc.toString(); - assertNotNull(value); - counter++; - } - assertEquals(4, counter); + VCFInputFormat inputFormat = new VCFInputFormat(conf); + List splits = inputFormat.getSplits(ctx); + assertEquals(1, splits.size()); + RecordReader reader = + inputFormat.createRecordReader(splits.get(0), taskAttemptContext); + int 
counter = 0; + while (reader.nextKeyValue()) { + VariantContextWritable writable = reader.getCurrentValue(); + assertNotNull(writable); + VariantContext vc = writable.get(); + assertNotNull(vc); + String value = vc.toString(); + assertNotNull(value); + counter++; } + assertEquals(4, counter); + } - @Test(expected = TribbleException.class) - public void testUnset() throws Exception { - checkReading(null); // defaults to strict - } + @Test(expected = TribbleException.class) + public void testUnset() throws Exception { + checkReading(null); // defaults to strict + } - @Test(expected = TribbleException.class) - public void testDefault() throws Exception { - checkReading(ValidationStringency.DEFAULT_STRINGENCY); // defaults to strict - } + @Test(expected = TribbleException.class) + public void testDefault() throws Exception { + checkReading(ValidationStringency.DEFAULT_STRINGENCY); // defaults to strict + } - @Test - public void testSilent() throws Exception { - checkReading(ValidationStringency.SILENT); - } + @Test + public void testSilent() throws Exception { + checkReading(ValidationStringency.SILENT); + } - @Test - public void testLenient() throws Exception { - checkReading(ValidationStringency.LENIENT); - } + @Test + public void testLenient() throws Exception { + checkReading(ValidationStringency.LENIENT); + } - @Test(expected = TribbleException.class) - public void testStrict() throws Exception { - checkReading(ValidationStringency.STRICT); - } + @Test(expected = TribbleException.class) + public void testStrict() throws Exception { + checkReading(ValidationStringency.STRICT); + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestVCFOutputFormat.java b/src/test/java/org/seqdoop/hadoop_bam/TestVCFOutputFormat.java index 4002f1a..f698394 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestVCFOutputFormat.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestVCFOutputFormat.java @@ -20,14 +20,14 @@ package org.seqdoop.hadoop_bam; -import java.io.*; -import java.lang.reflect.InvocationTargetException; -import java.util.*; +import static org.mockito.Mockito.mock; import htsjdk.samtools.seekablestream.SeekableFileStream; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.*; - +import java.io.*; +import java.lang.reflect.InvocationTargetException; +import java.util.*; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; @@ -35,150 +35,163 @@ import org.apache.hadoop.mapred.TaskAttemptID; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; - import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; - import org.seqdoop.hadoop_bam.util.VCFHeaderReader; -import static org.mockito.Mockito.mock; - public class TestVCFOutputFormat { - private VariantContextWritable writable; - private RecordWriter writer; - private TaskAttemptContext taskAttemptContext; - private File test_vcf_output; - - @Before - public void setup() throws IOException, NoSuchMethodException, IllegalAccessException, InvocationTargetException, InstantiationException { - test_vcf_output = File.createTempFile("test_vcf_output", ""); - test_vcf_output.delete(); - writable = new VariantContextWritable(); - Configuration conf = new Configuration(); - conf.set("hadoopbam.vcf.output-format", "VCF"); - KeyIgnoringVCFOutputFormat outputFormat = new KeyIgnoringVCFOutputFormat(conf); - 
outputFormat.setHeader(readHeader()); - taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); - writer = outputFormat.getRecordWriter(taskAttemptContext, new Path("file://" + test_vcf_output)); - } - @After - public void cleanup() throws IOException { - FileUtil.fullyDelete(test_vcf_output); + private VariantContextWritable writable; + private RecordWriter writer; + private TaskAttemptContext taskAttemptContext; + private File test_vcf_output; + + @Before + public void setup() + throws IOException, NoSuchMethodException, IllegalAccessException, InvocationTargetException, + InstantiationException { + test_vcf_output = File.createTempFile("test_vcf_output", ""); + test_vcf_output.delete(); + writable = new VariantContextWritable(); + Configuration conf = new Configuration(); + conf.set("hadoopbam.vcf.output-format", "VCF"); + KeyIgnoringVCFOutputFormat outputFormat = new KeyIgnoringVCFOutputFormat(conf); + outputFormat.setHeader(readHeader()); + taskAttemptContext = new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class)); + writer = + outputFormat.getRecordWriter(taskAttemptContext, new Path("file://" + test_vcf_output)); + } + + @After + public void cleanup() throws IOException { + FileUtil.fullyDelete(test_vcf_output); + } + + private void skipHeader(LineNumberReader reader) throws IOException { + String line = reader.readLine(); + while (line.startsWith("#")) { + reader.mark(1000); + line = reader.readLine(); } - - private void skipHeader(LineNumberReader reader) throws IOException { - String line = reader.readLine(); - while (line.startsWith("#")) { - reader.mark(1000); - line = reader.readLine(); - } - reader.reset(); - } - - @Test - public void testSimple() throws Exception { - VariantContextBuilder vctx_builder = new VariantContextBuilder(); - - ArrayList alleles = new ArrayList(); - alleles.add(Allele.create("A", false)); - alleles.add(Allele.create("C", true)); - vctx_builder.alleles(alleles); - - GenotypesContext genotypes = GenotypesContext.NO_GENOTYPES; - vctx_builder.genotypes(genotypes); - - HashSet filters = new HashSet(); - vctx_builder.filters(filters); - - HashMap attributes = new HashMap(); - attributes.put("NS", new Integer(4)); - vctx_builder.attributes(attributes); - - vctx_builder.loc("20", 2, 2); - vctx_builder.log10PError(-8.0); - - String[] expected = new String[]{"20", "2", ".", "C", "A", "80", "PASS", "NS=4"}; - - VariantContext ctx = vctx_builder.make(); - writable.set(ctx); - writer.write(1L, writable); - writer.close(taskAttemptContext); - - LineNumberReader reader = new LineNumberReader(new FileReader(test_vcf_output)); - skipHeader(reader); - String[] fields = Arrays.copyOf(reader.readLine().split("\t"), expected.length); - Assert.assertArrayEquals("comparing VCF single line", expected, fields); - } - - @Test - public void testVariantContextReadWrite() throws IOException, InterruptedException - { - // This is to check whether issue https://github.com/HadoopGenomics/Hadoop-BAM/issues/1 has been - // resolved - VariantContextBuilder vctx_builder = new VariantContextBuilder(); - - ArrayList alleles = new ArrayList(); - alleles.add(Allele.create("C", false)); - alleles.add(Allele.create("G", true)); - vctx_builder.alleles(alleles); - - ArrayList genotypes = new ArrayList(); - GenotypeBuilder builder = new GenotypeBuilder(); - genotypes.add(builder.alleles(alleles.subList(0, 1)).name("NA00001").GQ(48).DP(1).make()); - genotypes.add(builder.alleles(alleles.subList(0, 1)).name("NA00002").GQ(42).DP(2).make()); - 
genotypes.add(builder.alleles(alleles.subList(0, 1)).name("NA00003").GQ(39).DP(3).make()); - vctx_builder.genotypes(genotypes); - - HashSet filters = new HashSet(); - vctx_builder.filters(filters); - - HashMap attributes = new HashMap(); - attributes.put("NS", new Integer(4)); - vctx_builder.attributes(attributes); - - vctx_builder.loc("20", 2, 2); - vctx_builder.log10PError(-8.0); - - VariantContext ctx = vctx_builder.make(); - VariantContextWithHeader ctxh = new VariantContextWithHeader(ctx, readHeader()); - writable.set(ctxh); - - DataOutputBuffer out = new DataOutputBuffer(1000); - writable.write(out); - - byte[] data = out.getData(); - ByteArrayInputStream bis = new ByteArrayInputStream(data); - - writable = new VariantContextWritable(); - writable.readFields(new DataInputStream(bis)); - - VariantContext vc = writable.get(); - Assert.assertArrayEquals("comparing Alleles",ctx.getAlleles().toArray(),vc.getAlleles().toArray()); - Assert.assertEquals("comparing Log10PError",ctx.getLog10PError(),vc.getLog10PError(),0.01); - Assert.assertArrayEquals("comparing Filters",ctx.getFilters().toArray(),vc.getFilters().toArray()); - Assert.assertEquals("comparing Attributes",ctx.getAttributes(),vc.getAttributes()); - - // Now check the genotypes. Note: we need to make the header accessible before decoding the genotypes. - GenotypesContext gc = vc.getGenotypes(); - assert(gc instanceof LazyVCFGenotypesContext); - LazyVCFGenotypesContext.HeaderDataCache headerDataCache = new LazyVCFGenotypesContext.HeaderDataCache(); - headerDataCache.setHeader(readHeader()); - ((LazyVCFGenotypesContext) gc).getParser().setHeaderDataCache(headerDataCache); - - for (Genotype genotype : genotypes) { - Assert.assertEquals("checking genotype name", genotype.getSampleName(), gc.get(genotypes.indexOf(genotype)).getSampleName()); - Assert.assertEquals("checking genotype quality", genotype.getGQ(), gc.get(genotypes.indexOf(genotype)).getGQ()); - Assert.assertEquals("checking genotype read depth", genotype.getDP(), gc.get(genotypes.indexOf(genotype)).getDP()); - } - } - - private VCFHeader readHeader() throws IOException { - String header_file = ClassLoader.getSystemClassLoader().getResource("test.vcf").getFile(); - VCFHeader header = VCFHeaderReader.readHeaderFrom(new SeekableFileStream(new File(header_file))); - return header; + reader.reset(); + } + + @Test + public void testSimple() throws Exception { + VariantContextBuilder vctx_builder = new VariantContextBuilder(); + + ArrayList alleles = new ArrayList(); + alleles.add(Allele.create("A", false)); + alleles.add(Allele.create("C", true)); + vctx_builder.alleles(alleles); + + GenotypesContext genotypes = GenotypesContext.NO_GENOTYPES; + vctx_builder.genotypes(genotypes); + + HashSet filters = new HashSet(); + vctx_builder.filters(filters); + + HashMap attributes = new HashMap(); + attributes.put("NS", new Integer(4)); + vctx_builder.attributes(attributes); + + vctx_builder.loc("20", 2, 2); + vctx_builder.log10PError(-8.0); + + String[] expected = new String[] {"20", "2", ".", "C", "A", "80", "PASS", "NS=4"}; + + VariantContext ctx = vctx_builder.make(); + writable.set(ctx); + writer.write(1L, writable); + writer.close(taskAttemptContext); + + LineNumberReader reader = new LineNumberReader(new FileReader(test_vcf_output)); + skipHeader(reader); + String[] fields = Arrays.copyOf(reader.readLine().split("\t"), expected.length); + Assert.assertArrayEquals("comparing VCF single line", expected, fields); + } + + @Test + public void testVariantContextReadWrite() throws 
IOException, InterruptedException { + // This is to check whether issue https://github.com/HadoopGenomics/Hadoop-BAM/issues/1 has been + // resolved + VariantContextBuilder vctx_builder = new VariantContextBuilder(); + + ArrayList alleles = new ArrayList(); + alleles.add(Allele.create("C", false)); + alleles.add(Allele.create("G", true)); + vctx_builder.alleles(alleles); + + ArrayList genotypes = new ArrayList(); + GenotypeBuilder builder = new GenotypeBuilder(); + genotypes.add(builder.alleles(alleles.subList(0, 1)).name("NA00001").GQ(48).DP(1).make()); + genotypes.add(builder.alleles(alleles.subList(0, 1)).name("NA00002").GQ(42).DP(2).make()); + genotypes.add(builder.alleles(alleles.subList(0, 1)).name("NA00003").GQ(39).DP(3).make()); + vctx_builder.genotypes(genotypes); + + HashSet filters = new HashSet(); + vctx_builder.filters(filters); + + HashMap attributes = new HashMap(); + attributes.put("NS", new Integer(4)); + vctx_builder.attributes(attributes); + + vctx_builder.loc("20", 2, 2); + vctx_builder.log10PError(-8.0); + + VariantContext ctx = vctx_builder.make(); + VariantContextWithHeader ctxh = new VariantContextWithHeader(ctx, readHeader()); + writable.set(ctxh); + + DataOutputBuffer out = new DataOutputBuffer(1000); + writable.write(out); + + byte[] data = out.getData(); + ByteArrayInputStream bis = new ByteArrayInputStream(data); + + writable = new VariantContextWritable(); + writable.readFields(new DataInputStream(bis)); + + VariantContext vc = writable.get(); + Assert.assertArrayEquals( + "comparing Alleles", ctx.getAlleles().toArray(), vc.getAlleles().toArray()); + Assert.assertEquals("comparing Log10PError", ctx.getLog10PError(), vc.getLog10PError(), 0.01); + Assert.assertArrayEquals( + "comparing Filters", ctx.getFilters().toArray(), vc.getFilters().toArray()); + Assert.assertEquals("comparing Attributes", ctx.getAttributes(), vc.getAttributes()); + + // Now check the genotypes. Note: we need to make the header accessible before decoding the + // genotypes. 
+ GenotypesContext gc = vc.getGenotypes(); + assert (gc instanceof LazyVCFGenotypesContext); + LazyVCFGenotypesContext.HeaderDataCache headerDataCache = + new LazyVCFGenotypesContext.HeaderDataCache(); + headerDataCache.setHeader(readHeader()); + ((LazyVCFGenotypesContext) gc).getParser().setHeaderDataCache(headerDataCache); + + for (Genotype genotype : genotypes) { + Assert.assertEquals( + "checking genotype name", + genotype.getSampleName(), + gc.get(genotypes.indexOf(genotype)).getSampleName()); + Assert.assertEquals( + "checking genotype quality", + genotype.getGQ(), + gc.get(genotypes.indexOf(genotype)).getGQ()); + Assert.assertEquals( + "checking genotype read depth", + genotype.getDP(), + gc.get(genotypes.indexOf(genotype)).getDP()); } + } + + private VCFHeader readHeader() throws IOException { + String header_file = ClassLoader.getSystemClassLoader().getResource("test.vcf").getFile(); + VCFHeader header = + VCFHeaderReader.readHeaderFrom(new SeekableFileStream(new File(header_file))); + return header; + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/TestVCFRoundTrip.java b/src/test/java/org/seqdoop/hadoop_bam/TestVCFRoundTrip.java index 0f61172..ca58f31 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/TestVCFRoundTrip.java +++ b/src/test/java/org/seqdoop/hadoop_bam/TestVCFRoundTrip.java @@ -20,6 +20,9 @@ package org.seqdoop.hadoop_bam; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import com.google.common.collect.Iterators; import com.google.common.io.Files; import htsjdk.samtools.seekablestream.SeekableFileStream; @@ -57,211 +60,209 @@ import org.seqdoop.hadoop_bam.util.VCFFileMerger; import org.seqdoop.hadoop_bam.util.VCFHeaderReader; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - @RunWith(Parameterized.class) public class TestVCFRoundTrip { - // VCF output format that writes a header before records - static class VCFTestWithHeaderOutputFormat - extends KeyIgnoringVCFOutputFormat { - public final static String READ_HEADER_FROM_FILE = "TestVCF.header"; - - public VCFTestWithHeaderOutputFormat() { - super(VCFFormat.VCF); - } - - @Override - public RecordWriter getRecordWriter( - TaskAttemptContext ctx) throws IOException { - Path vcfPath = new Path(conf.get(READ_HEADER_FROM_FILE)); - readHeaderFrom(vcfPath, vcfPath.getFileSystem(conf)); - return super.getRecordWriter(ctx); - } + private static Configuration conf; + private String testVCFFileName; + private Class codecClass; + private NUM_SPLITS expectedSplits; + + public TestVCFRoundTrip( + String filename, Class codecClass, NUM_SPLITS expectedSplits) { + testVCFFileName = ClassLoader.getSystemClassLoader().getResource(filename).getFile(); + this.codecClass = codecClass; + this.expectedSplits = expectedSplits; + } + + @Parameterized.Parameters + public static Collection data() { + return Arrays.asList( + new Object[][] { + {"test.vcf", null, NUM_SPLITS.ANY}, + {"test.vcf.gz", BGZFEnhancedGzipCodec.class, NUM_SPLITS.EXACTLY_ONE}, + {"test.vcf.bgzf.gz", BGZFCodec.class, NUM_SPLITS.ANY}, + {"test.vcf.bgz", BGZFCodec.class, NUM_SPLITS.ANY}, + {"HiSeq.10000.vcf", null, NUM_SPLITS.MORE_THAN_ONE}, + {"HiSeq.10000.vcf.gz", BGZFEnhancedGzipCodec.class, NUM_SPLITS.EXACTLY_ONE}, + {"HiSeq.10000.vcf.bgzf.gz", BGZFCodec.class, NUM_SPLITS.MORE_THAN_ONE}, + {"HiSeq.10000.vcf.bgz", BGZFCodec.class, NUM_SPLITS.MORE_THAN_ONE} + }); + } + + private static VCFFileReader parseVcf(File vcf) throws IOException { + File actualVcf; + // work around 
TribbleIndexedFeatureReader not reading header from .bgz files + if (vcf.getName().endsWith(".bgz")) { + actualVcf = File.createTempFile(vcf.getName(), ".gz"); + actualVcf.deleteOnExit(); + Files.copy(vcf, actualVcf); + } else { + actualVcf = vcf; + } + return new VCFFileReader(actualVcf, false); + } + + @Before + public void setup() throws Exception { + conf = new Configuration(); + conf.set(VCFTestWithHeaderOutputFormat.READ_HEADER_FROM_FILE, testVCFFileName); + conf.setStrings( + "io.compression.codecs", + BGZFCodec.class.getCanonicalName(), + BGZFEnhancedGzipCodec.class.getCanonicalName()); + conf.setInt(FileInputFormat.SPLIT_MAXSIZE, 100 * 1024); // 100K + } + + @Test + public void testRoundTrip() throws Exception { + Path vcfPath = new Path("file://" + testVCFFileName); + + // run a MR job to write out a VCF file + Path outputPath = doMapReduce(vcfPath, true); + + // verify the output is the same as the input + List expectedVariants = new ArrayList<>(); + VCFFileReader vcfFileReader = parseVcf(new File(testVCFFileName)); + Iterators.addAll(expectedVariants, vcfFileReader.iterator()); + + int splits = 0; + List actualVariants = new ArrayList<>(); + File[] vcfFiles = + new File(outputPath.toUri()) + .listFiles( + pathname -> + (!pathname.getName().startsWith(".") && !pathname.getName().startsWith("_"))); + Arrays.sort(vcfFiles); // ensure files are sorted by name + for (File vcf : vcfFiles) { + splits++; + Iterators.addAll(actualVariants, parseVcf(vcf).iterator()); + if (BGZFCodec.class.equals(codecClass)) { + assertTrue( + BlockCompressedInputStream.isValidFile( + new BufferedInputStream(new FileInputStream(vcf)))); + } else if (BGZFEnhancedGzipCodec.class.equals(codecClass)) { + assertTrue(VCFFormat.isGzip(new BufferedInputStream(new FileInputStream(vcf)))); + } } - // VCF output format that doesn't write a header before records - static class VCFTestNoHeaderOutputFormat - extends KeyIgnoringVCFOutputFormat { - public final static String READ_HEADER_FROM_FILE = "TestVCF.header"; - - public VCFTestNoHeaderOutputFormat() { - super(VCFFormat.VCF); - } - - @Override - public RecordWriter getRecordWriter( - TaskAttemptContext ctx) throws IOException { - Path vcfPath = new Path(conf.get(READ_HEADER_FROM_FILE)); - readHeaderFrom(vcfPath, vcfPath.getFileSystem(conf)); - ctx.getConfiguration().setBoolean(WRITE_HEADER_PROPERTY, false); - return super.getRecordWriter(ctx); - } + switch (expectedSplits) { + case EXACTLY_ONE: + assertEquals("Should be exactly one split", 1, splits); + break; + case MORE_THAN_ONE: + assertTrue("Should be more than one split", splits > 1); + break; + case ANY: + default: + break; } - @Parameterized.Parameters - public static Collection data() { - return Arrays.asList(new Object[][] { - {"test.vcf", null, NUM_SPLITS.ANY}, - {"test.vcf.gz", BGZFEnhancedGzipCodec.class, NUM_SPLITS.EXACTLY_ONE}, - {"test.vcf.bgzf.gz", BGZFCodec.class, NUM_SPLITS.ANY}, - {"test.vcf.bgz", BGZFCodec.class, NUM_SPLITS.ANY}, - {"HiSeq.10000.vcf", null, NUM_SPLITS.MORE_THAN_ONE}, - {"HiSeq.10000.vcf.gz", BGZFEnhancedGzipCodec.class, NUM_SPLITS.EXACTLY_ONE}, - {"HiSeq.10000.vcf.bgzf.gz", BGZFCodec.class, NUM_SPLITS.MORE_THAN_ONE}, - {"HiSeq.10000.vcf.bgz", BGZFCodec.class, NUM_SPLITS.MORE_THAN_ONE} - }); + // use a VariantContextComparator to check variants are equal + VCFHeader vcfHeader = + VCFHeaderReader.readHeaderFrom(new SeekableFileStream(new File(testVCFFileName))); + VariantContextComparator vcfRecordComparator = vcfHeader.getVCFRecordComparator(); + 
assertEquals(expectedVariants.size(), actualVariants.size()); + for (int i = 0; i < expectedVariants.size(); i++) { + assertEquals(0, vcfRecordComparator.compare(expectedVariants.get(i), actualVariants.get(i))); + } + } + + @Test + public void testRoundTripWithMerge() throws Exception { + Path vcfPath = new Path("file://" + testVCFFileName); + + // run a MR job to write out a VCF file + Path outputPath = doMapReduce(vcfPath, false); + + // merge the output + VCFHeader vcfHeader = + VCFHeaderReader.readHeaderFrom(new SeekableFileStream(new File(testVCFFileName))); + final File outFile = + File.createTempFile( + "testVCFWriter", testVCFFileName.substring(testVCFFileName.lastIndexOf("."))); + outFile.deleteOnExit(); + VCFFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(), vcfHeader); + List actualVariants = new ArrayList<>(); + VCFFileReader vcfFileReaderActual = parseVcf(outFile); + Iterators.addAll(actualVariants, vcfFileReaderActual.iterator()); + + // verify the output is the same as the input + List expectedVariants = new ArrayList<>(); + VCFFileReader vcfFileReader = parseVcf(new File(testVCFFileName)); + Iterators.addAll(expectedVariants, vcfFileReader.iterator()); + + // use a VariantContextComparator to check variants are equal + VariantContextComparator vcfRecordComparator = vcfHeader.getVCFRecordComparator(); + assertEquals(expectedVariants.size(), actualVariants.size()); + for (int i = 0; i < expectedVariants.size(); i++) { + assertEquals(0, vcfRecordComparator.compare(expectedVariants.get(i), actualVariants.get(i))); } + } - private static Configuration conf; + private Path doMapReduce(final Path inputPath, final boolean writeHeader) throws Exception { + final FileSystem fileSystem = FileSystem.get(conf); + final Path outputPath = fileSystem.makeQualified(new Path("target/out")); + fileSystem.delete(outputPath, true); - private String testVCFFileName; - private Class codecClass; - private NUM_SPLITS expectedSplits; + final Job job = Job.getInstance(conf); + FileInputFormat.setInputPaths(job, inputPath); - public TestVCFRoundTrip(String filename, Class codecClass, - NUM_SPLITS expectedSplits) { - testVCFFileName = ClassLoader.getSystemClassLoader().getResource(filename).getFile(); - this.codecClass = codecClass; - this.expectedSplits = expectedSplits; - } + job.setInputFormatClass(VCFInputFormat.class); + job.setMapOutputKeyClass(LongWritable.class); + job.setMapOutputValueClass(VariantContextWritable.class); - @Before - public void setup() throws Exception { - conf = new Configuration(); - conf.set(VCFTestWithHeaderOutputFormat.READ_HEADER_FROM_FILE, testVCFFileName); - conf.setStrings("io.compression.codecs", BGZFCodec.class.getCanonicalName(), - BGZFEnhancedGzipCodec.class.getCanonicalName()); - conf.setInt(FileInputFormat.SPLIT_MAXSIZE, 100 * 1024); // 100K - } + job.setOutputFormatClass( + writeHeader ? 
VCFTestWithHeaderOutputFormat.class : VCFTestNoHeaderOutputFormat.class); + job.setOutputKeyClass(LongWritable.class); + job.setOutputValueClass(VariantContextWritable.class); - @Test - public void testRoundTrip() throws Exception { - Path vcfPath = new Path("file://" + testVCFFileName); - - // run a MR job to write out a VCF file - Path outputPath = doMapReduce(vcfPath, true); - - // verify the output is the same as the input - List expectedVariants = new ArrayList<>(); - VCFFileReader vcfFileReader = parseVcf(new File(testVCFFileName)); - Iterators.addAll(expectedVariants, vcfFileReader.iterator()); - - int splits = 0; - List actualVariants = new ArrayList<>(); - File[] vcfFiles = new File(outputPath.toUri()).listFiles( - pathname -> (!pathname.getName().startsWith(".") && - !pathname.getName().startsWith("_"))); - Arrays.sort(vcfFiles); // ensure files are sorted by name - for (File vcf : vcfFiles) { - splits++; - Iterators.addAll(actualVariants, parseVcf(vcf).iterator()); - if (BGZFCodec.class.equals(codecClass)) { - assertTrue(BlockCompressedInputStream.isValidFile( - new BufferedInputStream(new FileInputStream(vcf)))); - } else if (BGZFEnhancedGzipCodec.class.equals(codecClass)) { - assertTrue(VCFFormat.isGzip( - new BufferedInputStream(new FileInputStream(vcf)))); - } - } - - switch (expectedSplits) { - case EXACTLY_ONE: - assertEquals("Should be exactly one split", 1, splits); - break; - case MORE_THAN_ONE: - assertTrue("Should be more than one split", splits > 1); - break; - case ANY: - default: - break; - } - - // use a VariantContextComparator to check variants are equal - VCFHeader vcfHeader = VCFHeaderReader.readHeaderFrom(new SeekableFileStream(new - File(testVCFFileName))); - VariantContextComparator vcfRecordComparator = vcfHeader.getVCFRecordComparator(); - assertEquals(expectedVariants.size(), actualVariants.size()); - for (int i = 0; i < expectedVariants.size(); i++) { - assertEquals(0, vcfRecordComparator.compare(expectedVariants.get(i), - actualVariants.get(i))); - } + job.setNumReduceTasks(0); + FileOutputFormat.setOutputPath(job, outputPath); + if (codecClass != null) { + FileOutputFormat.setOutputCompressorClass(job, codecClass); } - @Test - public void testRoundTripWithMerge() throws Exception { - Path vcfPath = new Path("file://" + testVCFFileName); - - // run a MR job to write out a VCF file - Path outputPath = doMapReduce(vcfPath, false); - - // merge the output - VCFHeader vcfHeader = VCFHeaderReader.readHeaderFrom(new SeekableFileStream(new - File(testVCFFileName))); - final File outFile = File.createTempFile("testVCFWriter", - testVCFFileName.substring(testVCFFileName.lastIndexOf("."))); - outFile.deleteOnExit(); - VCFFileMerger.mergeParts(outputPath.toUri().toString(), outFile.toURI().toString(), - vcfHeader); - List actualVariants = new ArrayList<>(); - VCFFileReader vcfFileReaderActual = parseVcf(outFile); - Iterators.addAll(actualVariants, vcfFileReaderActual.iterator()); - - // verify the output is the same as the input - List expectedVariants = new ArrayList<>(); - VCFFileReader vcfFileReader = parseVcf(new File(testVCFFileName)); - Iterators.addAll(expectedVariants, vcfFileReader.iterator()); - - // use a VariantContextComparator to check variants are equal - VariantContextComparator vcfRecordComparator = vcfHeader.getVCFRecordComparator(); - assertEquals(expectedVariants.size(), actualVariants.size()); - for (int i = 0; i < expectedVariants.size(); i++) { - assertEquals(0, vcfRecordComparator.compare(expectedVariants.get(i), - actualVariants.get(i))); 
- } - } + final boolean success = job.waitForCompletion(true); + assertTrue(success); + + return outputPath; + } - private Path doMapReduce(final Path inputPath, final boolean writeHeader) - throws Exception { - final FileSystem fileSystem = FileSystem.get(conf); - final Path outputPath = fileSystem.makeQualified(new Path("target/out")); - fileSystem.delete(outputPath, true); + // VCF output format that writes a header before records + static class VCFTestWithHeaderOutputFormat extends KeyIgnoringVCFOutputFormat { - final Job job = Job.getInstance(conf); - FileInputFormat.setInputPaths(job, inputPath); + public static final String READ_HEADER_FROM_FILE = "TestVCF.header"; - job.setInputFormatClass(VCFInputFormat.class); - job.setMapOutputKeyClass(LongWritable.class); - job.setMapOutputValueClass(VariantContextWritable.class); + public VCFTestWithHeaderOutputFormat() { + super(VCFFormat.VCF); + } - job.setOutputFormatClass(writeHeader ? VCFTestWithHeaderOutputFormat.class : - VCFTestNoHeaderOutputFormat.class); - job.setOutputKeyClass(LongWritable.class); - job.setOutputValueClass(VariantContextWritable.class); + @Override + public RecordWriter getRecordWriter( + TaskAttemptContext ctx) throws IOException { + Path vcfPath = new Path(conf.get(READ_HEADER_FROM_FILE)); + readHeaderFrom(vcfPath, vcfPath.getFileSystem(conf)); + return super.getRecordWriter(ctx); + } + } - job.setNumReduceTasks(0); - FileOutputFormat.setOutputPath(job, outputPath); - if (codecClass != null) { - FileOutputFormat.setOutputCompressorClass(job, codecClass); - } + // VCF output format that doesn't write a header before records + static class VCFTestNoHeaderOutputFormat extends KeyIgnoringVCFOutputFormat { - final boolean success = job.waitForCompletion(true); - assertTrue(success); + public static final String READ_HEADER_FROM_FILE = "TestVCF.header"; - return outputPath; + public VCFTestNoHeaderOutputFormat() { + super(VCFFormat.VCF); } - private static VCFFileReader parseVcf(File vcf) throws IOException { - File actualVcf; - // work around TribbleIndexedFeatureReader not reading header from .bgz files - if (vcf.getName().endsWith(".bgz")) { - actualVcf = File.createTempFile(vcf.getName(), ".gz"); - actualVcf.deleteOnExit(); - Files.copy(vcf, actualVcf); - } else { - actualVcf = vcf; - } - return new VCFFileReader(actualVcf, false); + @Override + public RecordWriter getRecordWriter( + TaskAttemptContext ctx) throws IOException { + Path vcfPath = new Path(conf.get(READ_HEADER_FROM_FILE)); + readHeaderFrom(vcfPath, vcfPath.getFileSystem(conf)); + ctx.getConfiguration().setBoolean(WRITE_HEADER_PROPERTY, false); + return super.getRecordWriter(ctx); } + } } diff --git a/src/test/java/org/seqdoop/hadoop_bam/util/TestVCFHeaderReader.java b/src/test/java/org/seqdoop/hadoop_bam/util/TestVCFHeaderReader.java index 4fdb2b5..9fc079e 100644 --- a/src/test/java/org/seqdoop/hadoop_bam/util/TestVCFHeaderReader.java +++ b/src/test/java/org/seqdoop/hadoop_bam/util/TestVCFHeaderReader.java @@ -1,18 +1,20 @@ package org.seqdoop.hadoop_bam.util; -import java.io.IOException; +import static org.junit.Assert.assertNotNull; import com.google.common.io.Resources; - import htsjdk.samtools.seekablestream.ByteArraySeekableStream; import htsjdk.samtools.seekablestream.SeekableStream; - +import java.io.IOException; import org.junit.Test; -import static org.junit.Assert.assertNotNull; - public class TestVCFHeaderReader { + static SeekableStream seekableStream(final String resource) throws IOException { + return new ByteArraySeekableStream( + 
Resources.toByteArray(ClassLoader.getSystemClassLoader().getResource(resource))); + } + @Test public void testReadHeaderFromVCF() throws IOException { assertNotNull(VCFHeaderReader.readHeaderFrom(seekableStream("test.vcf"))); @@ -27,8 +29,4 @@ public void testReadHeaderFromGzippedVCF() throws IOException { public void testReadHeaderFromBGZFVCF() throws IOException { assertNotNull(VCFHeaderReader.readHeaderFrom(seekableStream("test.vcf.bgzf.gz"))); } - - static SeekableStream seekableStream(final String resource) throws IOException { - return new ByteArraySeekableStream(Resources.toByteArray(ClassLoader.getSystemClassLoader().getResource(resource))); - } } diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties index 4406891..9d7e245 100644 --- a/src/test/resources/log4j.properties +++ b/src/test/resources/log4j.properties @@ -1,7 +1,5 @@ -log4j.rootLogger = WARN, out - -log4j.appender.out = org.apache.log4j.ConsoleAppender -log4j.appender.out.layout = org.apache.log4j.PatternLayout -log4j.appender.out.layout.ConversionPattern = %d (%t) [%p - %l] %m%n - +log4j.rootLogger=WARN, out +log4j.appender.out=org.apache.log4j.ConsoleAppender +log4j.appender.out.layout=org.apache.log4j.PatternLayout +log4j.appender.out.layout.ConversionPattern=%d (%t) [%p - %l] %m%n log4j.logger.org.seqdoop.hadoop_bam=DEBUG From 80d964f4a6c23ab8657b3a15b62afdfa6d33b1c8 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 5 Feb 2018 15:35:56 +0000 Subject: [PATCH 2/3] Use fmt-maven-plugin to check source complies with the Google Java Style Guide. (If it doesn't the author can manually type 'mvn fmt:format' to reformat, or manually reformat in the IDE.) --- .travis.yml | 2 +- pom.xml | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8dbe2b6..49e0a6c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ before_install: - sed -e "s/^\\(127\\.0\\.0\\.1.*\\)/\\1 $(hostname | cut -c1-63)/" /etc/hosts | sudo tee /etc/hosts - cat /etc/hosts # optionally check the content *after* -script: mvn clean test jacoco:report +script: mvn clean fmt:check test jacoco:report after_success: - python scripts/deploy/addServerToM2Settings.py diff --git a/pom.xml b/pom.xml index cc4d60c..2124c5d 100644 --- a/pom.xml +++ b/pom.xml @@ -121,6 +121,18 @@ + + com.coveo + fmt-maven-plugin + 2.2.0 + + + + check + + + + org.apache.maven.plugins maven-release-plugin From e59cdecdddc3ee1108ba57245f44471d53bd9e84 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 7 Feb 2018 16:20:21 +0000 Subject: [PATCH 3/3] Change non-public top-level classes to be nested classes. 
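The classes converted below are BGZFLimitingStream, BCFStoppableOutputStream, LazyBAMRecord,
HeaderSettableVCFCodec and WorkaroundingStream. A minimal sketch of the pattern, using
hypothetical names rather than code from this patch:

    // Before: Outer.java declares a second, package-private top-level type.
    public class Outer {
      int useHelper() { return new Helper().value(); }
    }

    class Helper {
      int value() { return 42; }
    }

    // After: the helper is a static nested class, so the file has a single top-level type
    // and the helper keeps no implicit reference to an Outer instance.
    public class Outer {
      int useHelper() { return new Helper().value(); }

      static class Helper {
        int value() { return 42; }
      }
    }
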
--- .../seqdoop/hadoop_bam/BCFRecordReader.java | 120 ++++---- .../seqdoop/hadoop_bam/BCFRecordWriter.java | 86 +++--- .../hadoop_bam/LazyBAMRecordFactory.java | 148 +++++----- .../hadoop_bam/LazyVCFGenotypesContext.java | 46 +-- .../seqdoop/hadoop_bam/SAMRecordReader.java | 268 +++++++++--------- 5 files changed, 334 insertions(+), 334 deletions(-) diff --git a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java index 6b6b626..8735230 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordReader.java @@ -177,77 +177,77 @@ public boolean nextKeyValue() throws IOException { vc.set(v); return true; } -} - -class BGZFLimitingStream extends InputStream { - private final BlockCompressedInputStream bgzf; - private final long virtEnd; - private byte[] readBuf = new byte[1]; - - public BGZFLimitingStream(BlockCompressedInputStream stream, long virtualEnd) { - bgzf = stream; - virtEnd = virtualEnd; - } + static class BGZFLimitingStream extends InputStream { - @Override - public void close() throws IOException { - bgzf.close(); - } + private final BlockCompressedInputStream bgzf; + private final long virtEnd; + private byte[] readBuf = new byte[1]; - @Override - public int read() throws IOException { - switch (read(readBuf)) { - case 1: - return readBuf[0]; - case -1: - return -1; - default: - assert false; - return -1; + public BGZFLimitingStream(BlockCompressedInputStream stream, long virtualEnd) { + bgzf = stream; + virtEnd = virtualEnd; } - } - @Override - public int read(byte[] buf, int off, int len) throws IOException { - - int totalRead = 0; - long virt; - - final int lastLen = (int) virtEnd & 0xffff; - - while ((virt = bgzf.getFilePointer()) >>> 16 != virtEnd >>> 16) { - // We're not in the last BGZF block yet. Unfortunately - // BlockCompressedInputStream doesn't expose the length of the current - // block, so we can't simply (possibly repeatedly) read the current - // block to the end. Instead, we read at most virtEnd & 0xffff at a - // time, which ensures that we can't overshoot virtEnd even if the - // next block starts immediately. - final int r = bgzf.read(buf, off, Math.min(len, lastLen)); - if (r == -1) { - return totalRead == 0 ? -1 : totalRead; - } + @Override + public void close() throws IOException { + bgzf.close(); + } - totalRead += r; - len -= r; - if (len == 0) { - return totalRead; + @Override + public int read() throws IOException { + switch (read(readBuf)) { + case 1: + return readBuf[0]; + case -1: + return -1; + default: + assert false; + return -1; } - off += r; } - // We're in the last BGZF block: read only up to lastLen. - len = Math.min(len, ((int) virt & 0xffff) - lastLen); - while (len > 0) { - final int r = bgzf.read(buf, off, len); - if (r == -1) { - return totalRead == 0 ? -1 : totalRead; + @Override + public int read(byte[] buf, int off, int len) throws IOException { + + int totalRead = 0; + long virt; + + final int lastLen = (int) virtEnd & 0xffff; + + while ((virt = bgzf.getFilePointer()) >>> 16 != virtEnd >>> 16) { + // We're not in the last BGZF block yet. Unfortunately + // BlockCompressedInputStream doesn't expose the length of the current + // block, so we can't simply (possibly repeatedly) read the current + // block to the end. Instead, we read at most virtEnd & 0xffff at a + // time, which ensures that we can't overshoot virtEnd even if the + // next block starts immediately. 
+ final int r = bgzf.read(buf, off, Math.min(len, lastLen)); + if (r == -1) { + return totalRead == 0 ? -1 : totalRead; + } + + totalRead += r; + len -= r; + if (len == 0) { + return totalRead; + } + off += r; } - totalRead += r; - len -= r; - off += r; + // We're in the last BGZF block: read only up to lastLen. + len = Math.min(len, ((int) virt & 0xffff) - lastLen); + while (len > 0) { + final int r = bgzf.read(buf, off, len); + if (r == -1) { + return totalRead == 0 ? -1 : totalRead; + } + + totalRead += r; + len -= r; + off += r; + } + return totalRead == 0 ? -1 : totalRead; } - return totalRead == 0 ? -1 : totalRead; } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java index 0ec2664..6bb4327 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java +++ b/src/main/java/org/seqdoop/hadoop_bam/BCFRecordWriter.java @@ -122,57 +122,57 @@ protected void writeRecord(VariantContext vc) { writer.add(vc); } -} - -// We must always call writer.writeHeader() because the writer requires -// the header in writer.add(), and writeHeader() is the only way to give -// the header to the writer. Thus, we use this class to simply throw away -// output until after the header's been written. -// -// This is, of course, a HACK and a slightly dangerous one: if writer -// does any buffering of its own and doesn't flush after writing the -// header, this isn't as easy as this. -// -// In addition we do BGZF compression here, to simplify things. -final class BCFStoppableOutputStream extends FilterOutputStream { - private final OutputStream origOut; - public boolean stopped; - - public BCFStoppableOutputStream(boolean startStopped, OutputStream out) { - super(new BlockCompressedOutputStream(out, null)); - origOut = out; - stopped = startStopped; - } + // We must always call writer.writeHeader() because the writer requires + // the header in writer.add(), and writeHeader() is the only way to give + // the header to the writer. Thus, we use this class to simply throw away + // output until after the header's been written. + // + // This is, of course, a HACK and a slightly dangerous one: if writer + // does any buffering of its own and doesn't flush after writing the + // header, this isn't as easy as this. + // + // In addition we do BGZF compression here, to simplify things. + static final class BCFStoppableOutputStream extends FilterOutputStream { + + private final OutputStream origOut; + public boolean stopped; + + public BCFStoppableOutputStream(boolean startStopped, OutputStream out) { + super(new BlockCompressedOutputStream(out, null)); + origOut = out; + stopped = startStopped; + } - @Override - public void write(int b) throws IOException { - if (!stopped) { - super.write(b); + @Override + public void write(int b) throws IOException { + if (!stopped) { + super.write(b); + } } - } - @Override - public void write(byte[] b) throws IOException { - if (!stopped) { - super.write(b); + @Override + public void write(byte[] b) throws IOException { + if (!stopped) { + super.write(b); + } } - } - @Override - public void write(byte[] b, int off, int len) throws IOException { - if (!stopped) { - super.write(b, off, len); + @Override + public void write(byte[] b, int off, int len) throws IOException { + if (!stopped) { + super.write(b, off, len); + } } - } - @Override - public void close() throws IOException { - // Don't close the BlockCompressedOutputStream, as we don't want - // the BGZF terminator. 
- this.out.flush(); + @Override + public void close() throws IOException { + // Don't close the BlockCompressedOutputStream, as we don't want + // the BGZF terminator. + this.out.flush(); - // Instead, close the lower-level output stream directly. - origOut.close(); + // Instead, close the lower-level output stream directly. + origOut.close(); + } } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java b/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java index 3a908b1..a1faf90 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java +++ b/src/main/java/org/seqdoop/hadoop_bam/LazyBAMRecordFactory.java @@ -65,89 +65,89 @@ public BAMRecord createBAMRecord( insertSize, variableLengthBlock); } -} - -class LazyBAMRecord extends BAMRecord { - - private boolean decodedRefIdx = false; - private boolean decodedMateRefIdx = false; - public LazyBAMRecord( - SAMFileHeader hdr, - int referenceID, - int coordinate, - short readNameLength, - short mappingQuality, - int indexingBin, - int cigarLen, - int flags, - int readLen, - int mateReferenceID, - int mateCoordinate, - int insertSize, - byte[] restOfData) { - super( - hdr, - referenceID, - coordinate, - readNameLength, - mappingQuality, - indexingBin, - cigarLen, - flags, - readLen, - mateReferenceID, - mateCoordinate, - insertSize, - restOfData); - } + static class LazyBAMRecord extends BAMRecord { + + private boolean decodedRefIdx = false; + private boolean decodedMateRefIdx = false; + + public LazyBAMRecord( + SAMFileHeader hdr, + int referenceID, + int coordinate, + short readNameLength, + short mappingQuality, + int indexingBin, + int cigarLen, + int flags, + int readLen, + int mateReferenceID, + int mateCoordinate, + int insertSize, + byte[] restOfData) { + super( + hdr, + referenceID, + coordinate, + readNameLength, + mappingQuality, + indexingBin, + cigarLen, + flags, + readLen, + mateReferenceID, + mateCoordinate, + insertSize, + restOfData); + } - @Override - public void setReferenceIndex(final int referenceIndex) { - mReferenceIndex = referenceIndex; - decodedRefIdx = false; - } + @Override + public void setReferenceIndex(final int referenceIndex) { + mReferenceIndex = referenceIndex; + decodedRefIdx = false; + } - @Override - public void setMateReferenceIndex(final int referenceIndex) { - mMateReferenceIndex = referenceIndex; - decodedMateRefIdx = false; - } + @Override + public void setMateReferenceIndex(final int referenceIndex) { + mMateReferenceIndex = referenceIndex; + decodedMateRefIdx = false; + } - @Override - public String getReferenceName() { - if (mReferenceIndex != null && !decodedRefIdx) { - decodedRefIdx = true; - super.setReferenceIndex(mReferenceIndex); + @Override + public String getReferenceName() { + if (mReferenceIndex != null && !decodedRefIdx) { + decodedRefIdx = true; + super.setReferenceIndex(mReferenceIndex); + } + return super.getReferenceName(); } - return super.getReferenceName(); - } - @Override - public String getMateReferenceName() { - if (mMateReferenceIndex != null && !decodedMateRefIdx) { - decodedMateRefIdx = true; - super.setMateReferenceIndex(mMateReferenceIndex); + @Override + public String getMateReferenceName() { + if (mMateReferenceIndex != null && !decodedMateRefIdx) { + decodedMateRefIdx = true; + super.setMateReferenceIndex(mMateReferenceIndex); + } + return super.getMateReferenceName(); } - return super.getMateReferenceName(); - } - @Override - protected void eagerDecode() { - getReferenceName(); - getMateReferenceName(); - 
super.eagerDecode(); - } + @Override + protected void eagerDecode() { + getReferenceName(); + getMateReferenceName(); + super.eagerDecode(); + } - @Override - public boolean equals(Object o) { - // don't use decoded flags for equality check - return super.equals(o); - } + @Override + public boolean equals(Object o) { + // don't use decoded flags for equality check + return super.equals(o); + } - @Override - public int hashCode() { - // don't use decoded flags for hash code - return super.hashCode(); + @Override + public int hashCode() { + // don't use decoded flags for hash code + return super.hashCode(); + } } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java b/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java index 3fca4a9..3922d78 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java +++ b/src/main/java/org/seqdoop/hadoop_bam/LazyVCFGenotypesContext.java @@ -101,34 +101,34 @@ public LazyGenotypesContext.LazyData parse(final Object data) { return codec.createGenotypeMap(str, alleles, chrom, start); } } -} -// This is a HACK. But, the functionality is only in AbstractVCFCodec so it -// can't be helped. This is preferable to copying the functionality into -// parse() above. -class HeaderSettableVCFCodec extends AbstractVCFCodec { + // This is a HACK. But, the functionality is only in AbstractVCFCodec so it + // can't be helped. This is preferable to copying the functionality into + // parse() above. + static class HeaderSettableVCFCodec extends AbstractVCFCodec { - public boolean hasHeader() { - return header != null; - } + public boolean hasHeader() { + return header != null; + } - public void setHeaderAndVersion(VCFHeader header, VCFHeaderVersion ver) { - this.header = header; - this.version = ver; - } + public void setHeaderAndVersion(VCFHeader header, VCFHeaderVersion ver) { + this.header = header; + this.version = ver; + } - @Override - public Object readActualHeader(LineIterator reader) { - throw new UnsupportedOperationException("Internal error: this shouldn't be called"); - } + @Override + public Object readActualHeader(LineIterator reader) { + throw new UnsupportedOperationException("Internal error: this shouldn't be called"); + } - @Override - public List parseFilters(String filterString) { - throw new UnsupportedOperationException("Internal error: this shouldn't be called"); - } + @Override + public List parseFilters(String filterString) { + throw new UnsupportedOperationException("Internal error: this shouldn't be called"); + } - @Override - public boolean canDecode(String s) { - return true; + @Override + public boolean canDecode(String s) { + return true; + } } } diff --git a/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java b/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java index d6159d4..0a52d08 100644 --- a/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java +++ b/src/main/java/org/seqdoop/hadoop_bam/SAMRecordReader.java @@ -191,172 +191,172 @@ public boolean nextKeyValue() { record.set(r); return true; } -} -// See the long comment in SAMRecordReader.initialize() for what this does. -class WorkaroundingStream extends InputStream { + // See the long comment in SAMRecordReader.initialize() for what this does. 
+ static class WorkaroundingStream extends InputStream { - private final InputStream stream, headerStream; - private boolean headerRemaining; - private long length; - private int headerLength; + private final InputStream stream, headerStream; + private boolean headerRemaining; + private long length; + private int headerLength; - private boolean lookingForEOL = false, - foundEOL = false, - strippingAts = false; // HACK, see read(byte[], int, int). - private byte[] readBuf = new byte[1]; + private boolean lookingForEOL = false, + foundEOL = false, + strippingAts = false; // HACK, see read(byte[], int, int). + private byte[] readBuf = new byte[1]; - public WorkaroundingStream(InputStream stream, SAMFileHeader header) { - this.stream = stream; + public WorkaroundingStream(InputStream stream, SAMFileHeader header) { + this.stream = stream; - String text = header.getTextHeader(); - if (text == null) { - StringWriter writer = new StringWriter(); - new SAMTextHeaderCodec().encode(writer, header); - text = writer.toString(); - } - byte[] b; - try { - b = text.getBytes("UTF-8"); - } catch (UnsupportedEncodingException e) { - b = null; - assert false; - } - headerRemaining = true; - headerLength = b.length; - headerStream = new ByteArrayInputStream(b); + String text = header.getTextHeader(); + if (text == null) { + StringWriter writer = new StringWriter(); + new SAMTextHeaderCodec().encode(writer, header); + text = writer.toString(); + } + byte[] b; + try { + b = text.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + b = null; + assert false; + } + headerRemaining = true; + headerLength = b.length; + headerStream = new ByteArrayInputStream(b); - this.length = Long.MAX_VALUE; - } + this.length = Long.MAX_VALUE; + } - public void setLength(long length) { - this.length = length; - } + public void setLength(long length) { + this.length = length; + } - public int getRemainingHeaderLength() { - return headerLength; - } + public int getRemainingHeaderLength() { + return headerLength; + } - @Override - public int read() throws IOException { - for (; ; ) { - switch (read(readBuf)) { - case 0: - continue; - case 1: - return readBuf[0]; - case -1: - return -1; + @Override + public int read() throws IOException { + for (; ; ) { + switch (read(readBuf)) { + case 0: + continue; + case 1: + return readBuf[0]; + case -1: + return -1; + } } } - } - @Override - public int read(byte[] buf, int off, int len) throws IOException { - if (!headerRemaining) { - return streamRead(buf, off, len); - } + @Override + public int read(byte[] buf, int off, int len) throws IOException { + if (!headerRemaining) { + return streamRead(buf, off, len); + } - int h; - if (strippingAts) { - h = 0; - } else { - h = headerStream.read(buf, off, len); - if (h == -1) { - // This should only happen when there was no header at all, in - // which case Picard doesn't throw an error until trying to read - // a record, for some reason. (Perhaps an oversight.) Thus we - // need to handle that case here. - assert (headerLength == 0); + int h; + if (strippingAts) { h = 0; - } else if (h < headerLength) { - headerLength -= h; - return h; + } else { + h = headerStream.read(buf, off, len); + if (h == -1) { + // This should only happen when there was no header at all, in + // which case Picard doesn't throw an error until trying to read + // a record, for some reason. (Perhaps an oversight.) Thus we + // need to handle that case here. 
+ assert (headerLength == 0); + h = 0; + } else if (h < headerLength) { + headerLength -= h; + return h; + } + strippingAts = true; + headerStream.close(); } - strippingAts = true; - headerStream.close(); - } - final int newOff = off + h; - int s = streamRead(buf, newOff, len - h); + final int newOff = off + h; + int s = streamRead(buf, newOff, len - h); - if (s <= 0) { - return strippingAts ? s : h; - } + if (s <= 0) { + return strippingAts ? s : h; + } - // HACK HACK HACK. - // - // We gave all of the header, which means that SAMFileReader is still - // trying to read more header lines. If we're in a split that isn't at - // the start of the SAM file, we could be in the middle of a line and - // thus see @ characters at the start of our data. Then SAMFileReader - // would try to understand those as header lines and the end result is - // that it throws an error, since they aren't actually header lines, - // they're just part of a SAM record. - // - // So, if we're done with the header, strip all @ characters we see. Thus - // SAMFileReader will stop reading the header there and won't throw an - // exception until we use its SAMRecordIterator, at which point we can - // catch it, because we know to expect it. - // - // headerRemaining remains true while it's possible that there are still - // @ characters coming. + // HACK HACK HACK. + // + // We gave all of the header, which means that SAMFileReader is still + // trying to read more header lines. If we're in a split that isn't at + // the start of the SAM file, we could be in the middle of a line and + // thus see @ characters at the start of our data. Then SAMFileReader + // would try to understand those as header lines and the end result is + // that it throws an error, since they aren't actually header lines, + // they're just part of a SAM record. + // + // So, if we're done with the header, strip all @ characters we see. Thus + // SAMFileReader will stop reading the header there and won't throw an + // exception until we use its SAMRecordIterator, at which point we can + // catch it, because we know to expect it. + // + // headerRemaining remains true while it's possible that there are still + // @ characters coming. + + int i = newOff - 1; + while (buf[++i] == '@' && --s > 0) {; + } - int i = newOff - 1; - while (buf[++i] == '@' && --s > 0) {; - } + if (i != newOff) { + System.arraycopy(buf, i, buf, newOff, s); + } - if (i != newOff) { - System.arraycopy(buf, i, buf, newOff, s); + headerRemaining = s == 0; + return h + s; } - headerRemaining = s == 0; - return h + s; - } - - private int streamRead(byte[] buf, int off, int len) throws IOException { - if (len > length) { - if (foundEOL) { - return 0; + private int streamRead(byte[] buf, int off, int len) throws IOException { + if (len > length) { + if (foundEOL) { + return 0; + } + lookingForEOL = true; } - lookingForEOL = true; - } - int n = stream.read(buf, off, len); - if (n > 0) { - n = tryFindEOL(buf, off, n); - length -= n; + int n = stream.read(buf, off, len); + if (n > 0) { + n = tryFindEOL(buf, off, n); + length -= n; + } + return n; } - return n; - } - private int tryFindEOL(byte[] buf, int off, int len) { - assert !foundEOL; + private int tryFindEOL(byte[] buf, int off, int len) { + assert !foundEOL; - if (!lookingForEOL || len < length) { - return len; - } + if (!lookingForEOL || len < length) { + return len; + } - // Find the first EOL between length and len. + // Find the first EOL between length and len. - // len >= length so length fits in an int. 
- int i = Math.max(0, (int) length - 1); + // len >= length so length fits in an int. + int i = Math.max(0, (int) length - 1); - for (; i < len; ++i) { - if (buf[off + i] == '\n') { - foundEOL = true; - return i + 1; + for (; i < len; ++i) { + if (buf[off + i] == '\n') { + foundEOL = true; + return i + 1; + } } + return len; } - return len; - } - @Override - public void close() throws IOException { - stream.close(); - } + @Override + public void close() throws IOException { + stream.close(); + } - @Override - public int available() throws IOException { - return headerRemaining ? headerStream.available() : stream.available(); + @Override + public int available() throws IOException { + return headerRemaining ? headerStream.available() : stream.available(); + } } }
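
Editor's note, a quick illustration of the pattern behind the BCFStoppableOutputStream and (in spirit) WorkaroundingStream classes reformatted above: the wrapped htsjdk writer must be handed the header it requires, but those bytes can be kept away from the real output until record output should begin. The sketch below is a minimal, self-contained version of that idea using only JDK classes; the class and method names are invented for illustration, it is not part of Hadoop-BAM, and it deliberately omits the BlockCompressedOutputStream (BGZF) wrapping that the real BCFStoppableOutputStream adds.

import java.io.ByteArrayOutputStream;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

/**
 * Illustrative sketch only, not a Hadoop-BAM class. Writes are silently
 * dropped while 'stopped' is true, so a wrapped writer can be given its
 * mandatory header without those bytes reaching the real output.
 */
class DiscardUntilStartedOutputStream extends FilterOutputStream {

  private boolean stopped;

  DiscardUntilStartedOutputStream(OutputStream out, boolean startStopped) {
    super(out);
    this.stopped = startStopped;
  }

  /** Once called with false, subsequent writes pass through unchanged. */
  void setStopped(boolean stopped) {
    this.stopped = stopped;
  }

  @Override
  public void write(int b) throws IOException {
    if (!stopped) {
      out.write(b);
    }
  }

  @Override
  public void write(byte[] b, int off, int len) throws IOException {
    if (!stopped) {
      out.write(b, off, len);
    }
  }

  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    DiscardUntilStartedOutputStream s =
        new DiscardUntilStartedOutputStream(sink, true);

    // The "header" is swallowed while the stream is stopped...
    s.write("#HEADER\n".getBytes(StandardCharsets.UTF_8));
    s.setStopped(false);
    // ...and subsequent "records" reach the underlying sink.
    s.write("record 1\n".getBytes(StandardCharsets.UTF_8));
    s.flush();

    System.out.print(sink); // prints only "record 1"
  }
}

Used this way, a writer can be given its mandatory header while the stream is stopped, and only what is written after setStopped(false) reaches the underlying sink. That mirrors the comment preserved in the diff above: the header must be passed to the writer, but its bytes are thrown away when they should not appear in this particular output.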