Skip to content

Commit 492874b

Browse files
committed
BUG Fix parsing of GFF files
It was assumed that the score field was always non-negative. Reported by Josh Sekela on the mailing-list: https://groups.google.com/g/ngless/c/kf6y2MWBfec/m/2DicyAH3DwAJ
1 parent 89c4b0d commit 492874b

File tree

5 files changed

+45
-16
lines changed

5 files changed

+45
-16
lines changed

ChangeLog

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
Version 1.4.2 2022-07-21 by luispedro
2+
* Fix bug with parsing GFF files
3+
14
Version 1.4.1 2022-06-03 by luispedro
25
* Fix bug with split mapping
36
* Fix packaging for hackage

NGLess/Data/GFF.hs

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{- Copyright 2013-2019 NGLess Authors
1+
{- Copyright 2013-2022 NGLess Authors
22
- License: MIT -}
33
{-# LANGUAGE CPP #-}
44

@@ -81,31 +81,32 @@ readGffLine :: B.ByteString -> Either NGError GffLine
8181
readGffLine line = case B8.split '\t' line of
8282
[tk0,tk1,tk2,tk3,tk4,tk5,tk6,tk7,tk8] ->
8383
GffLine
84-
tk0
85-
tk1
86-
tk2
87-
<$> intOrError tk3
88-
<*> intOrError tk4
89-
<*> score tk5
84+
tk0 -- seq id
85+
tk1 -- source
86+
tk2 -- type
87+
<$> intOrError "reading start" tk3 -- start
88+
<*> intOrError "reading end" tk4 -- end
89+
<*> score tk5 -- score
9090
<*> strandOrError tk6
9191
<*> phase tk7
9292
<*> pure (_parseGffAttributes tk8)
9393
_ -> throwDataError ("unexpected line in GFF: " ++ show line)
9494
where
95-
parseOrError :: (a -> Maybe b) -> a -> NGLess b
96-
parseOrError p s = case p s of
95+
parseOrError :: String -> (B.ByteString -> Maybe b) -> B.ByteString -> NGLess b
96+
parseOrError context p s = case p s of
9797
Just v -> return v
98-
Nothing -> throwDataError $ "Could not parse GFF line: "++ show line
99-
intOrError :: B.ByteString -> NGLess Int
100-
intOrError = parseOrError (liftM fst . I.readDecimal)
101-
floatOrError = parseOrError (liftM fst . F.readDecimal)
98+
Nothing -> throwDataError $ "Could not parse GFF line (" ++ context ++ ", while parsing '" ++ B8.unpack s ++ "'): "++ show line
99+
intOrError :: String -> B.ByteString -> NGLess Int
100+
intOrError c = parseOrError c (liftM fst . I.readDecimal)
101+
floatOrError c = parseOrError c (liftM fst . (F.readSigned F.readDecimal))
102+
102103
score :: B.ByteString -> NGLess (Maybe Float)
103104
score "." = return Nothing
104-
score v = Just <$> floatOrError v
105+
score v = Just <$> floatOrError "reading score" v
105106

106107
phase :: B.ByteString -> NGLess Int
107108
phase "." = return (-1)
108-
phase r = intOrError r
109+
phase r = intOrError "reading phase" r
109110
strandOrError :: B.ByteString -> NGLess GffStrand
110111
strandOrError s = case B8.uncons s of
111112
Just (s',_) -> parseStrand s'

NGLess/Interpretation/Count.hs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -792,10 +792,23 @@ loadGFF gffFp opts = do
792792
outputListLno' TraceOutput ["Loading GFF file '", gffFp, "'..."]
793793
numCapabilities <- liftIO getNumCapabilities
794794
let mapthreads = max 1 (numCapabilities - 1)
795+
annotateErrorReader :: (Int, V.Vector ByteLine) -> NGLess (V.Vector GffLine)
796+
annotateErrorReader (ch_ix, ells) =
797+
case V.mapM (readGffLine. unwrapByteLine) . V.filter (not . isComment) $ ells of
798+
r@Right{} -> r
799+
_ -> do
800+
forM_ (zip [0..] $ V.toList ells) $ \(i, ell) ->
801+
if isComment ell
802+
then return ()
803+
else case readGffLine (unwrapByteLine ell) of
804+
Right{} -> return ()
805+
Left (NGError errtype errmsg) -> Left (NGError errtype (errmsg ++ " (Line " ++ show (8192 * ch_ix + i + 1) ++ ")"))
806+
throwShouldNotOccur "annotateErrorReader: this should never happen"
795807
partials <- C.runConduit $
796808
conduitPossiblyCompressedFile gffFp
797809
.| linesVC 8192
798-
.| CAlg.asyncMapEitherC mapthreads (V.mapM (readGffLine . unwrapByteLine) . V.filter (not . isComment))
810+
.| CAlg.enumerateC
811+
.| CAlg.asyncMapEitherC mapthreads annotateErrorReader
799812
.| sequenceSinks
800813
[CL.foldM (insertgV f sf) (GffLoadingState M.empty M.empty)
801814
| f <- optFeatures opts

docs/sources/whatsnew.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@
22
What's New (History)
33
====================
44

5+
Version 1.4.2
6+
-------------
7+
8+
Released *21 July 2022*
9+
10+
Bugfixes
11+
~~~~~~~~
12+
13+
- Fix bug with parsing GFF files (it was assumed that *scores* were always non-negative)
14+
515
Version 1.4.1
616
-------------
717

test_samples/short.gtf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1885,3 +1885,5 @@ X protein_coding CDS 12148155 12148247 . + 0 gene_id "WBGene00010924"; transcrip
18851885
X protein_coding stop_codon 12148248 12148250 . + 0 gene_id "WBGene00010924"; transcript_id "M153.1"; exon_number "5"; gene_name "M153.1"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "M153.1"; transcript_source "ensembl";
18861886
X protein_coding UTR 12146817 12146844 . + . gene_id "WBGene00010924"; transcript_id "M153.1"; gene_name "M153.1"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "M153.1"; transcript_source "ensembl";
18871887
X protein_coding UTR 12148251 12148471 . + . gene_id "WBGene00010924"; transcript_id "M153.1"; gene_name "M153.1"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "M153.1"; transcript_source "ensembl";
1888+
k141_708 Prodigal_v2.6.3 CDS 1573 2394 46.2 - 0 ID=702_5;partial=00;start_type=ATG;rbs_motif=GGAG/GAGG;rbs_spacer=5-10bp;gc_cont=0.482;conf=100.00;score=45.55;cscore=32.79;sscore=12.76;rscore=2.32;uscore=7.11;tscore=3.98;
1889+
k141_708 Prodigal_v2.6.3 CDS 1451 1576 -0.7 - 0 ID=702_4;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.444;conf=55.61;score=0.98;cscore=-3.53;sscore=4.51;rscore=-0.37;uscore=1.76;tscore=1.96;

0 commit comments

Comments
 (0)