Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/.stack-work/
35 changes: 18 additions & 17 deletions app/Main.hs
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
module Main where
module Main (main) where

import System.IO (IOMode(ReadMode), Handle, openFile, hGetChar, hGetLine, hIsEOF)
import Data.Map.Strict (Map, empty, insertWith)
import Data.Vector.Unboxed (Vector(..))
import Data.Vector.Unboxed (Vector)
import qualified Data.Vector.Unboxed as Vec
import Data.Word (Word16(..))
import Data.Word (Word16)
import System.IO (Handle, IOMode (ReadMode), hGetChar,
hGetLine, hIsEOF, openFile)

-- | One symbol of a genome sequence.
--
-- The derived 'Enum' instance numbers the constructors in declaration order
-- (A = 0, C = 1, T = 2, G = 3); 'codeWord' uses those numbers as base-4 digits.
data Nucleotide = A | C | T | G deriving (Enum, Eq, Ord, Show)

-- | A genome "word": a list of nucleotides. This is a plain alias, so the
-- length is not enforced by the type.
type GWord = [Nucleotide]

-- | Encode a word as a base-4 number.
--
-- The nucleotide at position @i@ (counting from 0) contributes
-- @fromEnum nuc * 4 ^ i@; the contributions are summed and converted to
-- 'Word16' with 'fromIntegral'.
--
-- NOTE(review): 'fromIntegral' into 'Word16' does not check range, so a word
-- longer than 8 nucleotides could exceed 65535 and wrap — confirm callers
-- only pass short words.
--
-- NOTE(review): two bodies follow (a one-line form and a reformatted form
-- with the index annotated as 'Int'); both compute the same sum.
codeWord :: GWord -> Word16
codeWord gWord = fromIntegral (sum [4^i * (fromEnum nuc) | (i, nuc) <- zip [0..] gWord])
codeWord gWord =
fromIntegral $
sum [4 ^ i * fromEnum nuc | (i, nuc) <- zip [0 :: Int ..] gWord]

-- | Decode a 'Word16' into a word of exactly six nucleotides.
--
-- For each @i@ in @0..5@ the base-4 digit @(word `div` 4 ^ i) `mod` 4@ is
-- taken and mapped to a 'Nucleotide' with 'toEnum', i.e. digits are read in
-- the same least-significant-first order that 'codeWord' writes them.
--
-- NOTE(review): a commented-out, reformatted copy of this definition follows
-- the live one.
decodeWord :: Word16 -> GWord
decodeWord word = [toEnum (fromIntegral((div word (4^i)) `mod` 4)) | i <- [0..5]]
-- decodeWord :: Word16 -> GWord
-- decodeWord word =
-- [toEnum . fromIntegral $ word `div` 4 ^ i `mod` 4 | i <- [0 .. 5 :: Int]]

readNucleotide :: Char -> Maybe Nucleotide
readNucleotide symbol =
Expand All @@ -35,10 +38,9 @@ readWord nuclCount file = do
case readNucleotide symbol of
Just nucleotide -> do
gWord <- readWord (nuclCount - 1) file
pure ([nucleotide] ++ gWord)
Nothing -> do
gWord <- readWord nuclCount file
pure gWord
pure $ nucleotide : gWord
Nothing ->
readWord nuclCount file
else
pure []

Expand All @@ -47,8 +49,7 @@ countWords filepath = do
genomeHandle <- openFile filepath ReadMode
_ <- hGetLine genomeHandle
firstWord <- readWord 6 genomeHandle
gWords <- countWord firstWord (Vec.replicate 4096 0) genomeHandle
pure gWords
countWord firstWord (Vec.replicate 4096 0) genomeHandle

countWord :: GWord -> Vector Word16 -> Handle -> IO (Vector Word16)
countWord gWord vector handle = do
Expand All @@ -57,7 +58,7 @@ countWord gWord vector handle = do
symbol <- hGetChar handle
case readNucleotide symbol of
Just nucleotide -> do
let newWord = (tail gWord) ++ [nucleotide]
let newWord = tail gWord ++ [nucleotide]
countWord
newWord
(Vec.accum (+) vector [(fromIntegral $ codeWord gWord, 1)])
Expand All @@ -69,5 +70,5 @@ countWord gWord vector handle = do

-- | Program entry point: runs 'countWords' on the hard-coded path
-- @"genome.fna"@ and prints the resulting value with 'print'.
--
-- NOTE(review): the bind/print pair appears twice, differing only in the
-- local name (@words@ vs @wordsCount@); @words@ shadows the Prelude function
-- of the same name.
main :: IO ()
main = do
words <- countWords "genome.fna"
print words
wordsCount <- countWords "genome.fna"
print wordsCount