Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 11 additions & 14 deletions src/crowsetta/formats/bbox/audbbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import attr
import pandas as pd
import pandera
import pandera.pandas
from pandera.typing import Series

import crowsetta
Expand Down Expand Up @@ -97,21 +97,18 @@ def df_to_lines(df: pd.DataFrame) -> list[str]:
return lines


class AudBBoxSchema(pandera.DataFrameModel):
"""A :class:`pandera.DataFrameModel

` that
validates :mod:`pandas` dataframes
loaded from Audacity label tracks
in extended format, exported to txt files
https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Extended_format_with_frequency_ranges
class AudBBoxSchema(pandera.pandas.DataFrameModel):
"""A :class:`pandera.pandas.DataFrameModel` that
validates :mod:`pandas` dataframes loaded from Audacity label tracks
in extended format then exported to txt files
https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Extended_format_with_frequency_ranges
"""

begin_time_s: Series[float] = pandera.Field(coerce=True)
end_time_s: Series[float] = pandera.Field(coerce=True)
label: Series[pd.StringDtype] = pandera.Field(coerce=True)
low_freq_hz: Series[float] = pandera.Field(coerce=True)
high_freq_hz: Series[float] = pandera.Field(coerce=True)
begin_time_s: Series[float] = pandera.pandas.Field(coerce=True)
end_time_s: Series[float] = pandera.pandas.Field(coerce=True)
label: Series[pd.StringDtype] = pandera.pandas.Field(coerce=True)
low_freq_hz: Series[float] = pandera.pandas.Field(coerce=True)
high_freq_hz: Series[float] = pandera.pandas.Field(coerce=True)

class Config:
ordered = True
Expand Down
21 changes: 9 additions & 12 deletions src/crowsetta/formats/bbox/raven.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,23 @@

import attr
import pandas as pd
import pandera
import pandera.pandas
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike


class RavenSchema(pandera.DataFrameModel):
"""A :class:`pandera.DataFrameModel

` that validates :type:`pandas.DataFrame`s
loaded from a txt file, created by exporting a Selection Table
from Raven.
class RavenSchema(pandera.pandas.DataFrameModel):
"""A :class:`pandera.pandas.DataFrameModel` that validates a :type:`pandas.DataFrame`
loaded from a txt file, created by exporting a Selection Table from Raven.
"""

begin_time_s: Series[float] = pandera.Field()
end_time_s: Series[float] = pandera.Field()
low_freq_hz: Series[float] = pandera.Field()
high_freq_hz: Series[float] = pandera.Field()
annotation: Series[pd.StringDtype] = pandera.Field(coerce=True)
begin_time_s: Series[float] = pandera.pandas.Field()
end_time_s: Series[float] = pandera.pandas.Field()
low_freq_hz: Series[float] = pandera.pandas.Field()
high_freq_hz: Series[float] = pandera.pandas.Field()
annotation: Series[pd.StringDtype] = pandera.pandas.Field(coerce=True)

class Config:
# we set strict to False
Expand Down
24 changes: 10 additions & 14 deletions src/crowsetta/formats/seq/audseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,24 @@
import attr
import numpy as np
import pandas as pd
import pandera
import pandera.pandas
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike


class AudSeqSchema(pandera.DataFrameModel):
"""A :class:`pandera.DataFrameModel
class AudSeqSchema(pandera.pandas.DataFrameModel):
"""A :class:`pandera.pandas.DataFrameModel` that validates a :type:`pandas.DataFrame`
loaded from Audacity Labeltrack annotations exported to txt files in the standard format.

`
that validates :type:`pandas.DataFrame`s
loaded from Audacity Labeltrack annotations
exported to txt files in the standard format.

The standard format is described here:
https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Standard_.28default.29_format
The standard format is described here:
https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Standard_.28default.29_format
"""

start_time: Optional[Series[float]] = pandera.Field()
end_time: Optional[Series[float]] = pandera.Field()
label: Series[pd.StringDtype] = pandera.Field(coerce=True)
start_time: Optional[Series[float]] = pandera.pandas.Field()
end_time: Optional[Series[float]] = pandera.pandas.Field()
label: Series[pd.StringDtype] = pandera.pandas.Field(coerce=True)

class Config:
ordered = True
Expand All @@ -42,7 +38,7 @@ class Config:
class AudSeq:
"""Class meant to represent
Audacity Labeltrack annotations
exported to txt files in the standard format[1]_.
exported to txt files in the standard format [1]_.

The txt file will have 3 tab-separated columns
that represent the start time, end time, and labels
Expand Down
39 changes: 18 additions & 21 deletions src/crowsetta/formats/seq/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

import attr
import pandas as pd
import pandera
import pandera.pandas
from pandera.typing import Series

import crowsetta
Expand All @@ -36,36 +36,33 @@
"""


class GenericSeqSchema(pandera.DataFrameModel):
"""A :class: `pandera.DataFrameModel

` that validates
:type:`pandas.DataFrame`s
loaded from a csv file in the ``'generic-seq'`` annotation
format.
class GenericSeqSchema(pandera.pandas.DataFrameModel):
"""A :class:`pandera.pandas.DataFrameModel` that validates
a :type:`pandas.DataFrame` loaded from a csv file
in the ``'generic-seq'`` annotation format.
"""

label: Series[pd.StringDtype] = pandera.Field(coerce=True)
onset_s: Optional[Series[float]] = pandera.Field()
offset_s: Optional[Series[float]] = pandera.Field()
onset_sample: Optional[Series[int]] = pandera.Field()
offset_sample: Optional[Series[int]] = pandera.Field()
label: Series[pd.StringDtype] = pandera.pandas.Field(coerce=True)
onset_s: Optional[Series[float]] = pandera.pandas.Field()
offset_s: Optional[Series[float]] = pandera.pandas.Field()
onset_sample: Optional[Series[int]] = pandera.pandas.Field()
offset_sample: Optional[Series[int]] = pandera.pandas.Field()

notated_path: Series[str] = pandera.Field(coerce=True)
annot_path: Series[str] = pandera.Field(coerce=True)
sequence: Series[int] = pandera.Field()
annotation: Series[int] = pandera.Field()
notated_path: Series[str] = pandera.pandas.Field(coerce=True)
annot_path: Series[str] = pandera.pandas.Field(coerce=True)
sequence: Series[int] = pandera.pandas.Field()
annotation: Series[int] = pandera.pandas.Field()

@pandera.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
@pandera.pandas.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
def both_onset_s_and_offset_s_if_either(cls, df: pd.DataFrame) -> bool:
    """Validate that the 'onset_s' and 'offset_s' columns appear together.

    Returns ``True`` when ``df`` contains both columns or neither of them,
    and ``False`` when only one of the two is present.
    """
    present = [name in df for name in ("onset_s", "offset_s")]
    # Valid when both columns are present, or when neither is.
    return all(present) or not any(present)

@pandera.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
@pandera.pandas.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
def both_onset_sample_and_offset_sample_if_either(cls, df: pd.DataFrame) -> bool:
"""check that, if one of {'onset_sample', 'offset_sample'} column is present,
then both are present"""
Expand All @@ -74,7 +71,7 @@ def both_onset_sample_and_offset_sample_if_either(cls, df: pd.DataFrame) -> bool
else:
return True

@pandera.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
@pandera.pandas.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
def onset_offset_s_and_ind_are_not_both_missing(cls, df: pd.DataFrame) -> bool:
"""check that at least one of the on/offset column pairs is present:
either {'onset_s', 'offset_s'} or {'onset_sample', 'offset_sample'}"""
Expand Down
14 changes: 7 additions & 7 deletions src/crowsetta/formats/seq/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,26 +24,26 @@
import attr
import numpy as np
import pandas as pd
import pandera
import pandera.pandas
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike


class SimpleSeqSchema(pandera.DataFrameModel):
"""A :class:`pandera.DataFrameModel`
that validates :type:`pandas.DataFrame`s
class SimpleSeqSchema(pandera.pandas.DataFrameModel):
"""A :class:`pandera.pandas.DataFrameModel`
that validates a :type:`pandas.DataFrame`
loaded from a csv or txt file in a 'simple-seq' format.

The :meth:`SimpleSeq.from_file` loads the :type:`pandas.DataFrame`
and makes any changes needed to get it to this format
before validation, e.g., changing column names.
"""

onset_s: Optional[Series[float]] = pandera.Field()
offset_s: Optional[Series[float]] = pandera.Field()
label: Series[pd.StringDtype] = pandera.Field(coerce=True)
onset_s: Optional[Series[float]] = pandera.pandas.Field()
offset_s: Optional[Series[float]] = pandera.pandas.Field()
label: Series[pd.StringDtype] = pandera.pandas.Field(coerce=True)

class Config:
ordered = True
Expand Down
24 changes: 11 additions & 13 deletions src/crowsetta/formats/seq/timit.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,27 @@
import attr
import numpy as np
import pandas as pd
import pandera
import pandera.pandas
import soundfile
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike


class TimitTranscriptSchema(pandera.DataFrameModel):
"""A :class:`pandera.DataFrameModel
class TimitTranscriptSchema(pandera.pandas.DataFrameModel):
"""A :class:`pandera.pandas.DataFrameModel` that validates a :type:`pandas.DataFrame`
loaded from a phn or wrd file in the TIMIT [1]_ transcription format.

` that validates :type:`pandas.DataFrame`s
loaded from a phn or wrd file in the TIMIT[1]_ transcription format.

References
----------
.. [1] Garofolo, John S., et al. TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1.
Web Download. Philadelphia: Linguistic Data Consortium, 1993.
References
----------
.. [1] Garofolo, John S., et al. TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1.
Web Download. Philadelphia: Linguistic Data Consortium, 1993.
"""

begin_sample: Optional[Series[int]] = pandera.Field()
end_sample: Optional[Series[int]] = pandera.Field()
text: Series[pd.StringDtype] = pandera.Field(coerce=True)
begin_sample: Optional[Series[int]] = pandera.pandas.Field()
end_sample: Optional[Series[int]] = pandera.pandas.Field()
text: Series[pd.StringDtype] = pandera.pandas.Field(coerce=True)

class Config:
ordered = True
Expand Down
1 change: 0 additions & 1 deletion tests/test_formats/test_seq/test_simple.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import filecmp
import inspect
import tempfile

import numpy as np
import pandas as pd
Expand Down