Skip to content

Commit

Permalink
Add in Bionitio foundation
Browse files Browse the repository at this point in the history
  • Loading branch information
milnus committed Oct 11, 2021
1 parent 2469b09 commit a81b6dd
Show file tree
Hide file tree
Showing 27 changed files with 644 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions .idea/phupa.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Base image: the python:3.7.3-stretch tag.
FROM python:3.7.3-stretch
# Subsequent instructions run relative to /Magphi inside the image.
WORKDIR /Magphi
# Copy the whole build context into the working directory.
COPY . .

# Install the python package (and executable)
RUN pip3 install .

# Override some of the dependencies with the hard-coded versions
# (this runs after the package install, so the versions listed in
# requirements-dev.txt are the ones that end up installed).
RUN pip3 install -r requirements-dev.txt
7 changes: 7 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Copyright 11 Oct 2021 Magnus Ganer Jespersen

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 changes: 26 additions & 0 deletions Magphi.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env cwl-runner
# CWL (v1.0) CommandLineTool description wrapping the Magphi executable.

# The command that is run; the inputs below are appended to it.
baseCommand: [Magphi]
class: CommandLineTool
cwlVersion: v1.0
id: Magphi
inputs:
# Positional FASTA input.
# NOTE(review): declared as a single File although the doc text says
# "files" -- confirm whether an array of files was intended.
- doc: Input FASTA files
  id: fasta_file
  inputBinding: {position: 0}
  type: File
# Optional integer passed on the command line as --minlen.
- doc: Minimum length sequence to include in stats (default 0)
  id: min_len
  inputBinding: {prefix: --minlen}
  type: long?
# Optional log file name passed on the command line as --log.
- doc: record program progress in LOG_FILE
  id: log
  inputBinding: {prefix: --log}
  type: string?
outputs:
# The tool's standard output is captured as the result.
- doc: Stats file
  id: stats
  type: stdout
requirements:
# Run the tool inside the "magphi" Docker image.
- class: DockerRequirement
  dockerPull: magphi
1 change: 1 addition & 0 deletions Magphi/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
236 changes: 236 additions & 0 deletions Magphi/Magphi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
'''
Module : Main
Description : The main entry point for the program.
Copyright : (c) Magnus Ganer Jespersen, 11 Oct 2021
License : MIT
Maintainer : magnus.ganer.j@gmail.com
Portability : POSIX
The program reads one or more input FASTA files. For each file it computes a
variety of statistics, and then prints a summary of the statistics as output.
'''

from argparse import ArgumentParser
from math import floor
import sys
import logging
import pkg_resources
from Bio import SeqIO


# Process exit statuses. EXIT_FILE_IO_ERROR is used when an input file cannot
# be opened; the other two are defined here but not referenced in the code
# shown in this file.
EXIT_FILE_IO_ERROR = 1
EXIT_COMMAND_LINE_ERROR = 2
EXIT_FASTA_FILE_ERROR = 3
# Default for --minlen: with 0 no sequence is filtered out by length.
DEFAULT_MIN_LEN = 0
# NOTE(review): not referenced in the code shown in this file.
DEFAULT_VERBOSE = False
# Tab-separated column titles printed once before the per-file rows.
HEADER = 'FILENAME\tNUMSEQ\tTOTAL\tMIN\tAVG\tMAX'
# Name used in error messages and to look up the installed distribution.
PROGRAM_NAME = "Magphi"


# Ask pkg_resources for the version of the installed distribution; when the
# distribution is not found, fall back to a placeholder string so that the
# module can still be imported and run.
try:
    PROGRAM_VERSION = pkg_resources.require(PROGRAM_NAME)[0].version
except pkg_resources.DistributionNotFound:
    PROGRAM_VERSION = "undefined_version"


def exit_with_error(message, exit_status):
    '''Report a fatal error and terminate the program.

    The message is recorded through the logging module at ERROR level and is
    also written to stderr, prefixed by the program name and 'ERROR'.

    Arguments:
        message: the error message, as a string.
        exit_status: the exit status handed to sys.exit.
    Result:
        Does not return; sys.exit raises SystemExit.
    '''
    logging.error(message)
    stderr_text = "{} ERROR: {}, exiting".format(PROGRAM_NAME, message)
    print(stderr_text, file=sys.stderr)
    sys.exit(exit_status)


def parse_args():
    '''Build the command line parser and parse the program's arguments.

    Recognised arguments:
        --minlen N       minimum sequence length to include in the stats
        --version        print the program version and exit
        --log LOG_FILE   record program progress in LOG_FILE
        FASTA_FILE ...   zero or more input FASTA files

    Result:
        The object returned by ArgumentParser.parse_args, with attributes
        minlen, log and fasta_files. argparse itself exits the program on
        a command line error.
    '''
    minlen_help = 'Minimum length sequence to include in stats (default {})'.format(
        DEFAULT_MIN_LEN)
    version_text = '%(prog)s ' + PROGRAM_VERSION

    parser = ArgumentParser(
        description='Read one or more FASTA files, compute simple stats for each file')
    parser.add_argument('--minlen', metavar='N', type=int,
                        default=DEFAULT_MIN_LEN, help=minlen_help)
    parser.add_argument('--version', action='version', version=version_text)
    parser.add_argument('--log', metavar='LOG_FILE', type=str,
                        help='record program progress in LOG_FILE')
    parser.add_argument('fasta_files', nargs='*', metavar='FASTA_FILE',
                        type=str, help='Input FASTA files')
    return parser.parse_args()


class FastaStats(object):
    '''Summary statistics for the sequences of one FASTA file.

    Attributes:
        num_seqs: the number of sequences in the file satisfying the minimum
            length requirement (minlen_threshold).
        num_bases: the total length of all the counted sequences.
        min_len: the minimum length of the counted sequences.
        max_len: the maximum length of the counted sequences.
        average: the average length of the counted sequences rounded down
            to an integer.

    All attributes are None on an object built with no arguments. Because
    the class defines __eq__ without __hash__, instances are unhashable.
    '''
    #pylint: disable=too-many-arguments
    def __init__(self,
                 num_seqs=None,
                 num_bases=None,
                 min_len=None,
                 max_len=None,
                 average=None):
        "Build a FastaStats object; every statistic defaults to None"
        self.num_seqs = num_seqs
        self.num_bases = num_bases
        self.min_len = min_len
        self.max_len = max_len
        self.average = average

    def __eq__(self, other):
        "Two FastaStats objects are equal iff their attributes are equal"
        if type(other) is type(self):
            return self.__dict__ == other.__dict__
        return False

    def __repr__(self):
        "Generate a printable representation of a FastaStats object"
        return ("FastaStats(num_seqs={}, num_bases={}, min_len={}, "
                "max_len={}, average={})").format(
                    self.num_seqs, self.num_bases, self.min_len,
                    self.max_len, self.average)

    def from_file(self, fasta_file, minlen_threshold=DEFAULT_MIN_LEN):
        '''Compute the statistics of an input FASTA file and store them on
        this object.

        Arguments:
            fasta_file: an open file object for the FASTA file
            minlen_threshold: the minimum length sequence to consider in
                computing the statistics. Sequences in the input FASTA file
                which have a length less than this value are ignored and not
                considered in the resulting statistics.
        Result:
            This FastaStats object (self), with its attributes overwritten.
            When no sequence is counted, num_seqs and num_bases are 0 and
            min_len, max_len and average are None.
        '''
        num_seqs = num_bases = 0
        min_len = max_len = None
        for seq in SeqIO.parse(fasta_file, "fasta"):
            this_len = len(seq)
            if this_len < minlen_threshold:
                continue
            if num_seqs == 0:
                # First counted sequence defines both extremes.
                min_len = max_len = this_len
            else:
                min_len = min(this_len, min_len)
                max_len = max(this_len, max_len)
            num_seqs += 1
            num_bases += this_len
        # Integer floor division is exact for any size of total, whereas
        # going through float loses precision once num_bases exceeds 2**53.
        self.average = num_bases // num_seqs if num_seqs > 0 else None
        self.num_seqs = num_seqs
        self.num_bases = num_bases
        self.min_len = min_len
        self.max_len = max_len
        return self

    def pretty(self, filename):
        '''Generate a pretty printable representation of a FastaStats object
        suitable for output of the program. The output is a tab-delimited
        string containing the filename of the input FASTA file followed by
        the attributes of the object. If 0 sequences were read from the FASTA
        file (or no statistics have been computed yet, so num_seqs is None)
        then num_seqs and num_bases are output as 0, and min_len, average
        and max_len are output as a dash "-".

        Arguments:
            filename: the name of the input FASTA file
        Result:
            A string suitable for pretty printed output
        '''
        # Truthiness covers both 0 and None; comparing None > 0 would raise
        # TypeError on an object that has not been filled in yet.
        if self.num_seqs:
            num_seqs = str(self.num_seqs)
            num_bases = str(self.num_bases)
            min_len = str(self.min_len)
            average = str(self.average)
            max_len = str(self.max_len)
        else:
            num_seqs = num_bases = "0"
            min_len = average = max_len = "-"
        return "\t".join([filename, num_seqs, num_bases, min_len, average,
                          max_len])


def process_files(options):
    '''Print one line of FastaStats output per input.

    Each FASTA file named on the command line is opened and summarised in
    turn. When no file names were given, a single summary is computed from
    the standard input (stdin) and labelled "stdin". A file that cannot be
    opened ends the program through exit_with_error with
    EXIT_FILE_IO_ERROR.

    Arguments:
        options: the command line options of the program
    Result:
        None
    '''
    if not options.fasta_files:
        logging.info("Processing FASTA file from stdin")
        stdin_stats = FastaStats().from_file(sys.stdin, options.minlen)
        print(stdin_stats.pretty("stdin"))
        return

    for fasta_filename in options.fasta_files:
        logging.info("Processing FASTA file from %s", fasta_filename)
        try:
            handle = open(fasta_filename)
        except IOError as open_error:
            exit_with_error(str(open_error), EXIT_FILE_IO_ERROR)
        else:
            # The with block closes the handle once the stats are printed.
            with handle:
                file_stats = FastaStats().from_file(handle, options.minlen)
                print(file_stats.pretty(fasta_filename))


def init_logging(log_filename):
    '''Set up logging to a file when a log file name is supplied.

    When log_filename is not None the logging module is configured to
    write to that file at DEBUG level (opened with mode 'w'), and two INFO
    records are logged: a start-up message and the command line taken from
    sys.argv. When log_filename is None nothing is configured or logged.

    Arguments:
        log_filename: either None, if logging is not required, or the
            string name of the log file to write to
    Result:
        None
    '''
    if log_filename is None:
        return

    log_config = {
        'filename': log_filename,
        'level': logging.DEBUG,
        'filemode': 'w',
        'format': '%(asctime)s %(levelname)s - %(message)s',
        'datefmt': "%Y-%m-%dT%H:%M:%S%z",
    }
    logging.basicConfig(**log_config)
    logging.info('program started')
    command_line = ' '.join(sys.argv)
    logging.info('command line: %s', command_line)


def main():
    '''Run the program from start to finish: parse the command line, set up
    logging, print the header line, then print the stats for each input.
    '''
    cli_options = parse_args()
    init_logging(cli_options.log)
    print(HEADER)
    process_files(cli_options)


# If this script is run from the command line then call the main function.
# (When the module is imported instead, __name__ is the module name and
# nothing is executed here.)
if __name__ == '__main__':
    main()
Loading

0 comments on commit a81b6dd

Please sign in to comment.