#! /usr/bin/env python
"""Limit the length of sequence repeats in FASTQ reads.

Every run of more than ``rep_num`` consecutive copies of ``rep_str`` in a
read is truncated to exactly ``rep_num`` copies, and the same positions are
removed from the read's quality string so that sequence and quality stay
aligned. The result is written to standard output in FASTQ format.

Usage:
    trunc_rep.py <rep_num> <rep_str> <filename>

    rep_num   number of repeat units to keep in each run
    rep_str   the repeated string (repeat unit)
    filename  input file, read with ``screed.open``

Provenance: added as trunc_rep.py (formerly CG9.py) by bdklahn, 2011-08-10.
"rep" = "repeat".
"""
import re
import sys


def compile_repeat_pattern(rep_str, rep_num):
    """Return a compiled regex matching runs of more than *rep_num* units.

    The repeat unit is escaped and wrapped in a non-capturing group so the
    quantifier applies to the whole unit, not only to its last character.

    Args:
        rep_str: The repeat unit; must be non-empty.
        rep_num: Number of units to keep. Negative values are treated as 0.

    Raises:
        ValueError: If *rep_str* is empty.
    """
    if not rep_str:
        raise ValueError('rep_str must be a non-empty string')
    min_units = max(rep_num, 0) + 1
    return re.compile('(?:%s){%d,}' % (re.escape(rep_str), min_units))


def truncate_repeats(sequence, qual, rep_pat, keep_len):
    """Truncate every match of *rep_pat* to its first *keep_len* characters.

    The identical slices are taken from *qual*, so both returned strings
    shrink by the same amount at the same positions.

    Args:
        sequence: The read's sequence string.
        qual: The per-position quality string for *sequence*.
        rep_pat: Compiled pattern from ``compile_repeat_pattern``.
        keep_len: Number of characters to keep from the start of each match
            (``rep_num * len(rep_str)``).

    Returns:
        A ``(sequence, qual)`` tuple with over-long repeat runs truncated.
    """
    seq_parts = []
    qual_parts = []
    prev_end = 0
    # All offsets refer to the ORIGINAL strings; slicing in a single pass
    # avoids the index drift that occurs if qual is shortened per match.
    for match in rep_pat.finditer(sequence):
        cut = match.start() + keep_len
        seq_parts.append(sequence[prev_end:cut])
        qual_parts.append(qual[prev_end:cut])
        prev_end = match.end()
    seq_parts.append(sequence[prev_end:])
    qual_parts.append(qual[prev_end:])
    return ''.join(seq_parts), ''.join(qual_parts)


def main(argv=None):
    """Run the command-line tool; return a process exit status.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 2 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    usage = 'usage: %s <rep_num> <rep_str> <filename>\n' % argv[0]
    if len(argv) != 4:
        sys.stderr.write(usage)
        return 2
    try:
        rep_num = max(int(argv[1]), 0)
        rep_pat = compile_repeat_pattern(argv[2], rep_num)
    except ValueError as err:
        sys.stderr.write('error: %s\n' % err)
        sys.stderr.write(usage)
        return 2
    keep_len = rep_num * len(argv[2])

    # Imported here so the pure helpers above remain importable (and
    # testable) on systems where screed is not installed.
    import screed

    write = sys.stdout.write
    for record in screed.open(argv[3]):
        # NOTE(review): newer screed releases may expose the quality string
        # under a different attribute name - confirm against the installed
        # version.
        sequence, qual = truncate_repeats(
            record.sequence, record.accuracy, rep_pat, keep_len)
        write('@%s\n%s\n+\n%s\n' % (record.name, sequence, qual))
    return 0


if __name__ == '__main__':
    sys.exit(main())