-
Notifications
You must be signed in to change notification settings - Fork 3
/
bs-seq.pl
executable file
·435 lines (317 loc) · 15.8 KB
/
bs-seq.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
#!/usr/bin/env perl
use warnings;
use strict;
use Data::Dumper;
use Carp;
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
use File::Spec;
use File::Path;
use File::Basename;
# SIGINT trap. Ctrl-c triggers unclean exit.
$SIG{INT} = sub {croak "Received SIG$_[0]. Exiting...\n"};
my $left_read; # required
my $right_read; # required
my $reference; # required
my $base_name = 'out';
my $overwrite;
my $read_size = 76; # length of the reads
my $library_size = 300; # for paired ends, approx size of the reads. For single, use 0
my $mismatches = 2; # num mismatches to allow for bowtie
my $organism = q{.}; # label for collectStats.
my $batch = 1; # label for collectStats.
my $window_size = 50;
my $trust_dash_2 = 0;
my $single_ends = 1;
my @left_splice;
my @right_splice;
my @groups = (); # sequences (chr1, chr2, ...)
my $aligner = 'bowtie';
my $max_hits = 0; # cutoff for bowtie alighnment before discarding
my $random_assign = 1; # if there is multiple reconciliation between ends, use one randomly. Otherwise discard.
my $pthreads = 1; # not used... for bowtie.
my $di_nuc_freqs = 0; # if true, CG, CT, CC, CA instead of CHH, CHG, CG.
my @contexts; # CG, CHG, CHH, etc.
my @date = localtime (time);
my $out_dir = File::Spec->catdir (
File::Spec->curdir(),
sprintf "DZ_full-run_%4d-%02d-%02d_%02d.%02d.%02d",
$date[5] + 1900, $date[4] + 1, $date[3], $date[2], $date[1], $date[0]
);
# Grabs and parses command line options
my $result = GetOptions (
'left-read|l=s' => \$left_read,
'right-read|r=s' => \$right_read,
'reference|f=s' => \$reference,
'base-name|b=s' => \$base_name,
'overwrite|o' => \$overwrite,
'read-size|s=i' => \$read_size,
'library-size|k=i' => \$library_size,
'mismatches|n=i' => \$mismatches,
'organism|t=s' => \$organism,
'batch|i=i' => \$batch,
'window-size|w=i' => \$window_size,
'trust-dash-2|2=i' => \$trust_dash_2,
'single-ends|1=i' => \$single_ends,
'left-splice|ls=i{2}' => \@left_splice,
'right-splice|rs=i{2}' => \@right_splice,
'groups|g=s{,}' => \@groups,
'out-directory|d=s' => \$out_dir,
'aligner|a=s' => \$aligner,
'max-hits|mh=i' => \$max_hits,
'random-assign|rnd=i' => \$random_assign,
'pthreads|pt=i' => \$pthreads,
'di-nuc-freqs|dnf=i' => \$di_nuc_freqs,
'contexts|ct=s{,}' => \@contexts,
'verbose|V' => sub { use diagnostics; },
'quiet|q' => sub { no warnings; },
'help|h' => sub { pod2usage ( -verbose => 1 ); },
'manual|m' => sub { pod2usage ( -verbose => 2 ); }
);
# Check required command line parameters
pod2usage ( -verbose => 1 )
unless $result and $left_read and $right_read and $reference;
# Get all chromosomes, pseudo-chromosomes, groups, etc, in the fasta reference file
# Discards all information after first blank character in fasta header
unless (@groups) {
open my $REFERENCE, '<', $reference or croak "Can't open $reference: $!";
while (<$REFERENCE>) {
if (m/>/) {
tr/A-Z/a-z/;
m/>([^\s]+)/ && push @groups, $1;
}
}
close $REFERENCE or carp "Can't close $reference: $!";
}
@left_splice = (1, $read_size) unless @left_splice;
@right_splice = (1, $read_size) unless @right_splice;
# check whether output directory should be created, exists, should overwritten
if (-d $out_dir and $overwrite) {rmtree ( $out_dir, {keep_root => 1} )}
elsif (! -d $out_dir) {
mkpath ( File::Spec->catfile ($out_dir, 'windows'), {verbose => 1} );
mkpath ( File::Spec->catfile ($out_dir, 'single-c'), {verbose => 1} );
}
else {warn " overwrite $out_dir"}
unless (@contexts) {
if ($di_nuc_freqs) {@contexts = qw(CA CC CG CT)}
else {@contexts = qw(CG CHG CHH)}
}
my %files = (
# left fasta, and bisulfite treated
lfa => File::Spec->catfile ($out_dir, basename($left_read)) . '.fa',
lc2t => File::Spec->catfile ($out_dir, basename($left_read)) . '.c2t',
# right fasta, and bisulfite treated
rfa => File::Spec->catfile ($out_dir, basename($right_read)) . '.fa',
rg2a => File::Spec->catfile ($out_dir, basename($right_read)) . '.g2a',
# left/right eland3, from bowtie
lel3 => File::Spec->catfile ($out_dir, basename($left_read)) . "_$left_splice[0]-$left_splice[1].eland3",
rel3 => File::Spec->catfile ($out_dir, basename($right_read)) . "_$right_splice[0]-$right_splice[1].eland3",
# output of correlated paired ends
base => File::Spec->catfile ($out_dir, $base_name) . '.gff',
# output of collect align states
log => File::Spec->catfile ($out_dir, $base_name) . '.log',
# <output dir>/basename-sequence.gff
split => _gen_files (File::Spec->catfile ($out_dir, $base_name), 'gff', @groups),
# <output dir>/single-c/basename-sequence.single-c.gff
freq => _gen_files (File::Spec->catfile ($out_dir, 'single-c', $base_name), 'single-c.gff', @groups),
# <output dir>/single-c/basename-sequence.single-c-CG.gff,
# <output dir>/single-c/basename-sequence.single-c-CHG.gff, etc.
cont => [map { _gen_files (File::Spec->catfile ($out_dir, 'single-c', $base_name), "single-c-$_.gff", @groups) } @contexts],
# <output dir>/windows/basename-sequence.w50-CG.gff,
# <output dir>/windows/basename-sequence.w50-CHG.gff, etc.
wcont => [map { _gen_files (File::Spec->catfile ($out_dir, 'windows', $base_name), "w${window_size}-$_.gff", @groups) } @contexts],
);
# convert reads
run_cmd ("perl -S fq_all2std.pl fq2fa $left_read > $files{lfa}") unless file_exists($files{lfa});
run_cmd ("perl -S convert.pl c2t $files{lfa} > $files{lc2t}") unless file_exists($files{lc2t});
unless ($single_ends) {
run_cmd ("perl -S fq_all2std.pl fq2fa $right_read > $files{rfa}") unless file_exists($files{rfa});
run_cmd ("perl -S convert.pl g2a $files{rfa} > $files{rg2a}") unless file_exists($files{rg2a});
}
# convert genomes
run_cmd ("perl -S rcfas.pl $reference > $reference.rc") unless file_exists("$reference.rc");
run_cmd ("perl -S convert.pl c2t $reference.rc > $reference.c2t") unless file_exists("$reference.c2t");
run_cmd ("perl -S convert.pl g2a $reference.rc > $reference.g2a") unless file_exists("$reference.g2a") or $single_ends;
# align with bowtie
run_cmd ("bowtie-build $reference.c2t $reference.c2t") unless file_exists("$reference.c2t.1.ebwt");
run_cmd ("bowtie-build $reference.g2a $reference.g2a") unless file_exists("$reference.g2a.1.ebwt") or $single_ends;
my $l3trim = $read_size - $left_splice[1];
my $l5trim = $left_splice[0] - 1;
my $r3trim = $read_size - $right_splice[1];
my $r5trim = $right_splice[0] - 1;
# align with bowtie
run_cmd ("bowtie $reference.c2t -f -B 1 -v $mismatches -5 $l5trim -3 $l3trim --best" . ($max_hits ? " --strata -k $max_hits -m $max_hits" : q{}) . " --norc $files{lc2t} $files{lel3}" ) unless file_exists($files{lel3});
unless ($single_ends) {
run_cmd ("bowtie $reference.g2a -f -B 1 -v $mismatches -5 $r5trim -3 $r3trim --best" . ($max_hits ? " --strata -k $max_hits -m $max_hits" : q{}) . " --norc $files{rg2a} $files{rel3}" ) unless file_exists($files{rel3});
}
else {
run_cmd ("bowtie $reference.c2t -f -B 1 -v $mismatches -5 $r5trim -3 $r3trim --best" . ($max_hits ? " --strata -k $max_hits -m $max_hits" : q{}) . " --norc $files{lc2t} $files{rel3}" ) unless file_exists($files{rel3});
}
# get back original non-converted reads and convert from bowtie to eland3
run_cmd ("perl -S parse_bowtie.pl -u $files{lfa} -s @left_splice $files{lel3} -o $files{lel3}.post") unless file_exists("$files{lel3}.post");
unless ($single_ends) {
run_cmd ("perl -S parse_bowtie.pl -u $files{rfa} -s @right_splice $files{rel3} -o $files{rel3}.post") unless file_exists("$files{rel3}.post");
}
else {
run_cmd ("perl -S parse_bowtie.pl -u $files{lfa} -s @right_splice $files{rel3} -o $files{rel3}.post") unless file_exists("$files{rel3}.post");
}
# make sure reads map together
run_cmd ("perl -S correlatePairedEnds.pl -l $files{lel3}.post -r $files{rel3}.post -ref $reference -o $files{base} -t 0 -d $library_size -s $read_size -2 $trust_dash_2 -1 $single_ends -m $max_hits -a $random_assign") unless file_exists($files{base});
# basic stats about the aligment
run_cmd ("perl -S collect_align_stats.pl $files{lel3}.post $files{rel3}.post $files{base} $organism $batch > $files{log}") unless file_exists($files{log});
# quantify methylation
for (@groups) {
run_cmd ("perl -S split_gff.pl --sequence all $files{base}") unless (file_exists($files{split}->{$_}));
run_cmd ("perl -S countMethylation.pl --ref $reference --gff $files{split}->{$_} --output $files{freq}->{$_} --sort -d $di_nuc_freqs") unless file_exists($files{freq}->{$_});
}
# window methylation counts into non-overlapping windows
for my $context (0 .. @contexts - 1) {
for my $group (@groups) {
run_cmd ("perl -S split_gff.pl --feature all $files{freq}->{$group}")
unless file_exists($files{cont}->[$context]{$group});
run_cmd ("perl -S window_gff.pl $files{cont}->[$context]{$group} --width 1 --step 1 --output $files{cont}->[$context]{$group}.merged")
unless file_exists("$files{cont}->[$context]{$group}.merged");
run_cmd ("perl -S window_gff.pl $files{cont}->[$context]{$group}.merged --width $window_size --step $window_size --output $files{wcont}->[$context]{$group} --no-skip")
unless file_exists($files{wcont}->[$context]{$group});
}
}
### DONE
sub _error {
my ($e) = @_;
if ($e == -1) {
return "failed to execute: $!\n";
}
elsif ($e & 127) {
return sprintf "child died with signal %d, %s coredump\n",
($e & 127), ($e & 128) ? 'with' : 'without';
}
else {
return sprintf "child exited with value %d\n", $e >> 8;
}
}
sub run_cmd {
my ($cmd) = @_;
warn "-- CMD: $cmd\n";
eval {system ("$cmd") == 0 or croak _error( $? )};
croak "** failed to run command '$cmd': $@" if $@;
}
sub _gen_files {
my ($base, $ext, @groups) = @_;
my %split_files;
for my $group (@groups) {
$split_files{$group} = "$base-$group.$ext";
}
return \%split_files;
}
sub file_exists {
my $file = shift;
print "$file exists: skipping..." && return 1 if -f $file and -s $file;
return 0;
}
=head1 NAME
bs-seq.pl - Run dzlab's bisulfite sequencing analysis pipeline
=head1 SYNOPSIS
Paired ends example, where s_7_1_sequence.txt is the left read and s_7_2_sequence is the right.
bs-seq.pl --left-read s_7_1_sequence.txt --right-read s_7_2_sequence.txt --reference REF_COL.fa --base-name test-out --read-size 45 --library-size 300 --mismatches 2 --organism leco --batch 1 --left-splice 1 40 --right-splice 5 45
Single ends example. Notice the left and right reads are the same file.
bs-seq.pl -l reads.fastq -r reads.fastq -f /path/to/genomes/TAIR_reference.fas -b bsseqrun -s 76 -k 0 -n 2 -t Arabidopsis -i 1 -ls 1 45 -rs 46 76 -1 1 -2 0 -rnd 0 -mh 10 -d output dir
=head1 DESCRIPTION
=head1 OPTIONS
=over 1
=item --reference <fasta> | --f <fasta>
Reference genome file in Fasta format. Required.
=item -l <fastq> | --left-read <fastq>
Left reads file in fastq format. For single ends, -l and -r should be the same. Required.
=item -r <fastq> | --right-read <fastq>
Left reads file in fastq format. For single ends, -l and -r should be the same. Required.
=item -l <start> <end> | --left-splice <start> <end>
Start and end coordinate for the chunk of the --left-read fastq file to use for the left alignment. Required.
For example, if you are doing single ends with 1-45 and 46-76, use "-ls 1 45".
=item -r <start> <end> | --right-splice <start> <end>
Start and end coordinate for the chunk of the --right-read fastq file to use for the right alignment. Required.
For example, if you are doing single ends with 1-45 and 46-76, use "-rs 46 76".
=item -s <len> | --read-size <len>
Length of each read. 76, 100, etc. Required.
=item -b <label> | --base-name <label>
Label for the file names... can be anything. Choose something descriptive. Required
=item -d | --out-directory
Directory to put all result files. Required.
=item -o <boolean> | --overwrite <boolean>
Whether to overwrite the --directory if it already exists. Default 0.
=item -k <len> | --library-size <len>
Approx length of the molecules in PAIRED ends. Default 300. For single ends, use 0. Ie, for paired ends, this
parameter helps deal with possible inserts between the left and right reads:
|------------------------------------| (original read, paired end)
|-----------> 1', aligned to c2t
|---------| insert
<------------| 2', aligned to g2a
For single ends, there isn't an insert use 0.
|----------------------| (original read, single end)
|-----------> 1'
| insert (size 0)
|--------> 2' (simulated. compared against c2t just like 1')
=item -t <orgname> | --organism <orgname>
Label for collect_align_stats.pl (the .log file produced). Arabidopsis, Rice, Puffer, etc.
=item -w <size> | --window-size <size>
Default 50, for windowing single-c files.
=item -1 <boolean> | --single-ends <boolean>
1 if single ends, 0 if paired. Default 1.
=back
=head2 BOWTIE OPTIONS
=over
=item -m <hits> | --max-hits <hits>
Discards reads that map to the genome more the this many times, passed to bowtie. In repetitive sections of the genome,
reads can potentially map hundreds of times, so this helps us filter repetitive chunks out.. Defaults to 0 for no filtering.
Daniel says 10 is a good number to use.
=item -n <num> | --mismatches <num>
For bowtie. For each read alignment, allow this many mismatches. Default 2.
=back
=head2 Ends CORRELATION OPTIONS
=over
=item -r | --random-assign
For correlatedPairedEnds.pl. When there are multiple possible reconciliations between the left and right alignments,
assign one randomly. For Arabidopsis and other organisms with lower levels of repetitive sequences, use 0. For maize
and similar, use 1. Default 1.
=item -2 <boolean> | --trust-dash-2 <boolean>
For correlatedPairedEnds.pl. If 1, when the dash-2 reads align when the dash-1 reads don't, KEEP the read. This
should be 0 for single ends since the downstream portion (right reads) of each read will be lower quality than the
upstream (left reads). Default 0.
=back
=head2 Less frequently used options
=over
=item -d | --di-nuc-freqs
When this is 0, calculate the CG, CHH and CHG contexts. If 1, calculate CG, CA, CT, CC. Default 0.
=item -i <num> | --batch <num>
Label for collect_align_stats.pl (the .log file produced). Default to 1.
=back
=over
=item -V | --verbose
=item -q | --quiet
=item -h | --help
=item -m | --manual
=back
=head1 REVISION
Version 0.0.1
$Rev: 440 $:
$Author: psilva $:
$Date: 2010-11-11 16:15:45 -0800 (Thu, 11 Nov 2010) $:
$HeadURL: http://dzlab.pmb.berkeley.edu/svn/bisulfite/trunk/bs-seq.pl $:
$Id: bs-seq.pl 440 2010-11-12 00:15:45Z psilva $:
=head1 AUTHOR
Pedro Silva <pedros@berkeley.edu/>
Zilberman Lab <http://dzlab.pmb.berkeley.edu/>
Plant and Microbial Biology Department
College of Natural Resources
University of California, Berkeley
=head1 COPYRIGHT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=cut