#!/usr/bin/env perl -w # phylofoot.pl # by Boris Lenhard # # See POD documentation for this script at the end of the file # use strict; use Getopt::Long; # for parsing command line arguments use Pod::Usage; use TFBS::DB::FlatFileDir; # DEFAULTS - presumes scripts are run from example/ directory # in the TFBS distribution # default values for optional parameters my $CONSERVATION_PERCENT = 70; my $MATRIX_THRESHOLD_PERCENT = 80; my $SLIDING_WINDOW = 50; # Get command line options - if you are curious how this # works, check the Getopt::Long module documentation. my ($database_dir, $alignment_file, $help); my @matrix_IDs = (); my $conservation = $CONSERVATION_PERCENT; my $threshold = $MATRIX_THRESHOLD_PERCENT; my $window = $SLIDING_WINDOW; GetOptions('help' => \$help, 'alignment_file=s' => \$alignment_file, 'database=s' => \$database_dir, 'matrix-id:s' => \@matrix_IDs, 'conservation:f' => \$conservation, 'threshold-score:f' => \$threshold, 'window-size:i' => \$window ); if($help) { pod2usage(-exitstatus=>0, -verbose=>2); } elsif (!($alignment_file and $database_dir)) { pod2usage(1); } # parse both the comma-separated lists and individually specified # matrix IDs: @matrix_IDs = split (",", join(',',@matrix_IDs)); # connect to FlatFileDir matrix database # (there is a sample FlatFileDir matrix database directory # examples/SAMPLE_FlatFileDir in the TFBS distribution package) # Change this line if you want to use a different type of database # (e.g. TFBS::DB::JASPAR2) my $db = TFBS::DB::FlatFileDir->connect($database_dir); # slurp the matrices into a TFBS::MatrixSet; my $matrixset; unless (scalar @matrix_IDs) { # if none are selected, we already have the set: $matrixset = $db->get_MatrixSet(-matrixtype=>"PWM"); # retrieves all matrices from the db } else { # otherwise we retrieve inly the requested ones $matrixset = $db->get_MatrixSet(-IDs => \@matrix_IDs, -matrixtype=>"PWM"); } # do the phylogenetic footprinting search of the sites my $sitepairset = $matrixset->search_aln(-file=>$alignment_file, -cutoff=>"$conservation", -threshold=>"$threshold\%", -windowsize => $window); # print it out in GFF format print $sitepairset->GFF; # ANd that's it. Instead of simply printing GFF, you can e.g. # - iterate through individual detected binding site pairs and extract # the information you need # - attach individual sittes as Bio::SeqFeature subclass objects # to bioperl's Bio::Seq objects # - collect the hit sequences and produce derived pattern matrices # using TFBS::Pa # - you tell me... # The rest is usage message if the user requests help # or fails to provide required parameters __END__ =head1 NAME phylofoot.pl - Phylogenetic footprinting example script =head1 SYNOPSIS ./phylofoot.pl -a -d [other_options] =head1 OPTIONS =over 8 =item B<-d or --database> REQUIRED: Name of the FlatFileDir database directory to use for retrieving matrices. A sample database directory examples/SAMPLE_FlatFileDir is available in TFBS distribution. =item B<-a or --alignment-file> REQUIRED: Name of the pairwise alignment file in Clustal format. A sample database directory examples/sample_alignment.aln is available in TFBS distribution. =item B<-m or --matrix-id> OPTIONAL: ID of the matrix from the database to scan the alignment with.You can specify multiple matrices using multiple -m switches or a single comma-separated lists of IDs (NO spaces - e.g. -m M00001,M00021,N01921 ). You can use a script called examples/list_matrices.pl in TFBS distribution to list information for all matrices in a matrix database of the FlatFileDir type. DEFAULT: If no matrix IDs are specified, all matrices in the database are used for the search; =item B<-w or --window-size> OPTIONAL: The width of sliding window for calculating the conservation profile of the submitted pairwise alignment. DEFAULT: If not specified, the default value is 50 (nucleotides). =item B<-c or --conservation> OPTIONAL: Conservation cutoff (in percent) for a region of multiple alignment to include detected conserved sites into output. DEFAULT: If not specified, the default value is 70 (%). =item B<-t or --threshold-score> OPTIONAL: Threshold score (in percent) for a matrix match to a subsequence. DEFAULT: If not specified, the default value is 80 (%). =back =head1 DESCRIPTION This is an example script that scans conserved regions of a pairwise DNA sequence alignment with a set of matrices form a flat file databases and produces GFF output. Its source code is meant to be studied by bioinformaticians who wish to learn how to use TFBS modules. =cut