TFBS::PatternGen
Gibbs
Summary
TFBS::PatternGen::Gibbs - a pattern factory that uses Chip Lawrences Gibbs program
Package variables
No package variables defined.
Included modules
Bio::Seq
Bio::SeqIO
File::Temp qw ( :POSIX )
Inherit
Synopsis
my $patterngen =
TFBS::PatternGen::Gibbs->new(-seq_file=>'sequences.fa',
-binary => '/Programs/Gibbs-1.0/bin/Gibbs'
-nr_hits => 24,
-motif_length => [8, 9, 10],
-additional_params => '-x -r -e');
my $pfm = $patterngen->pattern(); # $pfm is now a TFBS::Matrix::PFM object
Description
TFBS::PatternGen::Gibbs builds position frequency matrices
using an advanced Gibbs sampling algorithm implemented in external
Gibbs program by Chip Lawrence. The algorithm can produce
multiple patterns from a single set of sequences.
Methods
_parse_Gibbs_output | No description | Code |
_parse_raw_matrix | No description | Code |
_parse_raw_motif | No description | Code |
_run_Gibbs | No description | Code |
_site_props | No description | Code |
new | Description | Code |
Methods description
Title : new Usage : my $db = TFBS::PatternGen::Gibbs->new(%args); Function: the constructor for the TFBS::PatternGen::Gibbs object Returns : a TFBS::PatternGen::Gibbs object Args : This method takes named arguments; you must specify one of the following three -seq_list # a reference to an array of strings # and/or Bio::Seq objects # or -seq_stream # A Bio::SeqIO object # or -seq_file # the name of the fasta file containing # all the sequences Other arguments are: -binary # a fully qualified path to Gibbs executable # OPTIONAL: default 'Gibbs' -nr_hits # a presumed number of pattern occurences in the # sequence set: it can be a single integer, e.g. # -nr_hits => 24 , or a reference to an array of # integers, e.g -nr_hits => [12, 24, 36] -motif_length # an expected length of motif in nucleotides: # it can be a single integer, e.g. # -motif_length => 8 , or a reference to an # array ofintegers, e.g -motif_length => [8..12] -additional_params # a string containing additional # command-line switches for the # Gibbs program |
Methods code
_parse_Gibbs_output | description | prev | next | Top |
sub _parse_Gibbs_output
{ my ($self, $resultstring) = @_;
$resultstring =~ s/.*=== MAP MAXIMIZATION RESULTS ===//s;
my @raw_motifs = split /\-+\n\s+MOTIF \w\n/s, $resultstring;
shift @raw_motifs; foreach my $raw_motif (@raw_motifs) {
my $motif =$self->_parse_raw_motif($raw_motif) || next;
push @{ $self->{'motifs'} }, $motif;
}
return 1; } |
sub _parse_raw_matrix
{ my $raw_matrix = shift;
my @lines = split "\n", $raw_matrix;
my (@A, @C, @G, @T);
foreach my $line (@lines) {
my $value_string;
next unless ($value_string) = $line =~ /\s+\d+\s+\|\s+(.+)/;
$value_string =~ s/\./0/g;
my ($a, $t, $c, $g) = split /\s+/, $value_string;
push @A, $a;
push @C, $c;
push @G, $g;
push @T, $t;
}
return [\@A,\@ C,\@ G,\@ T];
}
1; } |
sub _parse_raw_motif
{ my ($self,$raw_motif) = @_;
my ($raw_matrix, $raw_bp, $length, $nr_hits, $MAP_score) =
$raw_motif =~ /Motif model \(residue frequency x 100\)\n(.+)Motif probability model\n.+Background probability model\n\s+(.+?)\n.+\D(\d+) columns\nNum Motifs\: (\d+).+Difference of Logs of Maps = ([\-\.\d]+)\n/s;
return undef unless $raw_matrix;
my $sites = $self->_site_props($raw_motif);
my $matrix = _parse_raw_matrix($raw_matrix);
return
TFBS::PatternGen::Gibbs::Motif->new
(-length => $length."",
-bg_probabilities => [split /\s+/, $raw_bp],
-MAP_score => $MAP_score,
-tags => {MAP_score => $MAP_score},
-nr_hits => $nr_hits,
-sites=>$sites,
-matrix => $matrix ); } |
sub _run_Gibbs
{ my $self = shift;
my $tmp_file = tmpnam();
my $outstream = Bio::SeqIO->new(-file=>">$tmp_file", -format=>"fasta");
foreach my $seqobj (@{ $self->{'seq_set'} } ) {
$outstream->write_seq($seqobj);
}
$outstream->close();
my $command_line =
$self->{'binary'}." ".
" -PBernoulli ".
$tmp_file." ".
$self->{'motif_length_string'}." ".
$self->{'nr_hits_string'}." ".
$self->{'additional_params'}." -n";
my $resultstring = `$command_line`;
$self->_parse_Gibbs_output($resultstring);
unlink $tmp_file;
return 1 } |
sub _site_props
{ my ($self,$raw_motif)=@_;
my @sites;
$raw_motif=~s/.*Num Motifs:\s+\d+\n//s;
$raw_motif=~s/\n\s+\*+.*//s; my @site_lines=(split("\n", $raw_motif));
foreach my $site(@site_lines){
my $start_seq;
my $end_seq;
my ($seq_nr,$pattern_nr,$start,$seq,$end,$score,$strand,$desc)=$site=~/\s+(\d+),\s+(\d+)\s+(\d+)\s+([\w,\s]+)\s+(\d+)\s+(\d\.\d+)\s+([F,R])(.*)/;
if ($strand eq "F"){
$strand=1;
$start_seq=$start;
$end_seq=$end;
}
else{
$strand=-1;
$start_seq=$end;
$end_seq=$start;
}
my $site = Bio::SeqFeature::Generic->new ( -start => $start_seq,
-end => $end_seq,
-strand => $strand,
-source => 'Gibbs sampler',
-score => $score,
);
$site->attach_seq ($self->{'seq_set'}->[$seq_nr-1]);
push (@sites,$site);
}
foreach my $site(@sites){
}
return\@ sites; } |
sub new
{ my ($caller, %args) = @_;
my $self = bless {}, ref($caller) || $caller;
$self->{'motif_length_string'} =
($args{'-motif_length'}
? (ref($args{'-motif_length'})
? join(',', @{$args{'-motif_length'}})
: $args{'-motif_length'})
: 8 );
$self->{'nr_hits_string'} =
($args{'-nr_hits'}
? (ref($args{'-nr_hits'})
? join(',', @{$args{'-nr_hits'}})
: $args{'-nr_hits'})
: "" );
$self->{'additional_params'} =
($args{'-additional_params'}
? (ref($args{'-additional_params'})
? join(' ', @{$args{'-additional_params'}})
: $args{'-additional_params'})
: "" );
$self->{'binary'} = $args{'-binary'} || 'Gibbs';
$self->{'motifs'} = [];
$self->_create_seq_set(%args) or die ('Error creating sequence set');
$self->_run_Gibbs() or $self->throw("Error running Gibbs.");
return $self; } |
General documentation
The three methods listed above are used for the retrieval of patterns,
and are common to all TFBS::PatternGen::* classes. Please
see
TFBS::PatternGen for details.