#!/usr/bin/env perl

#$Id$

# Copyright  2003, Stowers Institute for Medical Research.  All
# rights reserved.

# Redistribution and use in source and binary forms, with or without
# modifications, are permitted provided that the following conditions
# are met:

# Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.

# Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.

# Neither the name of Stowers Institute for Medical Research nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written
# permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE, ARE DISCLAIMED.  IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILTY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

# Written by Malcolm Cook, Stowers Institute for Medical
# Research. Please submit all inquiries regarding this software to
# Malcolm Cook, Stowers Institute for Medical Research, 1000 East 50th
# Street, Kansas City, MO 64110, mec@stowers-institute.org.

=pod

=head1 SYNOPSIS

gff2das --man | --help | --verbose | --version

gff2das --preferred_group g1,g2...gn   --gffversion N features.gff > features.das

gff2das --groupclass MyGroup  --gffversion 1  v1features.gff > v1features.das

Translate GFF lines to 'DAS [annotations]' format such as are used by
DAS/Genboree.  Adheres to required interpretation of GFF per
documentation in Bio::DB::GFF.

=head1 OPTIONS

Options are specified on the command line.  Their case is ignored, and
they may be abbreviated to uniqueness (i.e. --h instead of --help).

=over

=item B<--gffversion N>

Is only used to indicate how to parse attribute value pairs in column
9.

Version 3 expects an '=' between attribute and value.

It is overridden by any '##gff-version' line appearing in the input as
allowed by GFF spec.

=item B<--preferred_group>

Controls which attribute in column 9 to promote to the DAS group class
and name in columns 1 and 2.  It is overridden by any '##group-tags'
line in the GFF input (cf. GFF spec).  The preferred groups default to
'Sequence' and 'Transcript' since, per Bio::DB::GFF: 'For backward
compatibility, the tags Sequence and Transcript are always treated as
grouping tags unless preferred_tags are specified.'

Also, per Bio::DB::GFF spec: '"Target" tag is always used for grouping
regardless of the preferred_groups() setting'... and it is expected to
be followed by either a target name (placed in output column 2 as
group name if present) and 2 match location values, or just the match
location values, which in either case are placed in DAS tstart and
tend columns.

=item B<--groupclass>

only valid with gffversion = 1 where column 9 holds a single value,
namely, a 'group'.  This option specifies the common group class,
which defaults to '.'.  

=item B<--help>

Display command line usage with options.

=item B<--man>

Display complete manual page and exit.

=item B<--verbose>

Provides a trace of processing on STDERR.

=item B<--version> 

Display the script's version number and exit.

=back

=head1 DESCRIPTION

translate GFF lines to 'DAS [annotation]' format such as are used by
DAS/Genboree.  Handle group semantics in accordance with GFF
documentation in Bio::DB::GFF

GFF validation is very loose - if it has 9 columns it is treated as
GFF and parsed.

=head1 EXAMPLES

=over

=item C< gff2das --man >

print a manpage

=item C< gff2das --preferred_group 'TFBS,TF' --gffversion 3  gff2das.test.gff >

when TFBS or TF appear as attribute name in column 9 of the GFF,
promote it to the GFF class (favoring TFBS if both are present).

=back

=head1 VERSION

$Revision:  0.01$

=head1 AUTHOR

Malcolm Cook (mec@stowers-institute.org)

=head1 DEPENDENCIES

perl

=head1 AVAILABILITY

http://research.stowers-institute.org/mec/software/scripts/gff2das

=head1 TO DO

=over

=item handle escaping and quoting in column 9.

=back

=cut

use warnings;
use strict;

our $VERSION =  qw$Revision: 0.01 $[-1];
our $VC_DATE =  qw$Date: $[-2];

use Getopt::Long;
use Pod::Usage;
use FindBin qw($Script);
use Data::Dumper;

# Targets for the command-line switches.  --man and --help start out
# false; --verbose and --version stay undefined until given.
my ($man, $help) = (0, 0);
my ($verbose, $version);

# Conversion settings; each may be supplied on the command line.
my (@preferred_group, $gffversion, $groupclass);

# An unparsable command line prints the usage message and exits with
# status 2.
unless (
  GetOptions(
    'help|?'            => \$help,
    'man!'              => \$man,
    'verbose!'          => \$verbose,
    'version!'          => \$version,
    'preferred_group:s' => \@preferred_group,
    'gffversion:s'      => \$gffversion,
    'groupclass:s'      => \$groupclass,
  )
) {
  pod2usage(2);
}

# Informational switches: each one reports and terminates the script.
if ($help) {
  pod2usage(1);
}
if ($man) {
  pod2usage(-exitstatus => 0, -verbose => 2);
}
if ($version) {
  print "$FindBin::Script: $VERSION\n";
  exit(0);
}

# Fall back to 2.5 (aka GTF format) when no GFF version was requested.
$gffversion = 2.5 unless $gffversion;

# The POD documents --groupclass as valid only with GFF version 1; the
# check that would enforce this is disabled:
#die "groupclass only valid with gff_version = 1" if $groupclass && $gffversion ne 1;
$groupclass = '.' unless $groupclass;

# --preferred_group may be repeated and/or carry a comma delimited list:
# glue everything together first, then split the result on commas.
my $preferred_csv = join ',', @preferred_group;
@preferred_group = split /,/, $preferred_csv;

# Implement Bio::DB::GFF - 'For backward compatibility, the tags
# Sequence and Transcript are always treated as grouping tags unless
# preferred_tags are specified.'
if (!@preferred_group) {
  @preferred_group = ('Sequence', 'Transcript');
}

use List::Util qw(first);

# Return the integer major version of a GFF version string such as
# '2.5', '3' or '3.1.26'.  Anything undefined or unparsable counts as
# version 2, i.e. whitespace separated 'tag value' attributes.
sub _gff_major_version {
  my ($version) = @_;
  my ($major) = defined $version ? $version =~ /^\s*(\d+)/ : ();
  return defined $major ? $major + 0 : 2;
}

# Split the text of GFF column 9 into a list of [tag, value] pairs, in
# input order.  Attributes are separated by ';' with optional
# surrounding whitespace.  Tag and value are separated by '=' for GFF
# version 3 and by whitespace otherwise.  An attribute without a value
# gets the empty string, so one bare tag can not shift the pairing of
# the tags and values that follow it.
sub _parse_attributes {
  my ($text, $major_version) = @_;
  $text =~ s/^\s+|\s+$//g;
  my $tag_value_sep = $major_version == 3 ? qr/\s*=\s*/ : qr/\s+/;
  my @pairs;
  for my $field (split /\s*;\s*/, $text) {
    next unless length $field;	# tolerate ';;' and a trailing ';'
    my ($tag, $value) = split $tag_value_sep, $field, 2;
    push @pairs, [$tag, defined $value ? $value : ''];
  }
  return @pairs;
}

# Convert one chomped GFF line to a tab delimited DAS annotation line.
#
#   $line       - the input line
#   $gffversion - GFF version string (only its major number is used)
#   $groupclass - class prepended to column 9 for GFF version 1
#   $preferred  - array ref of tags to try, in order, as the group class
#
# Returns the DAS line, or nothing when $line does not have exactly 9
# tab separated columns.  Output columns are: class, name, method,
# source, ref, start, stop, strand, phase, score, tstart, tend,
# attributes.  Class and name are empty when no attribute can serve as
# the group; tstart and tend are '.' when not supplied by a Target.
sub _gff_line_to_das {
  my ($line, $gffversion, $groupclass, $preferred) = @_;

  my @F = split /\t/, $line;
  return unless @F == 9;	# 9 fields likely to be GFF - let's assume it is!
  my ($ref, $source, $method, $start, $stop, $score, $strand, $phase, $column9) = @F;

  my $major_version = _gff_major_version($gffversion);
  # GFF version 1 holds a bare group in column 9: give it its class.
  $column9 = "$groupclass $column9" if $groupclass && $major_version == 1;

  # Remember the first-seen order of the tags so that the output does
  # not depend on hash ordering.  The last value of a repeated tag wins.
  my (%value_of, @tags);
  for my $pair (_parse_attributes($column9, $major_version)) {
    my ($tag, $value) = @$pair;
    push @tags, $tag unless exists $value_of{$tag};
    $value_of{$tag} = $value;
  }

  # implement Bio::DB::GFF :  '"Target" tag is always used for grouping regardless of the preferred_groups() setting'...
  my $group_class = first { exists $value_of{$_} } ('Target', @{ $preferred || [] });
  $group_class ||= $tags[0];
  # implement Bio::DB::GFF - 'the tags "tstart", "tend" and "Note" cannot be used for grouping'
  # Anchored so that tags merely containing one of these words (e.g.
  # 'Extended') remain usable.
  undef $group_class
    if defined $group_class && $group_class =~ m/^(?:tstart|tend|Note)$/i;

  my ($group_name, $tstart, $tend) = ('', '.', '.');
  if (defined $group_class) {
    $group_name = delete $value_of{$group_class};
    $group_name = '' unless defined $group_name;
    if ($group_class eq 'Target') {
      # A Target value is either 'name tstart tend' or just 'tstart tend'.
      my @parts = split ' ', $group_name;
      $group_name = @parts == 2 ? '' : shift(@parts);
      $group_name = '' unless defined $group_name;
      my ($target_start, $target_end) = @parts;
      $tstart = $target_start if defined $target_start;
      $tend   = $target_end   if defined $target_end;
    }
  } else {
    $group_class = '';
  }

  # Whatever was not promoted to the group stays an attribute.
  my $attributes = join " ; ",
    map { length($value_of{$_}) ? "$_ $value_of{$_}" : $_ }
    grep { exists $value_of{$_} } @tags;

  return join "\t", ($group_class, $group_name, $method, $source, $ref, $start, $stop, $strand, $phase, $score, $tstart, $tend, $attributes);
}

# Main loop: every input line is echoed to STDOUT, translated to DAS
# when it looks like GFF and unchanged otherwise (pragmas, comments).
while (my $line = <>) {
  chomp $line;
  if ($line =~ m/^\s*##group-tags\s*(.*)/) {
    # implement Bio::DB::GFF - 'The GFF file itself can specify which tags are to be
    # used for grouping. Insert a comment like the following:
    #         ##group-tags Accession Locus
    @preferred_group = split /\s+/, $1;
  } elsif ($line =~ m/^\s*##gff-version\s*([\d\.]+)/) {
    $gffversion = $1;
  } else {
    my $das = _gff_line_to_das($line, $gffversion, $groupclass, \@preferred_group);
    $line = $das if defined $das;
  }
  print "$line\n";
}

exit 0;
