#!/usr/local/bin/perl -w =pod seq2ssd v. 23 Jul 1996, Dave Schweisguth Converts a sequence file (which defines primary and secondary structure) into an ssd ("Spin System Database") to be used by other CHURN programs Input file format: ggcagggcucauaacccugcc pair 1 21 pair 2 20 pair 3 19 pair 4 18 pair 5 17 pair 6 16 pair 7 15 or # ITALY2 stem segment a ggcaggg segment b cccugcc pair a1 b7 pair a2 b6 pair a3 b5 pair a4 b4 pair a5 b3 pair a6 b2 pair a7 b1 or # Fragment I segment IV residue 1 ugccuggcgg segment III residue 69 gccgaugguagugugggg segment I residue 90 ccccaugcgagaguagggaacugccaggcau pair IV 1 I 119 pair IV 2 I 118 # etc. pair III 70 I 106 pair III 71 I 105 # etc. Blank lines and comments (lines beginning with #, ! or ;) are ignored. Segment names may contain one or more letters and/or underscores, no digits or hyphens. They are case-sensitive. Segment definitions are optional. Without them, the entire molecule is segment a. Defining a new segment name resets the residue number to 1. Residue numbers are integers. Residue number definitions are optional. Without them, numbering begins at 1 and increases by 1 per residue. Base pairs are optional. Segment names in base pair definitions are optional. A residue number not preceded by a segment name is assumed to be in the current segment. Whitespace between a segment name and the following residue number is optional. Sequence elements must precede base pairs which include them. Case is ignored except in segment names. ssd ("Spin System Database") structure: %ssd = { # ssd structure segs => { # Hash of segments $seg => { # Hash of residues $res => { # Residue structure type => Residue type spins => { # Hash of spins $spin => Chemical shift ... }, }, ... }, ... }, pairs => [ # Array of base pairs [ # Base pair structure seg1, # Segment of first partner res1, # Residue of first partner seg2, # Segment of first partner res2 # Residue of first partner ], ... ] }; To do: - Allow specification of a segment ID in a residue number definition. - Support type-segment-number-spin syntax? =cut ### Preliminaries require 5.002; # Perl 5.002 required use strict; # Require optional-but-desirable practices use vars qw($whatami); # Exempt globals from 'use strict' use Churn qw(%spins store_fd); use Ref qw(copyref); ### Parameters # Environment ($whatami = $0) =~ s|.*/||; # `basename $0` # Configuration my $seg = 'a'; # Default segment name my $res = 1; # Default first residue number # Initialization (don't change these) my $segd = 1; my $types = join('', keys %spins); my(%ssd, $line, @line, $pos, $char, $was_seg, $base, $pair); ### Arguments and error-checking # Parse args my($arg, $sign, $first, $rest); while (@ARGV and ($sign, $first, $rest) = ($ARGV[0] =~ /^([\-+])(.)(.*)/)) { if ($sign eq '+' && $first !~ /[\0]/) { # -/+ switches (none at the moment) &usage("$sign$first is not an option.\n"); } if ($first =~ /[\0]/) { # Switches with arguments (none at the moment) shift; $arg = $rest ne '' ? $rest : @ARGV ? shift : &usage("$sign$first requires an argument.\n"); } elsif ($rest eq '') { shift; } else { $ARGV[0] = "$sign$rest"; } if ($first eq 'u') { &usage(0); } else { &usage("$sign$first is not an option.\n"); } } sub usage { warn $_[0] ? "$whatami: $_[0]" : '', <) { next if $line =~ /^[#!;]/; @line = split(' ', $line); while($_ = shift @line) { if (/^seg/i) { if ($segd && keys %{$ssd{segs}}) { warn "$whatami: At $ARGV line $., the first segment is defined in mid-molecule.\n"; } $segd = 0; $seg = shift @line; $res = 1; } elsif (/^res/i) { $res = shift @line; if ($res !~ /^-?\d+$/) { die "$whatami: $ARGV line $.: $res is not an integer!\n"; } } elsif (/^[$types]*$/i) { foreach $pos (0 .. length($_) - 1) { $char = lc(substr($_, $pos, 1)); $ssd{segs}{$seg}{$res}{type} = $char; $ssd{segs}{$seg}{$res}{spins} = $spins{$char}; $res++; } } elsif (/^pair/i) { # Fill @pair with words, splitting if necessary my @pair = (); # Get a new reference to an empty array $was_seg = 0; # 0 if the previous element of @pair was a # res, 1 if it was a seg while (@pair < 4) { $_ = shift @line; if (/^([a-z_]+)$/i) { if (! $was_seg) { push(@pair, $1); $was_seg = 1; } else { die "$whatami: $ARGV line $.: $1 should have been a residue number!\n"; } } elsif (/^(-?\d+)$/) { if ($was_seg) { push(@pair, $1); } else { push(@pair, $seg, $1); # If seg is omitted, use the # current seg } $was_seg = 0; } elsif (/^([a-z_]+)(-?\d+)$/i) { if (! $was_seg) { push(@pair, $1, $2); } else { die "$whatami: $ARGV line $.: $1 should have been a residue number!\n"; } } else { die "$whatami: $ARGV line $.: $_ can't be part of a pair definition!\n"; } } # Test that segments and residues in pair have been defined foreach $base (0, 2) { if (! defined $ssd{segs}{$pair[$base]}) { die "$whatami: $ARGV line $.: segment $pair[$base] does not yet exist.\n"; } if (! defined $ssd{segs}{$pair[$base]}{$pair[$base + 1]}) { die "$whatami: $ARGV line $.: segment $pair[$base] residue $pair[$base + 1] does not yet exist.\n"; } } # Test for pair to self if ($pair[0] eq $pair[2] && $pair[1] eq $pair[3]) { die "$whatami: $ARGV line $.: segment $pair[0] residue $pair[1] is paired to itself.\n"; } # Test for repeat or inverted repeat of previous pairs foreach $pair (@{$ssd{pairs}}) { &array_eq(\@pair, $pair) && die "$whatami: $ARGV line $.: Duplicate pair!\n"; &array_eq([@pair[2, 3, 0, 1]], $pair) && die "$whatami: $ARGV line $.: Duplicate pair (inverted)!\n"; } push(@{$ssd{pairs}}, \@pair); } else { die "$whatami: $ARGV line $.: $_ means nothing to me!\n"; } } } # Run %ssd through copyref before storing so that each residue has its own # spins array (instead of all residues of each type having references to the # same spins array) &store_fd(©ref(\%ssd), 'STDOUT'); exit; ### Subroutines # Return 1 if arrays are equal, 0 otherwise sub array_eq { # Return 0 if either parameter is not a ref to an array if (ref $_[0] ne 'ARRAY' || ref $_[1] ne 'ARRAY') { return 0 }; # Arrays are equal if references are equal if ($_[0] eq $_[1]) { return 1 }; # Arrays are unequal if sizes are unequal if (@{$_[0]} != @{$_[1]}) { return 0 }; # Arrays are unequal if any corresponding elements are unequal foreach (0 .. $#{$_[0]}) { if ($_[0]->[$_] ne $_[1]->[$_]) { return 0 }; } 1; }