#!/usr/bin/gawk -f
###########################################################################
# This file is part of meadTools, version 2.2.
# 
# Copyright (c) 2001-2019, Instituto de Tecnologia Quimica e Biologica,
# Universidade Nova de Lisboa, Portugal.
# 
# meadTools is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 2 of the License, or (at your
# option) any later version.
# 
# meadTools is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with meadTools.  If not, see <http://www.gnu.org/licenses/>.
# 
# For further details and info check the README file.
# 
# You can get meadTools at www.itqb.unl.pt/simulation
###########################################################################


############################################################################
# makesites: a program to create a .sites file from a .pqr file.
#
# It can produce output for single-state sites (intended for standard
# multiflex use) or multiple-state (ie, tautomeric) sites (intended
# for meadT use).  Besides the standard aminoacid sidechain sites, it
# recognizes acetate (ACE), heme (HEM), and, in the tautomeric case,
# water (HOH, H2O, WAT or SOL).
#
# Several more or less arguable assumptions are made:
#
# It is assumed that three main types of .st files are used, namely
# the ones distributed with meadTools: "tau*" (ie, tau1, tau2, etc)
# for tautomeric sites, and "all" and "avx" for single-state sites.
#
# The recognition of N- and C-terminal sites is tentative, and must
# _always_ be checked! More exactly, an N-terminus is assigned if a
# standard aminoacid residue is the first or follows a C-terminal
# site, and has an atom named "N"; this can miss non-standard residues
# or make a wrong assignment after a capping fragment (eg, formyl).
# The recognition of the C-terminal relies on the use of the
# _non-standard_ atom name "CT"; this means that you can _not_ forget
# to change it!
#
# In the single-state case it is assumed that all .st are of the "avx"
# type, except for TYR and CYS, where it is assumed that the most
# usual is to have "really" a location for the proton, whose .st files
# we usually name as "all".  Another solution would be to assume that
# "protonated" forms without protons are used for these sites (meaning
# you could _not_ forget to remove the protons from the .pqr file!);
# this was not done.
#
# In the single-state case, HIS, HISA and HISB are treated
# differently, assuming that if you call HISA or HISB to your residue
# it is because you "really" know where the proton is in the neutral
# form, otherwise you would call it just HIS.  Note that HISA and HISB
# are _not_ recognized in the tautomeric case, where they make no
# sense in general.
#
# If the above assumptions for TYR, CYS and HIS in the single-state
# case do not seem to make sense in your system, it is because you
# should probably be using tautomers...
#
# A distance-based test is made for SS-bonds in CYS residues, and
# titrable CYS chosen accordingly.  The results of this "educated
# guess" are written to stderr as warnings (as many other checking
# stuff).  Note that residues called CYS1, CYS2, CYSB, CYSC, etc,
# normally used for SS-bonded CYS, are not treated in this way, being
# simply ignored.  In contrast, CYSH causes a warning (as do ASPH,
# GLUH, etc), since only CYS could produce a titrable site.
#
# Finally, all the above assumptions rely implicitly on some
# particular form for the .st files (which is not devoid of
# ambiguities: eg, should the *avx.st files have "empty" protons or
# not?).  In case of doubt always check the .st files directly.
############################################################################


# Main program: parse the command line, build the site tables, detect
# SS-bonds, then scan the .pqr file and print one line per titrable site.
#
# Globals set here: cmd, usage, tautomeric, pqr_file, naa, nNTR, nCTR,
# rname, rnumb, atom, fullmatch.
# Globals read here: n[], a[], std_aa, suspect (from make_definitions())
# and ssbond[] (from detect_ssbonds()).
BEGIN{
  cmd = "makesites" ;
  usage = "Usage: "cmd" s|t PQR_FILE\n" \
    "Options: s = single | t = tautomers\n" \
    "Afterwards check sites: bridged CYS, NT, CT, 'avx'.\n" \
    "Note that all 'single' sites are of 'avx' type, except TYR and CYS." ;
  if (ARGC != 3) error("Wrong number of arguments.\n" usage) ;
  if (ARGV[1] !~ /^(s|t)$/) error("Wrong option.\n" usage) ;
  if (ARGV[1] == "s") tautomeric = 0 ;
  else tautomeric = 1 ;
  filecheck(pqr_file = ARGV[2]) ;

  make_definitions() ;
  detect_ssbonds() ;

  # Read .pqr file and write site info:
  naa = nNTR = nCTR = 0 ;
  rname = rnumb = "" ;
  fullmatch = 0 ;
  # getline returns -1 on a read error, which is "true" in a bare
  # condition; compare explicitly with 0 so that an error ends the loop
  # instead of spinning forever.
  while ((getline < pqr_file) > 0)
  {
    # Ignore non-ATOM lines:
    if ($1 != "ATOM") continue ;
    # Check for new residue:
    if ($4 != rname || $5 != rnumb)
    {
      # Check for lacking atom in previous residue:
      if (n[rname] != "" && fullmatch == 0)
        warning("Residue "rname" "rnumb" lacks reference atom!") ;
      rname = $4 ;
      rnumb = $5 ;
      fullmatch = 0 ;
      if (rname ~ std_aa) naa++ ;
    }
    # Assign atom:
    atom = $3 ;
    # N-terminus site:
    # (considered if a standard residue is the first or follows a C-terminal
    #  site; only considered when reference atom is also matched)
    if (naa == 1 && atom == a["NTR"])
    {
      nNTR++ ;
      print_site("NTR", rname, rnumb) ;
    }
    # Detect suspect residues:
    # (only considered when reference atom is also matched)
    if (rname ~ suspect)
      if (atom == a[substr(rname,1,3)])
        warning("Suspect residue name: "rname" "rnumb".") ;
    # Non-terminus site:
    if (n[rname] != "" && atom == a[rname])
    {
      fullmatch = 1 ;
      # CYS flagged as SS-bonded by detect_ssbonds() are not titrable:
      if (rname != "CYS" || ssbond[rnumb] == 0)
        print_site(rname, rname, rnumb) ;
    }
    # Detect OXT atoms:
    if (atom == "OXT")
      warning("OXT atom found in residue "rname" "rnumb". " \
              "Unprocessed C-terminal?") ;
    # C-terminus site (chosen by the *non-standard* atom name):
    if (atom == a["CTR"])
    {
      nCTR++ ;
      # Set residue counter to zero for new chain:
      naa = 0 ;
      print_site("CTR", rname, rnumb) ;
    }
  }
  close(pqr_file) ;

  # The in-loop check only fires when a *new* residue starts, so the last
  # residue of the file has to be checked here:
  if (n[rname] != "" && fullmatch == 0)
    warning("Residue "rname" "rnumb" lacks reference atom!") ;

  if (nNTR == 0) warning("N-terminal not found! Check atom names, etc.") ;
  if (nCTR == 0) warning("C-terminal not found! Check atom names, etc.") ;
  if (nNTR != nCTR)
    warning("Numbers of N-termini ("nNTR") and C-termini ("nCTR \
            ") do not match!") ;

}

# Print one line of the .sites file to stdout.
#   st = site type, used as key into n[] and to choose prefix/suffix
#        (eg "NTR", "CTR", "ASP")
#   ri = residue name written in the .st file name(s)
#   rn = residue number (printed with "%4d")
# When n[st] is 1 a single name is printed, with the suffix chosen by
# suffix1(); otherwise n[st] names are printed, ending in tau1, tau2, ...
# The extra parameter i is a local variable; callers must not pass it.
# (It was previously an undeclared global, which could clobber a loop
# index of the same name in a caller.)
function print_site(st,ri,rn,    i)
{
  if (n[st] == 1)  # Non-tautomeric (avx) site
    printf "%4d %s%s%s\n", rn, prefix(st), ri, suffix1(st) ;
  else  # Tautomeric site
  {
    printf "%4d", rn ;
    for (i = 1 ; i <= n[st] ; i++) printf " %s%stau%d", prefix(st), ri, i ;
    printf "\n" ;
  }
}

# Site-name prefix: "NT" for an N-terminal site ("NTR"), "CT" for a
# C-terminal site ("CTR"), and the empty string for any other site type.
function prefix(s)
{
  return (s == "NTR") ? "NT" : ((s == "CTR") ? "CT" : "") ;
}

# Suffix of the .st file name for a single-state site: "all" when the
# site type is exactly TYR or CYS, "avx" for everything else.
function suffix1(s)
{
  return (s == "TYR" || s == "CYS") ? "all" : "avx" ;
}

# Distance-based detection of SS-bonds between CYS residues.
#
# Reads pqr_file (global) and collects, for every ATOM line whose residue
# name is exactly "CYS" and whose atom name is "SG", the residue number
# ($5) and the coordinates ($6, $7, $8).  Every pair of SG atoms whose
# squared distance is <= maxSGdist2 is considered bonded.
#
# Sets the globals:
#   nCYS       = number of CYS SG atoms found
#   ssbond[rn] = 1 if CYS with residue number rn is considered bonded,
#                0 otherwise
# The outcome for each CYS is reported through warning().
# All parameters are local variables; callers pass no arguments.
function detect_ssbonds(\
                        maxSGdist2, rn, x, y, z, i, j, d2)
{
  maxSGdist2 = 2.2^2 ;
  nCYS = 0 ;
  # Compare with 0 explicitly: getline returns -1 on a read error, which
  # would keep a bare "while (getline < file)" looping forever.
  while ((getline < pqr_file) > 0)
  {
    # Ignore non-ATOM lines:
    if ($1 != "ATOM") continue ;
    if ($4 == "CYS" && $3 == "SG")
    {
      nCYS++ ;
      rn[nCYS] = $5 ;
      # ssbond is keyed by residue number everywhere else, so initialize
      # it with the same key (not with the running counter):
      ssbond[$5] = 0 ;
      x[nCYS] = $6 ;
      y[nCYS] = $7 ;
      z[nCYS] = $8 ;
    }
  }
  close(pqr_file) ;

  # Test all distinct pairs of SG atoms:
  for (i = 1 ; i <= nCYS-1 ; i++)
  for (j = i+1 ; j <= nCYS ; j++)
  {
    d2 = (x[i]-x[j])^2 + (y[i]-y[j])^2 + (z[i]-z[j])^2 ;
    if (d2 <= maxSGdist2)
    {
      ssbond[rn[i]] = ssbond[rn[j]] = 1 ;
      warning("CYS pair "rn[i]"-"rn[j]" considered bonded.") ;
    }
  }

  # Report the CYS left unpaired:
  for (i = 1 ; i <= nCYS ; i++)
    if (ssbond[rn[i]] == 0) warning("CYS "rn[i]" considered titrable.") ;
}

# Fill the global lookup tables used while scanning the .pqr file:
#   std_aa  = regexp matching names that start with a standard aminoacid
#             three-letter symbol
#   suspect = regexp matching residue names that trigger a warning
#   n[site] = number of tautomers of each site type
#   a[site] = reference atom of each site type
# Depends on the global flag tautomeric.  Calls error() when a site type
# present in n[] has no reference atom in a[].
function make_definitions()
{
  # Standard aminoacid three-letter symbols:
  std_aa = "^(ALA|ARG|ASN|ASP|CYS|GLN|GLU|GLY|HIS|ILE|LEU|LYS|MET|PHE|PRO|SER|THR|TRP|TYR|VAL)" ;

  # "Suspect" residue names (from ffG43a1, ffgmx, etc; more can be added).
  # The first three letters should correspond to a "normal" titrable site.
  # HISA and HISB are suspect only in the tautomeric case.
  suspect = tautomeric \
    ? "^(ARGN|ASPH|CYSH|GLUH|HIS1|HISA|HISB|HISH|LYSH|TYRU)$" \
    : "^(ARGN|ASPH|CYSH|GLUH|HIS1|HISH|LYSH|TYRU)$" ;

  # Number of tautomers per site type.
  # (every site type listed here also needs a reference atom below)
  if (!tautomeric)
  {
    # Single-state case: one state for every site type.
    n["ASP"] = 1 ;
    n["GLU"] = 1 ;
    n["TYR"] = 1 ;
    n["CYS"] = 1 ;
    n["LYS"] = 1 ;
    n["ARG"] = 1 ;
    n["HIS"] = 1 ;
    n["HISA"] = 1 ;
    n["HISB"] = 1 ;
    n["NTR"] = 1 ;
    n["CTR"] = 1 ;
    n["ACE"] = 1 ;
    n["HEM"] = 1 ;
  }
  else
  {
    # One state:
    n["ARG"] = 1 ;
    n["HEM"] = 1 ;
    # Two tautomers:
    n["HIS"] = 2 ;
    n["TYR"] = 2 ;
    n["ACE"] = 2 ;
    # Three tautomers:
    n["NTR"] = 3 ;
    n["CYS"] = 3 ;
    n["LYS"] = 3 ;
    n["SER"] = 3 ;
    n["THR"] = 3 ;
    # Four tautomers:
    n["CTR"] = 4 ;
    n["ASP"] = 4 ;
    n["GLU"] = 4 ;
    # Six tautomers (water, under its several names):
    n["HOH"] = 6 ;
    n["H2O"] = 6 ;
    n["WAT"] = 6 ;
    n["SOL"] = 6 ;
  }

  # Reference atoms (matching on one atom counts each site just once):
  a["NTR"] = "N" ;
  a["CTR"] = "CT" ;
  a["ASP"] = "CG" ;
  a["GLU"] = "CD" ;
  a["TYR"] = "OH" ;
  a["CYS"] = "SG" ;
  a["SER"] = "OG" ;
  a["THR"] = "OG1" ;
  a["ARG"] = "CZ" ;
  a["LYS"] = "NZ" ;
  a["HIS"] = "CE1" ;
  a["HISA"] = "CE1" ;
  a["HISB"] = "CE1" ;
  a["HOH"] = "OW" ;
  a["H2O"] = "OW" ;
  a["WAT"] = "OW" ;
  a["SOL"] = "OW" ;
  a["ACE"] = "C1" ; # This atom name is more or less arbitrary...
  a["HEM"] = "FE" ;

  # Consistency check: every site type must have a reference atom.
  for (st in n)
  {
    if (a[st] == "")
      error("No reference atom was defined for site of type "st".") ;
  }
}

# Return s wrapped in single quotes so that the shell takes it as one
# literal word.  Each embedded single quote is written as '\'' (close the
# quote, escaped quote, reopen the quote).
function shell_quote(s,    parts, count, k, quoted)
{
  count = split(s, parts, "'") ;
  quoted = "'" parts[1] "'" ;
  for (k = 2 ; k <= count ; k++)
    quoted = quoted "\\'" "'" parts[k] "'" ;
  return quoted ;
}

# Stop with an error, through error(), unless file is an existing,
# readable regular file.
#   file = path to check
# The path is quoted before being handed to the shell, so names with
# spaces or shell metacharacters are tested as given instead of being
# split or interpreted.  The extra parameter quoted is a local variable.
function filecheck(file,    quoted)
{
  quoted = shell_quote(file) ;
  if (system("test -f " quoted))
    error("File "file" does not exist.") ;
  if (system("test -r " quoted))
    error("File "file" exists but is not readable.") ;
}

# Write "<cmd>: Warning: <msg>" to stderr (through a "cat 1>&2" pipe) and
# carry on.  The pipe is closed after each message.
function warning(msg)
{
  msg = cmd ": Warning: " msg ;
  print msg | "cat 1>&2" ;
  close("cat 1>&2") ;
}

# Write "<cmd>: Error: <msg>" to stderr (through a "cat 1>&2" pipe), then
# terminate the program with exit status 1.
function error(msg)
{
  msg = cmd ": Error: " msg ;
  print msg | "cat 1>&2" ;
  close("cat 1>&2") ;
  exit 1 ;
}

