#!/usr/bin/gawk -f
###########################################################################
# This file is part of ST-CpHMD, version v4.1_GMX4.07.
# 
# Copyright (c) 2005-2020, Instituto de Tecnologia Quimica e Biologica,
# Universidade Nova de Lisboa, Portugal.
# 
# ST-CpHMD is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 2 of the License, or (at your
# option) any later version.
# 
# ST-CpHMD is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with ST-CpHMD.  If not, see <http://www.gnu.org/licenses/>.
# 
# For further details and info check the manual.pdf file.
# 
# You can get ST-CpHMD at www.itqb.unl.pt/simulation
###########################################################################


############################################################################
# statepdb: a program to change the states of a .pdb trajectory file.
#
# This program reads a .pdb file with a structural trajectory (i.e.,
# with MODEL-separated structures) and writes to stdout a new .pdb
# file in which "absent" protons were either removed or assigned a zero
# occupancy.  The .occ file has the protonation states for the
# trajectory, with the Nth line containing the protonation vector to
# be assigned to the Nth MODEL in the .pdb file.  The program needs
# also a .sites file and the corresponding .st files.
#
# PAIRING STRUCTURES AND PROTONATIONS: Although protonation states and
# structures are assumed to be paired one-to-one, that pairing must be
# decided and done beforehand by the user.  As an example, consider a
# CpHMD simulation in which structures had been originally written
# every 1 ps, while protonation states had been written only every 2
# ps; so, the Nth protonation state in the original .occ file was the
# one used when running the MD segment that generated the (2N-1)th
# structure (written in the middle of that segment) and the 2Nth
# structure (written at the end of that segment).  Several pairing
# methods can be used, assigning the Nth protonation state to: (a) the
# "middle" structure of the Nth MD segment (Nth prot => (2N-1)th
# struct); (b) the "last" structure of the Nth MD segment (Nth prot =>
# 2Nth struct); (c) the "last" structure of the (N-1)th MD segment
# (Nth prot => (2N-2)th struct), from which the Nth protonation state
# was generated through PB/MC.  All alternatives are in principle
# physically reasonable in terms of the protein conformation, and the
# user is responsible for choosing one and providing the corresponding
# .pdb and .occ files.  However, the last alternative is not very
# realistic if water configuration is also intended, because it would
# assign a protonation state to a structure whose solvent is not yet
# relaxed (a solution would be to save also that protein conformation
# _after_ the solvent relaxation, but that would imply changing the
# main CpHMD script).
#
# This script was simply adapted from statepqr, so the strategy used
# is perhaps not optimal for what is intended; also, some now
# unnecessary variables may still be present.  The two programs could
# in principle be merged.
#
# Note that:
#
# - The numbering of states in the .occ file is assumed to start at 0,
#   as in PETIT.
#
# - The set of states for each site is assumed to consist of several
#   neutral states and a _final_ charged state (as presently used for
#   tautomers).
#
# - The criterion for an "absent" H atom is simply to have
#   |charge|<0.001 (see code).  This may lead to the undesired
#   exclusion of H atoms that normally have zero charge; but these do
#   not exist in the charge sets distributed with meadTools.
#
# - The treatment of non-titrable sites is not entirely satisfactory;
#   see below the comment on the definition of variable
#   "non_titrable".  Anyway, non-titrable sites are usually not
#   included in CpHMD, for which this program is mostly intended.
#
# Author: Antonio M. Baptista, baptista@itqb.unl.pt, 2017-06-11
# Last changed: AMB, 2017-06-11
#
############################################################################


BEGIN{

  # Whole program runs inside BEGIN: all files are read with explicit
  # getline redirections, so no main-input rules are used.
  #
  # Command line: r|k PDB_TRAJ OCC_TRAJ SITES_FILE [ST_DIR]
  # (ARGC also counts the program name, hence 5 or 6).
  if (ARGC != 5 && ARGC != 6) message("U", "Wrong number of arguments.") ;

  if (ARGV[1] ~ /^(r|k)$/) no_absent_H = (ARGV[1]=="r" ? 1 : 0) ;
  else message("U", "Wrong option for \"absent\" protons.") ;
  filecheck(pdb_file = ARGV[2]) ;
  filecheck(occ_file = ARGV[3]) ;
  filecheck(sites_file = ARGV[4]) ;
  if (ARGC == 6) st_dir = ARGV[5] ;
  else st_dir = "./" ;

  # This is used below to decide if a site is really titrable or not. The
  # present solution is not very good, because in the future we may want to
  # make some of the following sites titrable. This could in principle be
  # decided from the pKmod line of the .st files...
  # NOTE: this must be a string (dynamic regexp).  A bare /.../ constant on
  # the right-hand side of an assignment is evaluated as ($0 ~ /.../), so
  # it would store 0 or 1 instead of the pattern.
  non_titrable = "^(SER|THR|HOH|H2O)" ;

  # Read sites file.  Each line is: residue_number state_name [state_name...]
  nsites = 0 ;
  nNTR = nCTR = 0 ;
  while ((getline < sites_file) > 0)
  {
    # Blank lines carry no site; a line with only a residue number has no
    # state from which charges could be read.
    if (NF == 0) continue ;
    if (NF < 2) message("E", "Site " $1 " has no states in " sites_file) ;
    # The site designator s is attributed either "Ntr", "Ctr", or the
    # residue number.
    if ($2 ~ /^NT/)
    {
      Ntr[$1] = ++nNTR ;
      s = "Ntr" nNTR ;
    }
    else if ($2 ~ /^CT/)
    {
      Ctr[$1] = ++nCTR ;
      s = "Ctr" nCTR ;
    }
    else
    {
      s = $1 ;
    }
    # The site index starts at 1:
    nsites++ ;
    site[nsites] = s ;
    ntauts[s] = NF - 1 ;
    # Titrable sites get one extra (final) state, the charged one:
    if ($2 ~ non_titrable) nstates[s] = ntauts[s] ;
    else nstates[s] = ntauts[s] + 1 ;
    for (t = 1 ; t <= ntauts[s] ; t++) st[s,t] = $(t+1) ;
  }
  close(sites_file) ;

  # Run over all states and compute proper charges from st files.
  # In each st file the first line is skipped, and the remaining lines
  # are read as: $2 = atom name, $3 and $4 = the two charge columns.
  for (s in ntauts)
  {
    for (t = 1 ; t <= ntauts[s] ; t++)
    {
      filecheck(st_file = st_dir "/" st[s,t] ".st") ;
      # First read to decide if site is anionic:
      getline < st_file ;       # read pKint value
      tot_p = tot_d = 0 ;
      while ((getline < st_file) > 0)
      {
        tot_p += $3 ;
        tot_d += $4 ;
      }
      # Anionic if the total of column 3 is smaller in magnitude than
      # the total of column 4:
      anionic[s] = (tot_p^2 < tot_d^2) ;
      close(st_file) ;
      # Second read: the tautomer (neutral) state takes column 3 for an
      # anionic site and column 4 otherwise.
      getline < st_file ;
      while ((getline < st_file) > 0)
        crg[s,t,$2] = anionic[s] * $3 + (1-anionic[s]) * $4 ;
      close(st_file) ;
    }
    # Any st file can be used for the charged state; the last is used.
    # The columns are swapped relative to the tautomer states.
    if (nstates[s] == ntauts[s] + 1)
    {
      getline < st_file ;
      while ((getline < st_file) > 0)
        crg[s,nstates[s],$2] = anionic[s] * $4 + (1-anionic[s]) * $3 ;
      close(st_file) ;
    }
  }

  # Read the protonation states, one line (vector) per MODEL:
  nmod_occ = 0 ;
  while ((getline < occ_file) > 0)
  {
    if (NF != nsites) message("E", "Wrong number of sites in "occ_file) ;
    # A state outside 0..nstates-1 would silently leave every state
    # fraction of the site at zero, so reject it here.
    for (i = 1 ; i <= nsites ; i++)
    {
      x = $i + 0 ;
      if (x != int(x) || x < 0 || x >= nstates[site[i]])
        message("E", "Invalid state " $i " for site " i " in line " (nmod_occ+1) " of " occ_file) ;
    }
    occ[++nmod_occ] = $0 ;
  }
  close(occ_file) ;

  # Read pdb file to count MODELs:
  nmod_pdb = 0 ;
  while ((getline < pdb_file) > 0)
    if ($0 ~ /^MODEL/) nmod_pdb++;
  close(pdb_file) ;
  if (nmod_pdb != nmod_occ)
    message("E", "Different #MODELs: "nmod_occ" in OCC, "nmod_pdb" in PDB.") ;

  # Read pdb file, change occupancies and write again:
  m = 0 ;
  while ((getline < pdb_file) > 0)
  {
    # Get info on states for this MODEL:
    if ($0 ~ /^MODEL/)
    {
      m++ ;     # better than using $2, to ensure matching with occ_file
      split(occ[m], xvector) ;
      for (i = 1 ; i <= nsites ; i++)
      {
        s = site[i] ;
        for (x = 1 ; x <= nstates[s] ; x++) frac[s,x] = 0 ;
        # The "+1" is needed because PETIT states start at 0:
        frac[s,xvector[i]+1] = 1 ;
      }
    }
    # Keep unchanged all lines not starting with ATOM:
    if ($0 !~ /^ATOM/)
    {
      print $0 ;
      continue ;
    }
    # Atom name (columns 13-16); a leading digit is moved to the end
    # (e.g. 1HD2 -> HD21).  Residue number is taken from columns 23-26.
    at = substr($0,13,4) ; gsub(/ /,"",at) ;
    if (at ~ /^[0-9]/) at = substr(at,2) substr(at,1,1) ;
    res = substr($0,23,4) ; gsub(/ /,"",res) ;
    # Find the site owning this atom.  A residue may hold more than one
    # site (Ntr, Ctr and the residue itself), so each candidate is tried
    # in turn and accepted only if a charge was assigned to this atom
    # (eg, for state 1).  The "in" operator is used so that the test does
    # not create array elements.
    s = "" ;
    if (res in Ntr)
    {
      cand = "Ntr" Ntr[res] ;
      if ((cand,1,at) in crg) s = cand ;
    }
    if (s == "" && (res in Ctr))
    {
      cand = "Ctr" Ctr[res] ;
      if ((cand,1,at) in crg) s = cand ;
    }
    if (s == "" && ((res,1,at) in crg)) s = res ;
    # Atoms not belonging to any site are printed immediately:
    if (s == "")
    {
      print $0 ;
      continue ;
    }
    # Stuff for actual sites: charge of this atom in the current state.
    c = 0 ;
    for (x = 1 ; x <= nstates[s] ; x++) c += frac[s,x] * crg[s,x,at] ;
    # Detect "absent" H atoms (may need to be adjusted):
    absent = (at ~ /^H/ && c < 0.001 && c > -0.001) ;
    # If that is intended, discard "absent" H atoms:
    if (no_absent_H && absent) continue ;
    # Rewrite the occupancy field (columns 55-60): 0 if absent, else 1.
    printf("%s%6.2f%s\n", substr($0,1,54), 1-absent, substr($0,61)) ;
  }

  exit 0 ;

}


# filecheck: abort with an error message unless "file" passes the shell
# tests "test -f" and "test -r".
#   file   - path to check.
#   quoted - local variable (not an argument): shell-quoted copy of file.
# Returns nothing; on failure message("E", ...) is called, which exits.
function filecheck(file,
		   quoted)
{
  # Single-quote the name for the shell, so that spaces or shell
  # metacharacters in it are taken literally and an empty name is still
  # passed as an argument.  An embedded ' is written as '\''.
  quoted = file ;
  gsub(/'/, "'\\''", quoted) ;
  quoted = "'" quoted "'" ;
  if (system("test -f " quoted))
    message("E", "File "file" does not exist.") ;
  if (system("test -r " quoted))
    message("E", "File "file" exists but is not readable.") ;
}


# message: write a diagnostic to standard error and, unless it is only
# a warning, terminate the program with exit status 1.
#   type - "W" (warning), "E" (error) or "U" (usage error; the usage
#          text is appended to the message).
#   msg  - text to report.
#   prog, help, text - local variables (not arguments).
function message(type, msg,
		 prog, help, text)
{
  prog = "statepdb" ;

  # Any other type is itself reported as an error (and so exits):
  if (type !~ /^[WEU]$/) message("E", "Wrong use of error function.") ;

  text = prog ": " msg ;
  if (type == "U")
  {
    help = "Usage: " prog " r|k PDB_TRAJ OCC_TRAJ SITES_FILE [ST_DIR]\n" ;
    help = help "r|k = remove (r) or keep (k) \"absent\" protons.\n" ;
    help = help "Default ST_DIR is the current directory.\n" ;
    help = help "The new pdb trajectory is written to standard output.\n" ;
    help = help "See the script header for further details." ;
    text = text "\n" help ;
  }
  print text > "/dev/stderr" ;

  # Warnings return to the caller; everything else stops here.
  if (type == "W") return ;
  exit 1 ;
}

