Well, after I finished my first OCF agent back in October 2008, we have had it running in production for about ten months. During that time, we found quite a few points where we would like to improve how Linux-HA handles TSM.

  • Shut down TSM nicely if possible (cancel client sessions, cancel running processes and dismount mounted volumes)
  • Better error handling

So, after another week of writing and testing with a small instance, I present the new OCF agent for Tivoli Storage Manager. It still has one or two weak points, but they are negligible. I still need to write the documentation for it, but the script should just work …

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
#!/bin/sh
# Copyright 2009 christian.heim@barfoo.org

. ${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs

# tsm_check [validate]
#
# Validates the resource configuration and derives the instance settings the
# other actions rely on.
#
# Arguments:
#   $1 - "validate" makes configuration errors exit with $OCF_ERR_ARGS; any
#        other value (or none) makes them exit with $OCF_NOT_RUNNING.
# Globals read:
#   OCF_RESKEY_single_instance, OCF_RESKEY_instance_name,
#   OCF_RESKEY_instance_prefix, OCF_RESKEY_cluster_dns,
#   OCF_RESKEY_cluster_address, OCF_RESKEY_cluster_port, EGREP
# Globals written:
#   OCF_RESKEY_dsmserv_dir, OCF_RESKEY_dsmclient_dir, OCF_RESKEY_tsm_retries,
#   OCF_RESKEY_tsm_timeout, OCF_RESKEY_tsm_logdir, OCF_RESKEY_tsm_gloves
#   (defaults / normalisation), single_instance, instance_name,
#   instance_prefix, instance_path
# Returns:
#   $OCF_SUCCESS when the configuration is usable. Every configuration error
#   terminates the script via `exit`.
tsm_check() {
  local exit_val=""

  case "$1" in
    validate) exit_val="$OCF_ERR_ARGS" ;;
    *) exit_val="$OCF_NOT_RUNNING" ;;
  esac

  : "${OCF_RESKEY_dsmserv_dir:=/opt/tivoli/tsm/server/bin}"
  : "${OCF_RESKEY_dsmclient_dir:=/opt/tivoli/tsm/client/ba/bin}"
  : "${OCF_RESKEY_tsm_retries:=5}"
  : "${OCF_RESKEY_tsm_timeout:=5}"
  : "${OCF_RESKEY_tsm_logdir:=/var/log/tsm}"
  : "${OCF_RESKEY_tsm_gloves:=1}"

  # Normalise tsm_gloves to 0/1 before anything below looks at it.
  case "$OCF_RESKEY_tsm_gloves" in
    0|1) ;;
    true) OCF_RESKEY_tsm_gloves=1 ;;
    false) OCF_RESKEY_tsm_gloves=0 ;;
    *)
      ocf_log err "You specified an invalid value for tsm_gloves."
      ocf_log err "tsm_gloves should be either 0 or 1, or not set at all."
      # Reject the value when validating; for every other action fall back to
      # the default so that a typo does not make a running instance look dead.
      if [ "$1" = "validate" ]; then
        exit $exit_val
      fi
      OCF_RESKEY_tsm_gloves=1
      ;;
  esac

  case "$OCF_RESKEY_single_instance" in
    0|false)
      single_instance=0

      if [ -z "$OCF_RESKEY_instance_name" ] ||
        [ -z "$OCF_RESKEY_instance_prefix" ]; then
        ocf_log err "TSM: You didn't specify an instance_name nor a prefix"
        ocf_log err "TSM: yet you specified that this isn't a single instance!"
        ocf_log err "TSM: Please check your configuration."
        exit $exit_val
      fi

      if [ ! -d "$OCF_RESKEY_instance_prefix" ] ||
        [ ! -d "$OCF_RESKEY_instance_prefix/$OCF_RESKEY_instance_name" ]; then
        ocf_log err "TSM: Either the directory specified as instance_prefix"
        ocf_log err "TSM: or as instance_name don't exist!"
        ocf_log err "TSM: Please check your system."
        exit $exit_val
      fi

      instance_name=$OCF_RESKEY_instance_name
      instance_prefix=$OCF_RESKEY_instance_prefix
      instance_path=$OCF_RESKEY_instance_prefix/$OCF_RESKEY_instance_name
      ;;
    1|true)
      single_instance=1
      instance_prefix=/opt/tivoli/tsm/server
      instance_name=tsm
      instance_path=$instance_prefix/$instance_name

      if [ ! -d "$instance_path" ]; then
        ocf_log err "TSM: The directory specified as TSM directory doesn't exist!"
        ocf_log err "TSM: Please check your system."
        exit $exit_val
      fi
      ;;
    *)
      ocf_log err "TSM: You didn't specify single_instance!"
      exit $exit_val
      ;;
  esac

  if [ ! -d "$OCF_RESKEY_dsmserv_dir" ] ||
    [ ! -x "$OCF_RESKEY_dsmserv_dir/dsmserv" ]; then
    ocf_log err "TSM: Either the specified directory $OCF_RESKEY_dsmserv_dir or the"
    ocf_log err "TSM: dsmserv executeable doesn't exist!"
    ocf_log err "TSM: Please check your system."
    exit $exit_val
  fi

  if [ ! -d "$OCF_RESKEY_dsmclient_dir" ] ||
    [ ! -x "$OCF_RESKEY_dsmclient_dir/dsmadmc" ]; then
    ocf_log err "TSM: Either the specified directory $OCF_RESKEY_dsmclient_dir or the"
    ocf_log err "TSM: dsmadmc executeable doesn't exist!"
    ocf_log err "TSM: Please check your system."
    exit $exit_val
  fi

  if [ ! -d "$OCF_RESKEY_tsm_logdir" ]; then
    ocf_log err "TSM: The logging dir specified ($OCF_RESKEY_tsm_logdir) doesn't exist!"
    ocf_log err "TSM: Please check your system."
    exit $exit_val
  fi

  if [ ! -f "$instance_path/dsmserv.opt" ] ||
    [ ! -f "$instance_path/dsmserv.dsk" ]; then
    ocf_log err "TSM: Either $instance_path/dsmserv.opt"
    ocf_log err "TSM: or $instance_path/dsmserv.dsk don't exist!"
    ocf_log err "TSM: Please check your configuration."
    exit $exit_val
  fi

  # We need to test the dsm.opt/dsm.sys for correct information, since it's
  # needed for the graceful shutdown. The check only applies when graceful
  # shutdown is enabled and both client option files are present.
  if [ "$OCF_RESKEY_tsm_gloves" -eq 1 ] &&
    [ -f "$OCF_RESKEY_dsmclient_dir/dsm.sys" ] &&
    [ -f "$OCF_RESKEY_dsmclient_dir/dsm.opt" ]; then

    if [ -z "$OCF_RESKEY_cluster_dns" ] ||
      [ -z "$OCF_RESKEY_cluster_address" ] ||
      [ -z "$OCF_RESKEY_cluster_port" ]; then
      ocf_log err "TSM: You are missing a configuration value:"
      ocf_log err "TSM: either cluster_dns, cluster_address or cluster_port isn't set!"
      ocf_log err "TSM: Please recheck your configuration!"
      exit $exit_val
    fi

    # Match the keyword case-insensitively and tolerate leading whitespace, so
    # differently-cased or indented ServerName lines in dsm.sys are accepted.
    if ! $EGREP -qi "^[[:space:]]*ServerName.*${OCF_RESKEY_cluster_dns}" \
      "$OCF_RESKEY_dsmclient_dir/dsm.sys"; then
      ocf_log err "TSM: You are lacking the proper entry for this TSM server."
      ocf_log err "TSM: You need to check your $OCF_RESKEY_dsmclient_dir/dsm.sys"
      ocf_log err "TSM: and set it up properly!"
      ocf_log info "TSM: If in doubt, copy & paste this server stanza:"
      ocf_log info "ServerName       $OCF_RESKEY_cluster_dns"
      ocf_log info "TCPServerAddress $OCF_RESKEY_cluster_address"
      ocf_log info "TCPPORT          $OCF_RESKEY_cluster_port"
      ocf_log info "CommMethod       TCPIP"
      exit $exit_val
    fi
  fi

  return $OCF_SUCCESS
}

# tsm_pid
#
# Determines whether the selected TSM instance is running, based on the pid
# stored as the fourth field of $instance_path/dsmserv.lock.
#
# Globals read:    instance_path, AWK
# Globals written: OCF_RETURNVAL_PID (exported), OCF_TSM_PID (exported when
#                  running, unset otherwise), pid
# Returns:
#   $OCF_SUCCESS      lock file present and the recorded process is alive
#   $OCF_ERR_GENERIC  lock file present but stale (no usable pid, or the
#                     recorded process does not answer to signal 0)
#   $OCF_NOT_RUNNING  no lock file
tsm_pid() {
  local lockfile="$instance_path/dsmserv.lock"
  local alive

  if [ ! -f "$lockfile" ]; then
    # Process is not running
    export OCF_RETURNVAL_PID=$OCF_NOT_RUNNING
    unset OCF_TSM_PID
    return $OCF_NOT_RUNNING
  fi

  # `pid` is deliberately left global: tsm_stop refers to it in a log message.
  # Plain awk syntax is used because `--source` is a gawk-only option.
  pid="$( $AWK '{ print $4 }' "$lockfile" 2>/dev/null )"

  case "$pid" in
    ''|*[!0-9]*)
      # Empty or non-numeric pid: the lock file cannot belong to a live server.
      alive=1
      ;;
    *)
      # POSIX redirection; `&>` would background the command under /bin/sh.
      kill -0 "$pid" >/dev/null 2>&1
      alive=$?
      ;;
  esac

  if [ "$alive" -eq 0 ]; then
    # Process is up and running
    export OCF_RETURNVAL_PID=$OCF_SUCCESS
    export OCF_TSM_PID=$pid
    return $OCF_SUCCESS
  fi

  # Stale pid-file detected
  export OCF_RETURNVAL_PID=$OCF_ERR_GENERIC
  unset OCF_TSM_PID
  return $OCF_ERR_GENERIC
}

# tsm_monitor
#
# Runs the configuration check and then the pid check. The result of the
# configuration check is not inspected here; the function's return status is
# whatever tsm_pid returned.
tsm_monitor() {
  local pid_state

  tsm_check
  tsm_pid
  pid_state=$?

  return $pid_state
}

# tsm_start
#
# Starts the dsmserv process for the configured instance unless it is already
# running, then waits for the start to be confirmed.
#
# Globals read:
#   OCF_RESKEY_dsmserv_dir, OCF_RESKEY_tsm_logdir, OCF_RESKEY_tsm_retries,
#   OCF_RESKEY_tsm_timeout, instance_path, instance_name
#   (the instance_* values and defaults are set by tsm_check via tsm_monitor)
# Returns:
#   $OCF_SUCCESS      the instance was already running, or dsmserv was launched
#                     and is either reported running by tsm_pid or still alive
#                     once the polling window is over
#   $OCF_ERR_GENERIC  the instance directory is not accessible, or dsmserv
#                     exited without leaving a running server behind
tsm_start() {
  local dsmserv_pid rc tries

  unset OCF_RETURNVAL_PID
  unset OCF_TSM_PID
  tsm_monitor

  # Already up: nothing to do. Any other state (no lock file, or a stale one)
  # means there is no server process, so a start is attempted.
  if [ "$OCF_RETURNVAL_PID" = "$OCF_SUCCESS" ]; then
    return $OCF_SUCCESS
  fi

  : "${OCF_RESKEY_tsm_retries:=5}"
  : "${OCF_RESKEY_tsm_timeout:=5}"

  # Prepping the environment
  export DSMSERV_DIR="$OCF_RESKEY_dsmserv_dir"
  export DSMSERV_CONFIG="$instance_path/dsmserv.opt"

  # dsmserv is started from the directory that holds its option file.
  if ! cd "${DSMSERV_CONFIG%/*}"; then
    ocf_log err "TSM: Cannot change into instance directory ${DSMSERV_CONFIG%/*}"
    unset DSMSERV_CONFIG DSMSERV_DIR
    return $OCF_ERR_GENERIC
  fi

  "$DSMSERV_DIR/dsmserv" >> "${OCF_RESKEY_tsm_logdir}/$instance_name.log" 2>&1 &
  dsmserv_pid=$!

  unset DSMSERV_CONFIG DSMSERV_DIR

  # `$?` after `&` says nothing about dsmserv itself, so poll instead: either
  # tsm_pid confirms a running server, or the launched process turns out to
  # have exited with an error.
  tries=0
  while [ "$tries" -lt "$OCF_RESKEY_tsm_retries" ]; do
    tsm_pid
    if [ "$OCF_RETURNVAL_PID" = "$OCF_SUCCESS" ]; then
      ocf_log info "TSM: Started instance $instance_name."
      return $OCF_SUCCESS
    fi

    if ! kill -0 "$dsmserv_pid" 2>/dev/null; then
      # The launched process is gone; collect its exit status.
      wait "$dsmserv_pid"
      rc=$?
      if [ "$rc" -ne 0 ]; then
        ocf_log err "dsmserv failed to start up correctly and returned $rc"
        return $OCF_ERR_GENERIC
      fi
    fi

    sleep "$OCF_RESKEY_tsm_timeout"
    tries=$((tries + 1))
  done

  # Not confirmed through the lock file within the polling window. A process
  # that is still alive is given the benefit of the doubt, which matches the
  # previous behaviour of reporting success right after launching it.
  if kill -0 "$dsmserv_pid" 2>/dev/null; then
    ocf_log info "TSM: Started instance $instance_name."
    return $OCF_SUCCESS
  fi

  ocf_log err "TSM: dsmserv for instance $instance_name exited without leaving a running server."
  return $OCF_ERR_GENERIC
}

# tsm_dsmadmc ARGS...
#
# Runs one administrative command through dsmadmc, using the configured
# administrator id/password and the cluster_dns server stanza.
# NOTE(review): as before, the password is passed on the command line and is
# therefore visible in the process list while dsmadmc runs.
tsm_dsmadmc() {
  "$OCF_RESKEY_dsmclient_dir/dsmadmc" -noconfirm -displaymode=list \
    "-id=$OCF_RESKEY_tsm_user" "-password=$OCF_RESKEY_tsm_password" \
    "-server=$OCF_RESKEY_cluster_dns" "$@"
}

# tsm_wait_for_exit PID
#
# Polls until process PID no longer answers to signal 0. Checks at most
# tsm_retries + 1 times, sleeping tsm_timeout seconds between checks.
# Returns 0 once the process is gone, 1 if it is still there at the end.
tsm_wait_for_exit() {
  local tries=0

  while kill -0 "$1" 2>/dev/null; do
    if [ "$tries" -ge "$OCF_RESKEY_tsm_retries" ]; then
      return 1
    fi
    sleep "$OCF_RESKEY_tsm_timeout"
    tries=$((tries + 1))
  done

  return 0
}

# tsm_stop
#
# Stops the TSM instance. There are two ways to do that:
#  o Gracefully shutting it down by cancelling running processes and sessions,
#    dismounting volumes and issuing 'halt' through dsmadmc
#  o Signalling the process (SIGTERM first, SIGKILL as the last resort)
#
# The graceful way is tried first when tsm_gloves is not 0 and tsm_user,
# tsm_password, cluster_address, cluster_port and cluster_dns are all set. If
# it is not available or does not bring the server down, the signals are used.
#
# Returns:
#   $OCF_SUCCESS      the instance was not running, or it has exited
#   $OCF_ERR_GENERIC  the process could not be identified or survived SIGKILL
tsm_stop() {
  local server_pid logfile i idle item
  local process_list session_list mount_list

  unset OCF_RETURNVAL_PID
  unset OCF_TSM_PID
  tsm_monitor

  # Nothing to stop unless a live server process was found.
  if [ "$OCF_RETURNVAL_PID" != "$OCF_SUCCESS" ]; then
    return $OCF_SUCCESS
  fi

  : "${OCF_RESKEY_tsm_retries:=5}"
  : "${OCF_RESKEY_tsm_timeout:=5}"

  # Remember the pid: later status checks unset OCF_TSM_PID once it is gone.
  server_pid=$OCF_TSM_PID
  if [ -z "$server_pid" ]; then
    ocf_log err "TSM: Instance $instance_name is reported running, but its pid is unknown."
    return $OCF_ERR_GENERIC
  fi

  if [ "$OCF_RESKEY_tsm_gloves" != "0" ] &&
    [ -n "$OCF_RESKEY_tsm_user" ] && [ -n "$OCF_RESKEY_tsm_password" ] &&
    [ -n "$OCF_RESKEY_cluster_address" ] && [ -n "$OCF_RESKEY_cluster_port" ] &&
    [ -n "$OCF_RESKEY_cluster_dns" ]; then

    logfile="${OCF_RESKEY_tsm_logdir}/dsmadmc.log"

    # dsmadmc is kinda limited, since it only write the logfile to the current PWD
    cd "$OCF_RESKEY_tsm_logdir" ||
      ocf_log warn "TSM: Cannot change into $OCF_RESKEY_tsm_logdir"

    date >> "$logfile" 2>/dev/null

    ocf_log info "TSM: Trying soft shutdown."
    i=1
    while [ "$i" -le "$OCF_RESKEY_tsm_retries" ]; do
      # Stays 1 only if this pass found nothing to cancel or dismount.
      idle=1

      process_list="$( tsm_dsmadmc query process | $EGREP 'Process Number: .*' | $AWK -F ': ' '{ print $2 }' )"
      ocf_log debug "TSM(tsm_stop): ($i) Process list during shutdown: $process_list"
      for item in $process_list; do
        idle=0
        ocf_log debug "TSM(tsm_stop): ($i) Cancelling TSM process $item"
        tsm_dsmadmc cancel process "$item" >> "$logfile" 2>&1
      done

      # NOTE(review): the administrative session issuing this query presumably
      # shows up in its own output, which would keep this list non-empty on
      # every pass - confirm whether it needs to be filtered out. The loop is
      # bounded by tsm_retries either way.
      session_list="$( tsm_dsmadmc query sessions | $EGREP 'Sess Number: .*' | $AWK -F ': ' '{ print $2 }' | sed "s/,//" )"
      ocf_log debug "TSM(tsm_stop: ($i) Session list during shutdown: $session_list"
      for item in $session_list; do
        idle=0
        ocf_log debug "TSM(tsm_stop): ($i) Cancelling TSM session $item"
        tsm_dsmadmc cancel session "$item" >> "$logfile" 2>&1
      done

      mount_list="$( tsm_dsmadmc query mount | $EGREP 'LTO volume .* is mounted' | $AWK -F ' ' '{ print $4 }' )"
      ocf_log debug "TSM(tsm_stop): ($i) Mount list during shutdown: $mount_list"
      for item in $mount_list; do
        idle=0
        ocf_log debug "TSM(tsm_stop): ($i) Cancelling TSM mount $item"
        tsm_dsmadmc dismount volume "$item" >> "$logfile" 2>&1
      done

      if [ "$idle" -eq 1 ]; then
        ocf_log debug "TSM(tsm_stop): Skipping the remaining $((OCF_RESKEY_tsm_retries - i)) tries, no activity in instance $instance_name (pid: $server_pid)"
        break
      fi

      i=$((i + 1))
    done

    ocf_log info "TSM: Halting instance $instance_name (pid: $server_pid)"
    # The full command line is not logged: it contains the admin password.
    ocf_log info "TSM: issuing dsmadmc halt"
    tsm_dsmadmc halt >> "$logfile" 2>&1

    # Returns as soon as the process is gone, which saves waiting out the
    # whole tsm_retries * tsm_timeout window.
    if tsm_wait_for_exit "$server_pid"; then
      ocf_log info "TSM: Graceful shutdown for instance $instance_name (pid: $server_pid) completed."
      return $OCF_SUCCESS
    fi

    ocf_log info "TSM(tsm_stop): Graceful shutdown for instance $instance_name (pid: $server_pid) failed, thus continuing with not-so-graceful shutdown!"
  fi

  # Reached when graceful shutdown is disabled, not configured, or has failed.
  ocf_log info "TSM: Trying not-so-graceful shutdown."
  ocf_log debug "TSM(tsm_stop): issuing SIGTERM to instance $instance_name (pid: $server_pid)"
  kill -TERM "$server_pid" 2>/dev/null

  # A successful kill only means the signal was delivered, so wait for the
  # process to actually go away before deciding whether to escalate.
  if ! tsm_wait_for_exit "$server_pid"; then
    ocf_log info "TSM: Instance $instance_name (pid: $server_pid) failed to shutdown with SIGTERM."
    ocf_log debug "TSM(tsm_stop): issuing SIGKILL to instance $instance_name (pid: $server_pid)"
    kill -KILL "$server_pid" 2>/dev/null

    if ! tsm_wait_for_exit "$server_pid"; then
      ocf_log err "TSM: Instance $instance_name (pid: $server_pid) failed to shutdown with SIGKILL."
      ocf_log err "TSM: There's nothing we can do, so die gracefully."
      ocf_log err "TSM: User interaction is required!"
      return $OCF_ERR_GENERIC
    fi
  fi

  ocf_log info "TSM: Successfully halted instance $instance_name (pid: $server_pid)"
  return $OCF_SUCCESS
}

# tsm_metadata
#
# Prints the OCF resource-agent metadata (XML) on stdout and returns
# $OCF_SUCCESS. Every OCF_RESKEY_* parameter the agent reads is declared here,
# together with the defaults applied by tsm_check.
tsm_metadata() {
  cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="TSM">
<version>1.0</version>

<longdesc lang="en">
This script manages a single/or multiple instances of Tivoli Storage Manager.

Please be aware, that in order to run your Tivoli Storage Manager server via
Heartbeat, you need to prepare each instance according to the Storage Manager
Installation handbook.
</longdesc>

<shortdesc lang="en">OCF Resource Agent compliant TSM script.</shortdesc>

<parameters>

<parameter name="single_instance" required="1" unique="0">
<longdesc lang="en">
Is your setup a single instance, or are you running multiple instances
</longdesc>
<shortdesc lang="en">Toggles changes for single/multiple instances</shortdesc>
<content type="boolean" />
</parameter>

<parameter name="instance_prefix" required="0" unique="0">
<longdesc lang="en">
Directory that contains the instance directories. Needed when single_instance
is false; a single instance always uses /opt/tivoli/tsm/server.
</longdesc>
<shortdesc lang="en">Parent directory of the instance</shortdesc>
<content type="string" />
</parameter>

<parameter name="instance_name" required="0" unique="0">
<longdesc lang="en">
Name of the instance directory below instance_prefix. It has to contain
dsmserv.opt and dsmserv.dsk. Needed when single_instance is false; a single
instance always uses the name tsm.
</longdesc>
<shortdesc lang="en">Name of the instance</shortdesc>
<content type="string" />
</parameter>

<parameter name="dsmserv_dir" required="0" unique="0">
<longdesc lang="en">
Directory that contains the dsmserv executable.
</longdesc>
<shortdesc lang="en">Location of dsmserv</shortdesc>
<content type="string" default="/opt/tivoli/tsm/server/bin" />
</parameter>

<parameter name="dsmclient_dir" required="0" unique="0">
<longdesc lang="en">
Directory that contains the dsmadmc executable and the dsm.sys and dsm.opt
client option files.
</longdesc>
<shortdesc lang="en">Location of dsmadmc</shortdesc>
<content type="string" default="/opt/tivoli/tsm/client/ba/bin" />
</parameter>

<parameter name="tsm_logdir" required="0" unique="0">
<longdesc lang="en">
Existing directory that receives the output of dsmserv (one log file per
instance name) and the log of the dsmadmc commands issued during shutdown.
</longdesc>
<shortdesc lang="en">Log directory</shortdesc>
<content type="string" default="/var/log/tsm" />
</parameter>

<parameter name="tsm_gloves" required="0" unique="0">
<longdesc lang="en">
Whether to try a graceful shutdown through dsmadmc (cancel processes and
sessions, dismount volumes, halt) before signalling the server process.
Accepts 0, 1, false or true.
</longdesc>
<shortdesc lang="en">Try a graceful shutdown first</shortdesc>
<content type="boolean" default="1" />
</parameter>

<parameter name="tsm_retries" required="0" unique="0">
<longdesc lang="en">
Number of attempts made while cancelling server activity and while waiting
for the server to shut down.
</longdesc>
<shortdesc lang="en">Number of retries during shutdown</shortdesc>
<content type="integer" default="5" />
</parameter>

<parameter name="tsm_timeout" required="0" unique="0">
<longdesc lang="en">
Number of seconds to sleep between two checks while waiting for the server to
shut down.
</longdesc>
<shortdesc lang="en">Seconds between shutdown checks</shortdesc>
<content type="integer" default="5" />
</parameter>

<parameter name="tsm_user" required="0" unique="0">
<longdesc lang="en">
Administrator id handed to dsmadmc for the graceful shutdown.
</longdesc>
<shortdesc lang="en">TSM administrator id</shortdesc>
<content type="string" />
</parameter>

<parameter name="tsm_password" required="0" unique="0">
<longdesc lang="en">
Password handed to dsmadmc for the graceful shutdown.
</longdesc>
<shortdesc lang="en">TSM administrator password</shortdesc>
<content type="string" />
</parameter>

<parameter name="cluster_dns" required="0" unique="0">
<longdesc lang="en">
Server name of this instance. It has to appear in a ServerName line of dsm.sys
and is handed to dsmadmc as the server to connect to.
</longdesc>
<shortdesc lang="en">Server name used in dsm.sys</shortdesc>
<content type="string" />
</parameter>

<parameter name="cluster_address" required="0" unique="0">
<longdesc lang="en">
Address of this instance, i.e. the TCPServerAddress value of its dsm.sys
stanza. Has to be set for the graceful shutdown.
</longdesc>
<shortdesc lang="en">TCP address of the instance</shortdesc>
<content type="string" />
</parameter>

<parameter name="cluster_port" required="0" unique="0">
<longdesc lang="en">
Port of this instance, i.e. the TCPPORT value of its dsm.sys stanza. Has to be
set for the graceful shutdown.
</longdesc>
<shortdesc lang="en">TCP port of the instance</shortdesc>
<content type="string" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="90s" />
<action name="stop" timeout="100s" />
<action name="monitor" depth="10" timeout="30s" interval="60s" start-delay="300s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s" />
<action name="status" timeout="30s" />
</actions>

</resource-agent>
END
return $OCF_SUCCESS
}

# Dispatch the requested OCF action (first command-line argument) to its
# handler. `status` is advertised in the metadata and is served by the monitor
# handler; every action without a handler is reported as unimplemented.
case "$1" in
  start)          tsm_start ;;
  stop)           tsm_stop ;;
  monitor|status) tsm_monitor ;;
  meta-data)      tsm_metadata ;;
  validate-all)   tsm_check validate ;;
  *)              exit $OCF_ERR_UNIMPLEMENTED ;;
esac

# Hand the handler's return code back to the caller explicitly.
exit $?

# vim: set tabstop=2 shiftwidth=2 softtabstop=2 expandtab :