Well, after I finished my first OCF agent back in October 2008, we have now had it running in production for about ten months. During that time, we found quite a few points where we’d like to improve the way Linux-HA handles TSM.
- Shut down TSM nicely if possible (cancel client sessions, cancel running processes, and dismount mounted volumes)
- Better error handling
So, after another week of writing and testing with a small instance, I present the new OCF agent for Tivoli Storage Manager. It still has one or two weak points, but they are negligible. I still need to write the documentation for it, but the script should just work …
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
#!/bin/sh
# Copyright 2009 christian.heim@barfoo.org
#
# OCF resource agent for Tivoli Storage Manager (TSM) server instances.
#
# The helper library sourced below is expected to provide everything this
# script uses without defining it itself: ocf_log, the $TEST/$EGREP/$AWK tool
# variables and the OCF_* return codes (assumption - verify against the
# installed .ocf-shellfuncs).
. "${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs"
#######################################
# Validate the resource configuration and derive the instance settings.
#
# Arguments:
#   $1 - "validate" makes every failed check exit with $OCF_ERR_ARGS; any
#        other value (or none) makes it exit with $OCF_NOT_RUNNING.
# Globals read:
#   OCF_RESKEY_single_instance, OCF_RESKEY_instance_name,
#   OCF_RESKEY_instance_prefix, OCF_RESKEY_cluster_dns,
#   OCF_RESKEY_cluster_address, OCF_RESKEY_cluster_port
# Globals written:
#   single_instance, instance_name, instance_prefix, instance_path, and the
#   defaults of OCF_RESKEY_dsmserv_dir, OCF_RESKEY_dsmclient_dir,
#   OCF_RESKEY_tsm_retries, OCF_RESKEY_tsm_timeout, OCF_RESKEY_tsm_logdir and
#   OCF_RESKEY_tsm_gloves (the latter normalised to 0 or 1).
# Returns:
#   $OCF_SUCCESS when every check passes; otherwise the script exits.
#######################################
tsm_check() {
  local exit_val=""
  local dsm_sys

  case "$1" in
    validate) exit_val="$OCF_ERR_ARGS" ;;
    *) exit_val="$OCF_NOT_RUNNING" ;;
  esac

  : "${OCF_RESKEY_dsmserv_dir:=/opt/tivoli/tsm/server/bin}"
  : "${OCF_RESKEY_dsmclient_dir:=/opt/tivoli/tsm/client/ba/bin}"
  : "${OCF_RESKEY_tsm_retries:=5}"
  : "${OCF_RESKEY_tsm_timeout:=5}"
  : "${OCF_RESKEY_tsm_logdir:=/var/log/tsm}"
  : "${OCF_RESKEY_tsm_gloves:=1}"

  # Normalise tsm_gloves first: the dsm.sys check further down depends on its
  # numeric value, and an invalid value is a configuration error.
  case "$OCF_RESKEY_tsm_gloves" in
    0|1) ;;
    true) OCF_RESKEY_tsm_gloves=1 ;;
    false) OCF_RESKEY_tsm_gloves=0 ;;
    *)
      ocf_log err "You specified an invalid value for tsm_gloves."
      ocf_log err "tsm_gloves should be either 0 or 1, or not set at all."
      exit "$exit_val"
      ;;
  esac

  case "$OCF_RESKEY_single_instance" in
    0|false)
      single_instance=0
      if $TEST -z "$OCF_RESKEY_instance_name" \
        || $TEST -z "$OCF_RESKEY_instance_prefix" ; then
        ocf_log err "TSM: You didn't specify an instance_name nor a prefix"
        ocf_log err "TSM: yet you specified that this isn't a single instance!"
        ocf_log err "TSM: Please check your configuration."
        exit "$exit_val"
      fi
      if $TEST ! -d "$OCF_RESKEY_instance_prefix" \
        || $TEST ! -d "$OCF_RESKEY_instance_prefix/$OCF_RESKEY_instance_name" ; then
        ocf_log err "TSM: Either the directory specified as instance_prefix"
        ocf_log err "TSM: or as instance_name don't exist!"
        ocf_log err "TSM: Please check your system."
        exit "$exit_val"
      fi
      instance_name=$OCF_RESKEY_instance_name
      instance_prefix=$OCF_RESKEY_instance_prefix
      instance_path=$OCF_RESKEY_instance_prefix/$OCF_RESKEY_instance_name
      ;;
    1|true)
      single_instance=1
      instance_prefix=/opt/tivoli/tsm/server
      instance_name=tsm
      instance_path=$instance_prefix/$instance_name
      if $TEST ! -d "$instance_path" ; then
        ocf_log err "TSM: The directory specified as TSM directory doesn't exist!"
        ocf_log err "TSM: Please check your system."
        exit "$exit_val"
      fi
      ;;
    *)
      ocf_log err "TSM: You didn't specify single_instance!"
      exit "$exit_val"
      ;;
  esac

  if $TEST ! -d "$OCF_RESKEY_dsmserv_dir" \
    || $TEST ! -x "$OCF_RESKEY_dsmserv_dir/dsmserv" ; then
    ocf_log err "TSM: Either the specified directory $OCF_RESKEY_dsmserv_dir or the"
    ocf_log err "TSM: dsmserv executeable doesn't exist!"
    ocf_log err "TSM: Please check your system."
    exit "$exit_val"
  fi

  if $TEST ! -d "$OCF_RESKEY_dsmclient_dir" \
    || $TEST ! -x "$OCF_RESKEY_dsmclient_dir/dsmadmc" ; then
    ocf_log err "TSM: Either the specified directory $OCF_RESKEY_dsmclient_dir or the"
    ocf_log err "TSM: dsmadmc executeable doesn't exist!"
    ocf_log err "TSM: Please check your system."
    exit "$exit_val"
  fi

  if $TEST ! -d "$OCF_RESKEY_tsm_logdir" ; then
    ocf_log err "TSM: The logging dir specified ($OCF_RESKEY_tsm_logdir) doesn't exist!"
    ocf_log err "TSM: Please check your system."
    exit "$exit_val"
  fi

  if $TEST ! -f "$instance_path/dsmserv.opt" \
    || $TEST ! -f "$instance_path/dsmserv.dsk" ; then
    ocf_log err "TSM: Either $instance_path/dsmserv.opt"
    ocf_log err "TSM: or $instance_path/dsmserv.dsk don't exist!"
    ocf_log err "TSM: Please check your configuration."
    exit "$exit_val"
  fi

  # We need to test the dsm.opt/dsm.sys for correct information, since it's
  # needed for the graceful shutdown. The check only applies when graceful
  # shutdown is enabled (tsm_gloves=1) and both client option files exist.
  dsm_sys=$OCF_RESKEY_dsmclient_dir/dsm.sys
  if $TEST "$OCF_RESKEY_tsm_gloves" -eq 1 \
    && $TEST -f "$dsm_sys" \
    && $TEST -f "$OCF_RESKEY_dsmclient_dir/dsm.opt" ; then
    if $TEST -z "$OCF_RESKEY_cluster_dns" \
      || $TEST -z "$OCF_RESKEY_cluster_address" \
      || $TEST -z "$OCF_RESKEY_cluster_port" ; then
      ocf_log err "TSM: You are missing a configuration value:"
      ocf_log err "TSM: either cluster_dns, cluster_address or cluster_port isn't set!"
      ocf_log err "TSM: Please recheck your configuration!"
      exit "$exit_val"
    fi
    # $EGREP stays unquoted on purpose: it may expand to a command plus
    # options. cluster_dns is used as a regular expression fragment here.
    if ! $EGREP "^ServerName.*$OCF_RESKEY_cluster_dns" "$dsm_sys" >/dev/null 2>&1 ; then
      ocf_log err "TSM: You are lacking the proper entry for this TSM server."
      ocf_log err "TSM: You need to check your $OCF_RESKEY_dsmclient_dir/dsm.sys"
      ocf_log err "TSM: and set it up properly!"
      ocf_log info "TSM: If in doubt, copy & paste this server stanza:"
      ocf_log info "ServerName $OCF_RESKEY_cluster_dns"
      ocf_log info "TCPServerAddress $OCF_RESKEY_cluster_address"
      ocf_log info "TCPPORT $OCF_RESKEY_cluster_port"
      ocf_log info "CommMethod TCPIP"
      exit "$exit_val"
    fi
  fi

  return "$OCF_SUCCESS"
}
#######################################
# Check whether or not the selected TSM instance is still running.
#
# Reads the fourth whitespace-separated field of $instance_path/dsmserv.lock
# as the server's process id and probes it with signal 0.
#
# Globals read:    instance_path
# Globals written: pid, OCF_RETURNVAL_PID (exported), OCF_TSM_PID (exported
#                  when running, unset otherwise)
# Returns:
#   $OCF_SUCCESS      lock file present and the recorded process is alive
#   $OCF_ERR_GENERIC  lock file present but stale (no usable or live pid)
#   $OCF_NOT_RUNNING  no lock file
#######################################
tsm_pid() {
  local lockfile="$instance_path/dsmserv.lock"

  if ! $TEST -f "$lockfile" ; then
    # Process is not running
    export OCF_RETURNVAL_PID=$OCF_NOT_RUNNING
    unset OCF_TSM_PID
    return "$OCF_NOT_RUNNING"
  fi

  # Plain program text instead of gawk's --source so any awk works.
  pid="$( $AWK '{ print $4 }' "$lockfile" 2>/dev/null )"

  # Only a single positive number is a usable pid; anything else (empty,
  # several lines, text, 0) is treated like a stale lock instead of being
  # handed to kill.
  case "$pid" in
    ''|0|*[!0-9]*) ;;
    *)
      # POSIX redirection; '&>' would background the command in plain sh.
      if kill -0 "$pid" >/dev/null 2>&1 ; then
        # Process is up and running
        export OCF_RETURNVAL_PID=$OCF_SUCCESS
        export OCF_TSM_PID=$pid
        return "$OCF_SUCCESS"
      fi
      ;;
  esac

  # Stale pid-file detected
  export OCF_RETURNVAL_PID=$OCF_ERR_GENERIC
  unset OCF_TSM_PID
  return "$OCF_ERR_GENERIC"
}
# Report the state of the configured TSM instance.
#
# Runs the configuration check first and the pid probe second; the status of
# the pid probe is what the caller gets back.
tsm_monitor() {
  tsm_check
  tsm_pid
  return $?
}
#######################################
# Start the TSM server instance unless it is already running.
#
# dsmserv is launched in the background from the instance directory with its
# output appended to $OCF_RESKEY_tsm_logdir/$instance_name.log. Because the
# exit status of a backgrounded command is not available right after the
# launch, the function afterwards polls (tsm_retries times, tsm_timeout
# seconds apart) until tsm_pid reports the instance as running or the
# launched process has exited.
#
# Globals read: instance_name, instance_path, OCF_RESKEY_dsmserv_dir,
#               OCF_RESKEY_tsm_logdir, OCF_RESKEY_tsm_retries,
#               OCF_RESKEY_tsm_timeout
# Returns: $OCF_SUCCESS; exits with $OCF_ERR_GENERIC when the instance
#          directory cannot be entered or dsmserv terminates during startup.
#######################################
tsm_start() {
  local server_pid server_rc tries

  unset OCF_RETURNVAL_PID OCF_TSM_PID
  tsm_monitor

  if $TEST "${OCF_RETURNVAL_PID:-$OCF_NOT_RUNNING}" -eq "$OCF_SUCCESS" ; then
    ocf_log info "TSM: Instance $instance_name is already running (pid: $OCF_TSM_PID)."
    return "$OCF_SUCCESS"
  fi

  # Every other state (no lock file, stale lock file) means no server process
  # is alive, so a start is attempted. NOTE(review): whether dsmserv itself
  # accepts a leftover dsmserv.lock is not visible here; if it refuses, the
  # liveness check below reports the failure.

  # Prepping the environment
  export DSMSERV_DIR=$OCF_RESKEY_dsmserv_dir
  export DSMSERV_CONFIG=$instance_path/dsmserv.opt
  if ! cd "${DSMSERV_CONFIG%/*}" ; then
    ocf_log err "TSM: Unable to change into the instance directory ${DSMSERV_CONFIG%/*}"
    exit "$OCF_ERR_GENERIC"
  fi

  "$DSMSERV_DIR/dsmserv" >> "${OCF_RESKEY_tsm_logdir}/$instance_name.log" 2>&1 &
  server_pid=$!
  # The child already inherited the variables; keep our own environment clean.
  unset DSMSERV_CONFIG DSMSERV_DIR

  tries=1
  while $TEST "$tries" -le "${OCF_RESKEY_tsm_retries:-5}" ; do
    sleep "${OCF_RESKEY_tsm_timeout:-5}"
    unset OCF_RETURNVAL_PID OCF_TSM_PID
    tsm_pid
    if $TEST "${OCF_RETURNVAL_PID:-$OCF_NOT_RUNNING}" -eq "$OCF_SUCCESS" ; then
      break
    fi
    if ! kill -0 "$server_pid" 2>/dev/null ; then
      # The launched process is gone without the instance showing up.
      wait "$server_pid"
      server_rc=$?
      ocf_log err "dsmserv failed to start up correctly and returned $server_rc"
      exit "$OCF_ERR_GENERIC"
    fi
    tries=$(($tries+1))
  done

  if $TEST "${OCF_RETURNVAL_PID:-$OCF_NOT_RUNNING}" -ne "$OCF_SUCCESS" ; then
    # Process is alive but not yet confirmed through the lock file; do not
    # fail a server that may simply still be initialising.
    ocf_log warn "TSM: Instance $instance_name (pid: $server_pid) is not confirmed by its lock file yet."
  fi

  ocf_log info "TSM: Started instance $instance_name."
  return "$OCF_SUCCESS"
}
# Helper for tsm_stop: poll tsm_monitor (tsm_retries times, tsm_timeout
# seconds apart) until the instance is no longer reported as running.
# Returns 0 once it is gone (no lock file or stale lock file), 1 if it is
# still running after the last poll.
_tsm_wait_stopped() {
  local tries=1

  while $TEST "$tries" -le "${OCF_RESKEY_tsm_retries:-5}" ; do
    sleep "${OCF_RESKEY_tsm_timeout:-5}"
    unset OCF_RETURNVAL_PID OCF_TSM_PID
    tsm_monitor
    ocf_log debug "TSM(tsm_stop): ($tries) state while waiting for shutdown: ${OCF_RETURNVAL_PID:-unset} (pid: ${OCF_TSM_PID:-none})"
    if $TEST "${OCF_RETURNVAL_PID:-$OCF_NOT_RUNNING}" -ne "$OCF_SUCCESS" ; then
      return 0
    fi
    tries=$(($tries+1))
  done
  return 1
}

#######################################
# Stop the TSM server instance.
#
# In order to stop TSM there are two ways:
#  o Gracefully shutting it down by cancelling running processes and
#    sessions, dismounting volumes and issuing 'halt' through dsmadmc
#  o Signalling the process (SIGTERM first, SIGKILL as the last resort)
#
# The graceful way is used when tsm_gloves is enabled and tsm_user,
# tsm_password, cluster_dns, cluster_address and cluster_port are all set.
# If it is disabled, impossible or unsuccessful, the process is signalled.
# Either way success is only reported once the instance is really gone.
#
# Globals read: instance_name, OCF_RESKEY_tsm_user, OCF_RESKEY_tsm_password,
#               OCF_RESKEY_cluster_dns, OCF_RESKEY_cluster_address,
#               OCF_RESKEY_cluster_port, OCF_RESKEY_tsm_gloves,
#               OCF_RESKEY_dsmclient_dir, OCF_RESKEY_tsm_logdir,
#               OCF_RESKEY_tsm_retries, OCF_RESKEY_tsm_timeout
# Returns: $OCF_SUCCESS when the instance is (or already was) stopped,
#          $OCF_ERR_GENERIC when it survived SIGKILL.
#######################################
tsm_stop() {
  local server_pid gloves force_kill=0
  local cmd safe_cmd logfile attempt busy item
  local process_list session_list mount_list

  unset OCF_RETURNVAL_PID OCF_TSM_PID
  tsm_monitor

  # No live server process (no lock file or a stale one): nothing to stop.
  if $TEST "${OCF_RETURNVAL_PID:-$OCF_NOT_RUNNING}" -ne "$OCF_SUCCESS" ; then
    return "$OCF_SUCCESS"
  fi
  # Remember the pid; later monitor calls unset OCF_TSM_PID once it is gone.
  server_pid=$OCF_TSM_PID

  case "${OCF_RESKEY_tsm_gloves:-1}" in
    0|false) gloves=0 ;;
    *) gloves=1 ;;
  esac

  if $TEST "$gloves" -eq 1 \
    && $TEST -n "$OCF_RESKEY_tsm_user" \
    && $TEST -n "$OCF_RESKEY_tsm_password" \
    && $TEST -n "$OCF_RESKEY_cluster_address" \
    && $TEST -n "$OCF_RESKEY_cluster_port" \
    && $TEST -n "$OCF_RESKEY_cluster_dns" ; then
    # $cmd is expanded unquoted below so that it splits into the command and
    # its options; values containing whitespace are therefore not supported.
    cmd="$OCF_RESKEY_dsmclient_dir/dsmadmc -noconfirm -displaymode=list -id=$OCF_RESKEY_tsm_user -password=$OCF_RESKEY_tsm_password -server=$OCF_RESKEY_cluster_dns"
    # Same command line with the password masked, for logging only.
    safe_cmd="$OCF_RESKEY_dsmclient_dir/dsmadmc -noconfirm -displaymode=list -id=$OCF_RESKEY_tsm_user -password=******** -server=$OCF_RESKEY_cluster_dns"
    logfile=${OCF_RESKEY_tsm_logdir}/dsmadmc.log

    # dsmadmc is kinda limited, since it only writes its logfile to the
    # current PWD
    if ! cd "$OCF_RESKEY_tsm_logdir" ; then
      ocf_log warn "TSM: Unable to change into $OCF_RESKEY_tsm_logdir"
    fi
    { date >> "$logfile" ; } 2>/dev/null

    ocf_log info "TSM: Trying soft shutdown."
    attempt=1
    while $TEST "$attempt" -le "${OCF_RESKEY_tsm_retries:-5}" ; do
      busy=0

      process_list="$( $cmd query process | $EGREP 'Process Number: .*' | $AWK -F ': ' '{ print $2 }' )"
      ocf_log debug "TSM(tsm_stop): ($attempt) Process list during shutdown: $process_list"
      for item in $process_list ; do
        busy=1
        ocf_log debug "TSM(tsm_stop): ($attempt) Cancelling TSM process $item"
        $cmd cancel process $item >> "$logfile" 2>&1
      done

      # Session numbers may carry thousands separators; strip all of them.
      session_list="$( $cmd query sessions | $EGREP 'Sess Number: .*' | $AWK -F ': ' '{ print $2 }' | sed "s/,//g" )"
      ocf_log debug "TSM(tsm_stop): ($attempt) Session list during shutdown: $session_list"
      for item in $session_list ; do
        busy=1
        ocf_log debug "TSM(tsm_stop): ($attempt) Cancelling TSM session $item"
        $cmd cancel session $item >> "$logfile" 2>&1
      done

      mount_list="$( $cmd query mount | $EGREP 'LTO volume .* is mounted' | $AWK -F ' ' '{ print $4 }' )"
      ocf_log debug "TSM(tsm_stop): ($attempt) Mount list during shutdown: $mount_list"
      for item in $mount_list ; do
        busy=1
        ocf_log debug "TSM(tsm_stop): ($attempt) Cancelling TSM mount $item"
        $cmd dismount volume $item >> "$logfile" 2>&1
      done

      # Nothing was found in this pass, so the instance is idle.
      if $TEST "$busy" -eq 0 ; then
        ocf_log debug "TSM(tsm_stop): Skipping the remaining $((${OCF_RESKEY_tsm_retries:-5}-$attempt)) tries, no activity in instance $instance_name (pid: $server_pid)"
        break
      fi
      attempt=$(($attempt+1))
    done

    ocf_log info "TSM: Halting instance $instance_name (pid: $server_pid)"
    ocf_log info "TSM: issuing $safe_cmd halt"
    $cmd halt >> "$logfile" 2>&1

    if _tsm_wait_stopped ; then
      ocf_log info "TSM: Graceful shutdown for instance $instance_name (pid: $server_pid) completed."
    else
      ocf_log info "TSM(tsm_stop): Graceful shutdown for instance $instance_name (pid: $server_pid) failed, thus continuing with not-so-graceful shutdown!"
      force_kill=1
    fi
  else
    if $TEST "$gloves" -eq 1 ; then
      ocf_log warn "TSM: Graceful shutdown is not possible, tsm_user, tsm_password, cluster_dns, cluster_address or cluster_port isn't set."
    fi
    # A running instance must never be reported as stopped.
    force_kill=1
  fi

  if $TEST "$force_kill" -eq 1 ; then
    ocf_log info "TSM: Trying not-so-graceful shutdown."
    ocf_log debug "TSM(tsm_stop): issuing SIGTERM to instance $instance_name (pid: $server_pid)"
    kill -TERM "$server_pid" 2>/dev/null
    # Sending a signal successfully says nothing about the process having
    # exited, so wait for it to disappear before deciding what to do next.
    if ! _tsm_wait_stopped ; then
      ocf_log info "TSM: Instance $instance_name (pid: $server_pid) failed to shutdown with SIGTERM."
      ocf_log debug "TSM(tsm_stop): issuing SIGKILL to instance $instance_name (pid: $server_pid)"
      kill -KILL "$server_pid" 2>/dev/null
      if ! _tsm_wait_stopped ; then
        ocf_log err "TSM: Instance $instance_name (pid: $server_pid) failed to shutdown with SIGKILL."
        ocf_log err "TSM: There's nothing we can do, so die gracefully."
        ocf_log err "TSM: User interaction is required!"
        return "$OCF_ERR_GENERIC"
      fi
    fi
    ocf_log info "TSM: Successfully halted instance $instance_name (pid: $server_pid)"
  fi

  return "$OCF_SUCCESS"
}
# Print the OCF meta-data (XML) of this agent on stdout.
#
# The parameter list mirrors the OCF_RESKEY_* variables the agent reads; the
# defaults shown are the ones the configuration check applies.
# Returns: $OCF_SUCCESS
tsm_metadata() {
  cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="TSM">
<version>1.0</version>
<longdesc lang="en">
This script manages a single/or multiple instances of Tivoli Storage Manager.
Please be aware, that in order to run your Tivoli Storage Manager server via
Heartbeat, you need to prepare each instance according to the Storage Manager
Installation handbook.
</longdesc>
<shortdesc lang="en">OCF Resource Agent compliant TSM script.</shortdesc>
<parameters>
<parameter name="single_instance" required="1" unique="0">
<longdesc lang="en">
Is your setup a single instance, or are you running multiple instances
</longdesc>
<shortdesc lang="en">Toggles changes for single/multiple instances</shortdesc>
<content type="boolean" />
</parameter>
<parameter name="instance_prefix" required="0" unique="0">
<longdesc lang="en">
Directory below which the instance directories live. Required if
single_instance is false.
</longdesc>
<shortdesc lang="en">Parent directory of the instance directory</shortdesc>
<content type="string" />
</parameter>
<parameter name="instance_name" required="0" unique="0">
<longdesc lang="en">
Name of the instance directory below instance_prefix. It has to contain
dsmserv.opt and dsmserv.dsk. Required if single_instance is false.
</longdesc>
<shortdesc lang="en">Name of the instance</shortdesc>
<content type="string" />
</parameter>
<parameter name="dsmserv_dir" required="0" unique="0">
<longdesc lang="en">
Directory containing the dsmserv executable.
</longdesc>
<shortdesc lang="en">Directory of dsmserv</shortdesc>
<content type="string" default="/opt/tivoli/tsm/server/bin" />
</parameter>
<parameter name="dsmclient_dir" required="0" unique="0">
<longdesc lang="en">
Directory containing the dsmadmc executable and the dsm.sys/dsm.opt files.
</longdesc>
<shortdesc lang="en">Directory of dsmadmc</shortdesc>
<content type="string" default="/opt/tivoli/tsm/client/ba/bin" />
</parameter>
<parameter name="tsm_logdir" required="0" unique="0">
<longdesc lang="en">
Existing directory that receives the output of dsmserv and dsmadmc.
</longdesc>
<shortdesc lang="en">Logging directory</shortdesc>
<content type="string" default="/var/log/tsm" />
</parameter>
<parameter name="tsm_retries" required="0" unique="0">
<longdesc lang="en">
Number of passes used while cancelling activity during a graceful shutdown
and number of polls used while waiting for the instance to stop.
</longdesc>
<shortdesc lang="en">Number of retries</shortdesc>
<content type="integer" default="5" />
</parameter>
<parameter name="tsm_timeout" required="0" unique="0">
<longdesc lang="en">
Seconds to sleep between two polls while waiting for the instance to stop.
</longdesc>
<shortdesc lang="en">Seconds between polls</shortdesc>
<content type="integer" default="5" />
</parameter>
<parameter name="tsm_gloves" required="0" unique="0">
<longdesc lang="en">
If set to 1 (or true), a graceful shutdown via dsmadmc is tried before the
server process is signalled. If set to 0 (or false), the process is
signalled right away.
</longdesc>
<shortdesc lang="en">Try a graceful shutdown first</shortdesc>
<content type="boolean" default="1" />
</parameter>
<parameter name="cluster_dns" required="0" unique="0">
<longdesc lang="en">
Server name of this instance as listed in a ServerName line of dsm.sys. It
is handed to dsmadmc as -server. Needed for the graceful shutdown.
</longdesc>
<shortdesc lang="en">ServerName used in dsm.sys</shortdesc>
<content type="string" />
</parameter>
<parameter name="cluster_address" required="0" unique="0">
<longdesc lang="en">
Address of this instance (TCPServerAddress in dsm.sys). Needed for the
graceful shutdown.
</longdesc>
<shortdesc lang="en">TCPServerAddress of the instance</shortdesc>
<content type="string" />
</parameter>
<parameter name="cluster_port" required="0" unique="0">
<longdesc lang="en">
Port of this instance (TCPPORT in dsm.sys). Needed for the graceful
shutdown.
</longdesc>
<shortdesc lang="en">TCPPORT of the instance</shortdesc>
<content type="string" />
</parameter>
<parameter name="tsm_user" required="0" unique="0">
<longdesc lang="en">
Administrator id handed to dsmadmc as -id. Needed for the graceful shutdown.
</longdesc>
<shortdesc lang="en">TSM administrator id</shortdesc>
<content type="string" />
</parameter>
<parameter name="tsm_password" required="0" unique="0">
<longdesc lang="en">
Password handed to dsmadmc as -password. Needed for the graceful shutdown.
</longdesc>
<shortdesc lang="en">TSM administrator password</shortdesc>
<content type="string" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="90s" />
<action name="stop" timeout="100s" />
<action name="monitor" depth="10" timeout="30s" interval="60s" start-delay="300s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s" />
<action name="status" timeout="30s" />
</actions>
</resource-agent>
END
  return $OCF_SUCCESS
}
# Entry point: map the requested OCF action onto its handler. The handler's
# return status becomes the exit status of the agent.
case "$1" in
  start) tsm_start ;;
  stop) tsm_stop ;;
  # 'status' is advertised in the meta-data and answered like 'monitor'.
  monitor|status) tsm_monitor ;;
  meta-data) tsm_metadata ;;
  validate-all) tsm_check validate ;;
  # notify, demote, promote, migrate_to, migrate_from, reload, recover and
  # anything unknown are not supported by this agent.
  *) exit $OCF_ERR_UNIMPLEMENTED ;;
esac
exit $?
# vim: set tabstop=2 shiftwidth=2 softtabstop=2 expandtab :