
horms at verge
Oct 29, 2009, 5:10 PM
Post #2 of 7
(339 views)
Permalink
|
On Fri, Oct 30, 2009 at 09:04:45AM +1000, Luke Bigum wrote: > Here's a patch for the ldirectord resource agent. Makes the RA a lot more > OCF compliant and generally more robust, comments inline. Questions / > problems please ask. Hi Luke, this looks good to me. A few comments inline. > --- ldirectord.orij 2009-10-30 08:56:11.328658137 +1000 > +++ ldirectord 2009-10-30 08:55:44.857698098 +1000 > @@ -49,15 +49,23 @@ > . ${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs > > ####################################################################### > - > -# LDIRECTORD=/usr/sbin/ldirectord > -# LDIRCONF=/etc/ldirectord.cf > +# > +#iseek ldirectord OCF Resource Agent > +# > +#Modified version of the Heartbeat ldirectord to return better OCF > +#return codes. The changes makes the RA much more robust: we don't return > +#OCF_ERR_CONFIGURED for bad arguments any more which will stop the resource > +#going fatal cluster wide. We also trap the ldirectord script's normal > +#exit codes and return OCF compliant codes and make a better effort to > +#stop ldirectord. Could you move this changelog out of the code and into the text in your email accompanying the patch. That way I can put it into the mercurial commit message which seems to be the most appropriate place for it. > +# > +####################################################################### > > meta_data() { > cat <<END > <?xml version="1.0"?> > <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> > -<resource-agent name="Ldirectord" version="0.9"> > +<resource-agent name="ldirectord" version="0.9"> > <version>1.0</version> > > <longdesc lang="en"> > @@ -79,7 +87,7 @@ > > <parameter name="ldirectord"> > <longdesc lang="en"> > -The full pathname of the ldirectord. > +The full pathname of ldirectord. > </longdesc> > <shortdesc lang="en">ldirectord binary path</shortdesc> > <content type="string" default="/usr/sbin/ldirectord" /> > @@ -101,41 +109,18 @@ > ####################################################################### > > ldir_init() { > - > - COMMAND=$1 > - > - LDIRCONF=$OCF_RESKEY_configfile > - if [ x"${LDIRCONF}" = "x" ]; then > - LDIRCONF=/etc/ldirectord.cf > - fi > - > - if [ ! -f $LDIRCONF ]; then > - case $COMMAND in > - stop) ocf_log warn "$LDIRCONF not found. ldirectord considered stopped" > - exit $OCF_SUCCESS;; > - monitior) exit $OCF_NOT_RUNNING;; > - status) exit $LSB_STATUS_STOPPED;; > - start) ocf_log warn "$LDIRCONF not found." > - exit $OCF_NOT_RUNNING;; > - esac > - fi > - > - LDIRECTORD=$OCF_RESKEY_ldirectord > - if [ x"${LDIRECTORD}" = x ]; then > - LDIRECTORD="/usr/sbin/ldirectord" > - fi > - > - if [ ! -x $LDIRECTORD ]; then > - case $COMMAND in > - stop) ocf_log warn "$LDIRECTORD not found." > - exit $OCF_SUCCESS;; > - monitor) exit $OCF_NOT_RUNNING;; > - status) exit $LSB_STATUS_STOPPED;; > - start) ocf_log warn "$LDIRECTORD not found." > - exit $OCF_NOT_RUNNING;; > - esac > - fi > - > + #check the supplied parameters exist enough that we can do all the other operations Could you make this and other lines <= 80 characters wide? Also, I think a space after the # would make things easier to read here and elsewhere. > + LDIRCONF=${OCF_RESKEY_configfile:-/etc/ha.d/ldirectord.cf} > + if [ ! -f $LDIRCONF ]; then > + ocf_log warn "$LDIRCONF not found, ldirectord not installed" > + exit $OCF_ERR_INSTALLED > + fi > + > + LDIRECTORD=${OCF_RESKEY_ldirectord:-/usr/sbin/ldirectord} > + if [ ! -x $LDIRECTORD ]; then > + ocf_log warn "$LDIRECTORD not found, ldirectord not installed" > + exit $OCF_ERR_INSTALLED > + fi > } > > ldirectord_usage() { > @@ -151,58 +136,117 @@ > } > > ldirectord_start() { > + ldirectord_status > + RET=$? > + > + #if ldirectord is already running or there's an error, pass on this return code > + if [ $RET -ne $OCF_NOT_RUNNING ]; then > + return $RET > + fi > + The line above has trailing whitespace, please remove it. > + ocf_log info "Starting ldirectord" > $LDIRECTORD $LDIRCONF start > + RET=$? > + if [ $RET -ne 0 ]; then > + return $OCF_ERR_GENERIC > + fi > + > + #call status again to make sure we're running properly > + ldirectord_status > } > > ldirectord_stop() { > - $LDIRECTORD $LDIRCONF stop > + #a status check is simple enough to see if ldirectord is running or not. It will also > + #error out if there's configuration parsing errors so we can try kill ldirectord even if the > + #config is broken. > + ldirectord_status > + RET=$? > + > + #ldirectord may be running, hard to tell when status returns an error > + if [ $RET -eq $OCF_ERR_GENERIC ]; then > + #get the PID of the right ldirectord process > + PID=`pgrep -f "$LDIRECTORD $LDIRCONF start" 2>&1` > + RET=$? > + The line above has trailing whitespace, please remove it. > + if [ $RET -eq 0 ]; then > + ocf_log warn "Killing ldirectord($PID) with SIGTERM" > + kill $PID > + fi > + The line above has trailing whitespace, please remove it. > + pgrep -f "$LDIRECTORD $LDIRCONF start" >/dev/null 2>&1 > + RET=$? > + #if ldirectord is not running any more, we've (kind of) successfully stopped it > + if [ $RET -eq 1 ]; then > + return $OCF_SUCCESS > + else > + #ldirectord is still running? Kill it badly > + ocf_log warn "Killing ldirectord($PID) with SIGKILL" > + kill -9 $PID > + > + pgrep -f "$LDIRECTORD $LDIRCONF start" >/dev/null 2>&1 > + RET=$? > + #if it's not dead after here, we can't really do anything more > + if [ $RET -eq 1 ]; then > + return $OCF_SUCCESS > + fi > + fi > + The line above has trailing whitespace, please remove it. > + #if none of our kills work, return an error. This should force the resource unmanaged > + #on this node, requiring manual intervention. > + return $OCF_ERR_GENERIC > + else > + ocf_log info "Stopping ldirectord" > + #if ldirectord status is not an error, issue a stop. Multiple stops will return 0 > + $LDIRECTORD $LDIRCONF stop > + RET=$? > + case $RET in > + 0) return $OCF_SUCCESS;; > + *) return $OCF_ERR_GENERIC;; > + esac > + fi > } > > +#simple check to see if ldirectord is running, returns the proper OCF codes. > ldirectord_status() { > OUTPUT=`$LDIRECTORD $LDIRCONF status 2>&1` > case $? in > - 1) echo $OUTPUT > - return $OCF_ERR_GENERIC > - ;; > - 0) echo running > - return $OCF_SUCCESS > - ;; > - 3) echo stopped > - return $OCF_SUCCESS > - ;; > - *) echo $OUTPUT > - return $OCF_ERR_GENERIC > - ;; > + 0) return $OCF_SUCCESS;; > + #if there's a stale PID file, we can still attempt start ldirectord, but a status will still return 1, > + #so look for this error message. Every other error we shouldn't ignore Is this a property of the underlying ldirectord code that should be altered? > + 1) expr match "$OUTPUT" '.*ldirectord stale pid file.*' >/dev/null > + if [ $? -eq 0 ]; then > + return $OCF_NOT_RUNNING > + else > + return $OCF_ERR_GENERIC > + fi;; > + #if there's a config error, can't use ldirectord to do anything else (start, stop, etc) so return badly > + 2) ocf_log err "$LDIRCONF has configuration errors" > + echo $OUTPUT > + return $OCF_ERR_GENERIC;; > + 3) return $OCF_NOT_RUNNING;; > + #everything else is an error, haven't looked to hard into the ldirectord code yet, there may be more ldirectord should be returning error codes as set out for the old linux-ha ra spec. Anything else is probably a bug. > + #things we can try recover from > + *) echo $OUTPUT > + return $OCF_ERR_GENERIC;; > esac > } > > ldirectord_monitor() { > - OUTPUT=`$LDIRECTORD $LDIRCONF status 2>&1` > - case $? in > - 0) return $OCF_SUCCESS > - ;; > - 3) return $OCF_NOT_RUNNING > - ;; > - *) echo $OUTPUT > - return $OCF_ERR_GENERIC > - ;; > - esac > + #check if the process is running first > + ldirectord_status > + RET=$? > + > + if [ $RET -ne $OCF_SUCCESS ]; then > + return $RET > + fi > + > + #do more advanced checks here for high OCF_CHECK_LEVELs. Don't know what more we can do at this time, > + #a status call already hits LVS in the kernel. > } > > ldirectord_validate() { > - if [ ! -f $LDIRCONF ]; then > - ocf_log err $LDIRCONF > - ocf_log err "Configuration file $LDIRCONF not found!" > - exit $OCF_ERR_CONFIGURED > - fi > - > - if [ ! -x $LDIRECTORD ]; then > - ocf_log err "Binary file $LDIRECTORD not found." > - exit $OCF_ERR_CONFIGURED > - fi > - > -# exit $OC_ERR_UNIMPLEMENTED > - > + #ldir_init is already called, there's nothing more we can validate unless we add more attributes > + return $OCF_SUCCESS > } > > ldir_init > @@ -217,9 +261,6 @@ > stop) ldirectord_stop > ldirectord_exit $? > ;; > -status) ldirectord_status > - ldirectord_exit $? > - ;; > monitor) ldirectord_monitor > ldirectord_exit $? > ;; _______________________________________________________ Linux-HA-Dev: Linux-HA-Dev[at]lists.linux-ha.org http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev Home Page: http://linux-ha.org/
|