Skip to content

Instantly share code, notes, and snippets.

@danmcd
Created May 21, 2018 17:07
Show Gist options
  • Save danmcd/77cbcfff9e8a53d4b3c0c31a019862e9 to your computer and use it in GitHub Desktop.
Save danmcd/77cbcfff9e8a53d4b3c0c31a019862e9 to your computer and use it in GitHub Desktop.
diff --git a/overlay/generic/usr/lib/brand/jcommon/statechange b/overlay/generic/usr/lib/brand/jcommon/statechange
index 4f61a02..0d08300 100644
--- a/overlay/generic/usr/lib/brand/jcommon/statechange
+++ b/overlay/generic/usr/lib/brand/jcommon/statechange
@@ -264,9 +264,34 @@ setup_net()
# it already exist?
#
if [[ -n "$isoverlay" ]]; then
- if ! dladm show-overlay $global_nic 2>/dev/null; then
- dladm create-overlay $rule -v $num $global_nic
+ ntries=0
+ while [[ -d /var/run/ovcreate ]] ||
+ ! dladm show-overlay $global_nic 2>/dev/null; do
+ ln -s /proc/$$ /var/run/ovcreate
if [[ $? -ne 0 ]]; then
+ # Someone beat us to creating
+ # the lock file. Pause, and try
+ # again, but not too many times.
+ ntries=$(($ntries + 1))
+ if [[ $ntries -gt 5 ]]; then
+ logger -p daemon.err \
+ "zone $ZONENAME cannot " \
+ "acquire ovcreate due to " \
+ "this: " \
+ `ls -l /var/run/ovcreate`
+ exit 1
+ fi
+ # Continue through the while loop
+ # after a pause.
+ sleep 1
+ continue
+ fi
+ dladm create-overlay $rule -v $num $global_nic
+ # Save the dladm return code so we can remove
+ # the lock immediately.
+ rc=$?
+ /bin/rm -f /var/run/ovcreate
+ if [[ $rc -ne 0 ]]; then
logger -p daemon.err "zone $ZONENAME " \
"failed to create overlay device " \
"$global_nic with command " \
@@ -274,7 +299,7 @@ setup_net()
"$num $global_nic"
exit 1
fi
- fi
+ done
fi
@mgerdts
Copy link

mgerdts commented May 21, 2018

lockdir=/var/run

#
# lock_enter <lockname>
#
# Acquire the lock named <lockname>, waiting until it becomes available.
#
# The lock is a symbolic link $lockdir/<lockname> whose contents are
# /proc/<pid of this shell>.  Creating the link is the acquisition step:
# ln -s fails if the name is already taken, in which case we either wait
# (the link still resolves to a directory, i.e. the /proc entry it points
# at is still there) or remove the dangling link and try again.
#
# Arguments:
#   $1 - lock name; must be non-empty and must not contain a '/'.
#
# Exits the calling shell with status 1 if the name is invalid or if this
# process already holds the lock.  Otherwise returns once the lock is held.
#
function lock_enter {
        typeset lockname=$1
        typeset lock=$lockdir/$lockname
        typeset target=/proc/$$

        if [[ -z $lockname || $lockname == */* ]]; then
                print -u2 "ERROR: invalid lock '$lockname'"
                exit 1
        fi
        # -ef follows the link: true only when the lock already points at us.
        if [[ $lock -ef $target ]]; then
                print -u2 "ERROR: recursive lock by pid $$"
                exit 1
        fi

        # ln -s takes the link contents first and the link name second, so
        # this creates $lock pointing at $target.
        while ! ln -s "$target" "$lock" >/dev/null 2>&1; do
                if [[ -d $lock ]]; then
                        # Process holding the lock still exists
                        sleep 0.1
                        continue
                fi

                # Lock recovery.  A little race here.  Only encountered if
                # a lock is abandoned
                rm -f "$lock"
        done
}

@danmcd
Copy link
Author

danmcd commented May 21, 2018

And lock_exit is simply rm -f $lockdir/$lockname, right?

@danmcd
Copy link
Author

danmcd commented May 21, 2018

Alternate diff using what @mgerdts said.

diff --git a/overlay/generic/usr/lib/brand/jcommon/statechange b/overlay/generic/usr/lib/brand/jcommon/statechange
index 4f61a02..2a5b956 100644
--- a/overlay/generic/usr/lib/brand/jcommon/statechange
+++ b/overlay/generic/usr/lib/brand/jcommon/statechange
@@ -75,6 +75,42 @@ DEFAULT_MTU=1500
 # o jst_mdatapath - The path the metadata socket is expected in the zone
 #
 
+lock_enter()
+{
+	typeset lockname=$1
+	typeset lock=/var/run/$lockname
+	typeset target=/proc/$$
+
+	if [[ -z $lockname || $lockname == */* ]]; then
+		print -u2 "ERROR: invalid lock '$lockname'"
+		exit 1
+	fi
+	if [[ $lock -ef $target ]]; then
+		print -u2 "ERROR: recursive lock by pid $$"
+		exit 1
+	fi
+
+	while ! ln -s "$target" "$lock" >/dev/null 2>&1; do
+		if [[ -d $lock ]]; then
+			# Process holding the lock still exists
+			sleep 0.1
+			continue
+		fi
+
+		# Lock recovery.  A little race here.  Only encountered if
+		# a lock is abandoned
+		rm -f "$lock"
+	done
+}
+
+lock_exit()
+{
+	typeset lockname=$1
+	typeset lock=/var/run/$lockname
+
+	rm -f "$lock"
+}
+
 get_boolean_nic_property()
 {
 	bool_val=$(eval echo \$_ZONECFG_net_${1}_${2})
@@ -265,6 +301,7 @@ setup_net()
 		#
 		if [[ -n "$isoverlay" ]]; then
 			if ! dladm show-overlay $global_nic 2>/dev/null; then
+				lock_enter ovlock
 				dladm create-overlay $rule -v $num $global_nic
 				if [[ $? -ne 0 ]]; then
 					logger -p daemon.err "zone $ZONENAME " \
@@ -272,8 +309,10 @@ setup_net()
 					     "$global_nic with command " \
 					     "'dladm create-overlay $rule -v " \
 					     "$num $global_nic"
+					lock_exit ovlock
 					exit 1
 				fi
+				lock_exit ovlock
 			fi
 		fi
 

@mgerdts
Copy link

mgerdts commented May 21, 2018

Heh. Worked something up before coming back to see your changes. I think mine is pretty much the same as what you have but with some more checks around lock_exit(). Also added a warning about lock recovery.

lockdir=/var/run

#
# lock_enter <lockname>
#
# Take the lock called <lockname>, polling until it can be taken.  A held
# lock is represented by the symlink $lockdir/<lockname>, whose contents
# are /proc/<pid> of the shell that took it.
#
# Exits the shell with status 1 when <lockname> is empty or contains a
# '/', or when the link already resolves to this shell's own /proc entry.
#
function lock_enter {
        typeset name=$1
        typeset linkpath=$lockdir/$name
        typeset self=/proc/$$
        typeset owner

        case $name in
        ''|*/*)
                print -u2 "ERROR: invalid lock '$name'"
                exit 1
                ;;
        esac

        if [[ $linkpath -ef $self ]]; then
                print -u2 "ERROR: recursive lock by pid $$"
                exit 1
        fi

        # Creating the symlink is the acquire step; it fails while the
        # name is already taken.
        until ln -s "$self" "$linkpath" >/dev/null 2>&1; do
                if [[ ! -d $linkpath ]]; then
                        # The link no longer resolves to a directory, so
                        # treat it as abandoned: report the last path
                        # component of its target and remove it.  There is
                        # a small race between this check and the removal.
                        owner=$(ls -l "$linkpath" | nawk -F/ '{print $NF}')
                        print -u2 "WARNING: recovering lock $linkpath (abandoned by $owner)"
                        rm -f "$linkpath"
                else
                        # Link still resolves to a directory: the holder
                        # is still around, so wait briefly and retry.
                        sleep 0.1
                fi
        done
}

#
# lock_exit <lockname>
#
# Release the lock called <lockname> by removing the symlink
# $lockdir/<lockname>.  The link is removed only when it resolves to this
# shell's own /proc entry.
#
# Exits the shell with status 1 when <lockname> is empty or contains a
# '/', or when the link does not resolve to /proc/<pid of this shell>.
#
function lock_exit {
        typeset name=$1
        typeset linkpath=$lockdir/$name
        typeset self=/proc/$$

        case $name in
        ''|*/*)
                print -u2 "ERROR: invalid lock '$name'"
                exit 1
                ;;
        esac

        # Refuse to drop a lock that some other process (or nobody) holds.
        [[ $linkpath -ef $self ]] || {
                print -u2 "ERROR: lock '$name' not held by pid $$"
                exit 1
        }

        rm -f "$linkpath"
}

@danmcd
Copy link
Author

danmcd commented May 21, 2018

Okay... final answer (edited for locking fix):

diff --git a/overlay/generic/usr/lib/brand/jcommon/statechange b/overlay/generic/usr/lib/brand/jcommon/statechange
index 4f61a02..457f101 100644
--- a/overlay/generic/usr/lib/brand/jcommon/statechange
+++ b/overlay/generic/usr/lib/brand/jcommon/statechange
@@ -75,6 +75,53 @@ DEFAULT_MTU=1500
 # o jst_mdatapath - The path the metadata socket is expected in the zone
 #
 
+lockdir=/var/run
+function lock_enter {
+	typeset lockname=$1
+	typeset lock=$lockdir/$lockname
+	typeset target=/proc/$$
+
+	if [[ -z $lockname || $lockname == */* ]]; then
+		print -u2 "ERROR: invalid lock '$lockname'"
+		exit 1
+	fi
+	if [[ $lock -ef $target ]]; then
+		print -u2 "ERROR: recursive lock by pid $$"
+		exit 1
+	fi
+
+	while ! ln -s "$target" "$lock" >/dev/null 2>&1; do
+		if [[ -d $lock ]]; then
+			# Process holding the lock still exists
+			sleep 0.1
+			continue
+		fi
+
+		# Lock recovery.  A little race here.  Only encountered if
+		# a lock is abandoned.
+		typeset prev=$(ls -l "$lock" | nawk -F/ '{print $NF}')
+		print -u2 "WARNING: recovering lock $lock (abandoned by $prev)"
+		rm -f "$lock"
+	done
+}
+
+function lock_exit {
+	typeset lockname=$1
+	typeset lock=$lockdir/$lockname
+	typeset target=/proc/$$
+
+	if [[ -z $lockname || $lockname == */* ]]; then
+		print -u2 "ERROR: invalid lock '$lockname'"
+		exit 1
+	fi
+	if ! [[ $lock -ef $target ]]; then
+		print -u2 "ERROR: lock '$lockname' not held by pid $$"
+		exit 1
+	fi
+
+	rm -f "$lock"
+}
+
 get_boolean_nic_property()
 {
 	bool_val=$(eval echo \$_ZONECFG_net_${1}_${2})
@@ -265,6 +312,7 @@ setup_net()
 		#
 		if [[ -n "$isoverlay" ]]; then
+			lock_enter ovlock
 			if ! dladm show-overlay $global_nic 2>/dev/null; then
 				dladm create-overlay $rule -v $num $global_nic
 				if [[ $? -ne 0 ]]; then
 					logger -p daemon.err "zone $ZONENAME " \
@@ -272,8 +320,10 @@ setup_net()
 					     "$global_nic with command " \
 					     "'dladm create-overlay $rule -v " \
 					     "$num $global_nic"
+					lock_exit ovlock
 					exit 1
 				fi
 			fi
+			lock_exit ovlock
 		fi
 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment