diff options
author | PMR <pmr@pmr-lander> | 2020-02-25 16:35:50 +0000 |
---|---|---|
committer | PMR <pmr@pmr-lander> | 2020-02-25 16:35:50 +0000 |
commit | 0c06eadd115b2cf015b8edcb756fe9236c151753 (patch) | |
tree | 991c7b64a43a8803434358237fa40ca4f09d80c8 | |
parent | b5986748a745f0c41231739d82e31b5fea03b885 (diff) | |
parent | 7f54a248b998a84b58cf26f4e53652766ff10f71 (diff) |
Merge #379215 from ~rodsmith/plainbox-provider-checkbox:unify-stress-ng-wrapper-scripts-2
Replace three stress-ng wrapper scripts with one unified script.
-rwxr-xr-x | bin/cpu_stress | 75 | ||||
-rwxr-xr-x | bin/disk_stress_ng | 330 | ||||
-rwxr-xr-x | bin/memory_stress_ng | 190 | ||||
-rwxr-xr-x | bin/stress_ng_test | 594 | ||||
-rw-r--r-- | units/disk/jobs.pxu | 4 | ||||
-rw-r--r-- | units/memory/jobs.pxu | 3 | ||||
-rw-r--r-- | units/stress/jobs.pxu | 4 |
7 files changed, 600 insertions, 600 deletions
diff --git a/bin/cpu_stress b/bin/cpu_stress deleted file mode 100755 index 56fccee..0000000 --- a/bin/cpu_stress +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/sh - -# Script to perform CPU stress tests -# -# Copyright (c) 2016 Canonical Ltd. -# -# Authors -# Rod Smith <rod.smith@canonical.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 3, -# as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. -# -# The purpose of this script is to run CPU stress tests using the -# stress-ng program. -# -# Usage: -# cpu_stress [ --runtime <time-in-seconds> ] -# -# If --runtime is not specified, it defaults to 7200 (2 hours). - -runtime=7200 -if [ "$#" = "2" ] && [ "$1" = "--runtime" ] && [ "$2" -eq "$2" ] ; then - runtime=$2 -elif [ "$#" != "0" ] ; then - echo "Usage:" - echo " $0 [ --runtime <time-in-seconds> ]" - exit 1 -fi -echo "Setting run time to $runtime seconds" -# Add 10% to runtime; will forcefully terminate if stress-ng -# fails to return in that time. -end_time=$((runtime*11/10)) - -# NOTE: -# Options --af-alg 0 through --wcs 0 specify CPU stressors. As of stress-ng -# version 0.05.12, this is equivalent to --class cpu --all 0 --exclude numa,cpu_online. -# This script specifies stressors individually because the list of stressors keeps -# increasing, and we want consistency -- if the stress-ng version bumps up, we -# don't want new stressors being run. 
We're omitting numa because it's most -# useful on systems with massive numbers of CPUs, and cpu_online because it's -# failed on 4 of 8 test systems, so it seems too strict. -# Use "timeout" command to launch stress-ng, to catch it should it go into la-la land -timeout -s 9 $end_time stress-ng --aggressive --verify --timeout $runtime \ - --metrics-brief --tz --times \ - --af-alg 0 --bsearch 0 --context 0 --cpu 0 \ - --crypt 0 --hsearch 0 --longjmp 0 --lsearch 0 \ - --matrix 0 --qsort 0 --str 0 --stream 0 \ - --tsearch 0 --vecmath 0 --wcs 0 -result="$?" - -echo "**********************************************************" -if [ $result = "0" ] ; then - echo "* stress-ng CPU test passed!" -else - if [ $result = "137" ] ; then - echo "** stress-ng CPU test timed out and SIGKILL was used to " \ - "terminate the test (Error $result)!" - elif [ $return_code = "124" ] ; then - echo "* stress-ng CPU test timed out and was forcefully terminated " \ - "(Error $result)!" - else - echo "* stress-ng CPU test failed with result $result" - fi -fi -echo "**********************************************************" -exit $result diff --git a/bin/disk_stress_ng b/bin/disk_stress_ng deleted file mode 100755 index 9d8668e..0000000 --- a/bin/disk_stress_ng +++ /dev/null @@ -1,330 +0,0 @@ -#!/bin/bash - -# Script to disk stress tests using stress-ng -# -# Copyright (c) 2016 Canonical Ltd. -# -# Authors -# Rod Smith <rod.smith@canonical.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 3, -# as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. 
If not, see <http://www.gnu.org/licenses/>. -# -# The purpose of this script is to run disk stress tests using the -# stress-ng program. -# -# Usage: -# disk_stress_ng [ <device-filename> ] -# [ --base-time <time-in-seconds> ] -# [ --really-run ] -# -# Parameters: -# --disk-device -- This is the WHOLE-DISK device filename WITHOUT "/dev/" -# (e.g., sda). The script finds a filesystem on that -# device, mounts it if necessary, and runs the tests on -# that mounted filesystem. -# Test with iostat - -get_params() { - disk_device="/dev/sda" - short_device="sda" - base_time="240" - really_run="N" - while [ $# -gt 0 ] ; do - case $1 in - --base-time) base_time="$2" - shift - ;; - --really-run) really_run="Y" - ;; - *) disk_device="/dev/$1" - disk_device=`echo $disk_device | sed "s/\/dev\/\/dev/\/dev/g"` - short_device=$(echo $disk_device | sed "s/\/dev//g") - if [ ! -b $disk_device ] ; then - echo "Unknown block device \"$disk_device\"" - echo "Usage: $0 [ --base-time <time-in-seconds> ] [ --really-run ]" - echo " [ device-file ]" - exit 1 - fi - ;; - esac - shift - done - mounted_part="N" -} # get_params() - - -# Find the largest logical volume in an LVM partition. -# Output: -# $largest_part -- Device filename of largest qualifying partition -# $largest_size -- Size of largest qualifying partition -# $largest_fs -- Filesystem (ext4, etc.) used on largest qualifying partition -# Note: Above variables are initialized in find_largest_partition(), which -# calls this function. -# Caveat: If LVM is used, there can be no guarantee that a specific disk -# device is actually being tested. Thus, an LVM configuration should span -# just one disk device. LVM may be used on one disk, but subsequent disks -# should use "raw" partitions. 
-find_largest_lv() { - local partonly=$(echo $partition | cut -f 3 -d "/") - for syslv in $(ls -d /sys/block/dm-*/slaves/$partonly) ; do - lv=$(echo "$syslv" | cut -f 4 -d "/") - size=$(cat /sys/block/$lv/size) - sector_size=$(cat /sys/block/$lv/queue/hw_sector_size) - let size=$size*$sector_size - local blkid_info=$(blkid -s TYPE /dev/$lv | grep -E ext2\|ext3\|ext4\|xfs\|jfs\|btrfs) - if [ "$size" -gt "$largest_size" ] && [ -n "$blkid_info" ] ; then - local blkid_info=$(blkid -s TYPE /dev/$lv) - largest_size=$size - largest_part="/dev/$lv" - largest_fs=$(blkid -s TYPE "/dev/$lv" | cut -d "=" -f 2) - fi - done -} # find_largest_lv() - - -# Find the largest partition that holds a supported filesystem on $disk_device. -# Output: -# $largest_part -- Device filename of largest qualifying partition or logical volume -# $largest_size -- Size of largest qualifying partition or logical volume -# $largest_fs -- Filesystem (ext4, etc.) used on largest qualifying partition or logicl volume -# $unsupported_fs -- Empty or contains name of unsupported filesystem found on disk -find_largest_partition() { - largest_part="" - largest_size=0 - mapper_string="dm-" - if [ "${disk_device#*$mapper_string}" = "$disk_device" ]; then - partitions=$(lsblk -b -l -n -o NAME,SIZE,TYPE,MOUNTPOINT $disk_device | grep -E 'part|lvm|raid' | tr -s " ") - else - partitions=$(lsblk -b -l -n -o NAME,SIZE,TYPE,MOUNTPOINT $disk_device) - fi - unsupported_fs="" - for partition in $(echo "$partitions" | cut -d " " -f 1) ; do - if [ -b "/dev/$partition" ]; then - part_size=$(echo "$partitions" | grep "$partition " | cut -d " " -f 2) - part_location="/dev/$partition" - elif [ -b "/dev/mapper/$partition" ]; then - part_size=$(echo "$partitions" | grep "$partition " | cut -d " " -f 2) - part_location="/dev/mapper/$partition" - else - echo "$partition not found!" 
- echo "Aborting test" - exit 1 - fi - local blkid_info=$(blkid -s TYPE $part_location | grep -E ext2\|ext3\|ext4\|xfs\|jfs\|btrfs\|LVM2_member) - if [ "$part_size" -gt "$largest_size" ] && [ -n "$blkid_info" ] ; then - if [[ "$blkid_info" =~ .*LVM2_member.* ]] ; then - find_largest_lv - else - largest_size=$part_size - largest_part="$part_location" - largest_fs=$(blkid -s TYPE "$part_location" | cut -d "=" -f 2) - fi - fi - local blkid_info=$(blkid -s TYPE $part_location | grep -E ntfs\|vfat\|hfs) - if [ -n "$blkid_info" ] ; then - # If there's an NTFS, HFS+, or FAT filesystem on the disk make note of it.... - unsupported_fs=$(blkid -s TYPE "/dev/$partition" | cut -d "=" -f 2) - fi - done -} # find_largest_partition() - -# Find the largest filesystem on $disk_device. If that partition is not -# already mounted, try to mount it. -# Output: -# $test_dir -- Directory in which tests will occur -# $mount_point -- Location where filesystem is mounted -# $mounted_part -- Sets to "Y" if script mounted partition -# $made_mountpoint -- Sets to "Y" if script created the mount point -mount_filesystem() { - test_dir="/tmp/disk_stress_ng_$(uuidgen)" - if [ -b $disk_device ] - then - echo "$disk_device is a block device" - - #Add a check for warnings - WARN=$(parted -s ${disk_device} print | grep "^Warning.*${disk}.*[Rr]ead-only" 2>&1) - if [[ $? == 0 ]] - then - echo "Warning found in parted output:" - echo $WARN - echo "Aborting Test" - exit 1 - fi - else - echo "$disk_device is not a block device! Aborting!" - exit 1 - fi - - find_largest_partition - - if [ -n "$largest_part" ] ; then - echo "Found largest partition: \"$largest_part\"" - # If largest partition is too small, just abort with a message - if [ $largest_size -lt 10000000000 ] ; then - echo "Warning: $largest_part is less than 10GiB in size" - echo "Disk is too small to test. Aborting test!" 
- exit 1 - fi - mount_point=$(df | grep "$largest_part " | tr -s " " | cut -d " " -f 6) - if [ "$mount_point" == "" ] && [ "$really_run" == "Y" ] ; then - disk_device=$(echo $disk_device | sed "s/\/dev\/\/dev/\/dev/g") - mount_point="/mnt$short_device" - echo "No partition is mounted from $disk_device; attempting to mount one...." - if [ ! -d $mount_point ] ; then - mkdir -p "$mount_point" - made_mountpoint="Y" - fi - mount "$largest_part" "$mount_point" - mounted_part="Y" - fi - if [ "$mount_point" == "/" ] ; then - test_dir="/tmp/disk_stress_ng_$(uuidgen)" - else - test_dir="$mount_point/tmp/disk_stress_ng_$(uuidgen)" - fi - echo "Test will use $largest_part, mounted at \"$mount_point\", using $largest_fs" - else - echo "There appears to be no partition with a suitable filesystem" - echo "on $disk_device; please create a suitable partition and re-run" - echo "this test." - if [ -n "unsupported_fs" ] ; then - echo "NOTE: A filesystem of type $unsupported_fs was found, but is not supported" - echo "by this test. A Linux-native filesystem (ext2/3/4fs, XFS, JFS, or Btrfs)" - echo "is required." - fi - exit 1 - fi -} # mount_filesystem() - - -# Run an individual stressor -# Input: -# $1 = stressor name (e.g., copyfile, dentry) -# $2 = run time -# Output: -# had_error -- sets to "1" if an error occurred -run_stressor() { - local runtime="$2" - # Multiply runtime by 5; will forcefully terminate if stress-ng - # fails to return in that time. - end_time=$((runtime*5)) - echo "Running stress-ng $1 stressor for $2 seconds...." - # Use "timeout" command to launch stress-ng, to catch it should it go into - # la-la land - timeout -s 14 $end_time stress-ng --aggressive --verify --timeout $runtime \ - --temp-path $test_dir --$1 0 --hdd-opts dsync --readahead-bytes 16M -k - return_code="$?" - echo "return_code is $return_code" - if [ "$return_code" != "0" ] ; then - # - # a small grace period to allow stressors to terminate - # - sleep 10 - # - # still running? 
aggressively kill all stressors - # - pids=$(pidof stress-ng) - if [ -n "$pids" ]; then - kill -9 $pids - sleep 1 - kill -9 $pids - pids=$(pidof stress-ng) - if [ -n "$pids" ]; then - echo "Note: stress-ng (PIDS $pids) could not be killed" - fi - fi - had_error=1 - echo "*****************************************************************" - if [ $return_code = "124" ] ; then - echo "** stress-ng $stressor test timed out and was forcefully " \ - "terminated! (Error $return_code)" - elif [ $return_code = "137" ] ; then - echo "** stress-ng $stressor test timed out and SIGKILL was used to " \ - "terminate the test case! (Error $return_code)" - else - echo "** Error $return_code reported on stressor $stressor!)" - fi - echo "*****************************************************************" - had_error=1 - result=$return_code - fi -} # run_stressor() - - -# -# Main program body.... -# - - -get_params "$@" -mount_filesystem -echo "test_dir is $test_dir" - -had_error=0 - -# Tests Colin said to try but that aren't present as of standard stress-ng -# in Ubuntu 16.04: -# -# "chown" "copyfile" "ioprio" "locka" "lockofd" "madvise" "msync" "seal" -# -# TODO: Consider adding these tests for Ubuntu 18.04, or ealier with an -# updated stress-ng in the certification PPA.... 
- -disk_stressors=("aio" "aiol" "chdir" "chmod" "dentry" "dir" "fallocate" \ - "fiemap" "filename" "flock" "fstat" "hdd" "lease" "lockf" \ - "mknod" "readahead" "seek" "sync-file" "xattr") - -total_runtime=$((${#disk_stressors[@]}*$base_time)) - -# -# Ensure we have emnough async I/O events available, scale it -# based on number of CPUs on the machine -# -if [ -e /proc/sys/fs/aio-max-nr ] ; then - aiomax=$((8192 * $(nproc))) - aionow=$(cat /proc/sys/fs/aio-max-nr) - if [ $aiomax -gt $aionow ] ; then - echo $aiomax > /proc/sys/fs/aio-max-nr - echo "Setting aio-max-nr to $aiomax" - fi -fi - -echo "Estimated total run time is $total_runtime seconds" -echo "" - -if [ "$really_run" == "Y" ] ; then - mkdir -p "$test_dir" - for stressor in ${disk_stressors[@]}; do - run_stressor $stressor $base_time - done - rm -rf "$test_dir" - if [ "$mounted_part" == "Y" ] ; then - umount "$mount_point" - if [ "$made_mountpoint" == "Y" ] ; then - rmdir "$mount_point" - fi - fi -else - echo "To actually run tests, pass the --really-run option." - echo "Script is now terminating...." - exit 1 -fi - -echo "*******************************************************************" -if [ $had_error = "0" ] ; then - echo "** stress-ng disk test passed!" -else - echo "** stress-ng disk test failed; most recent error was $result" -fi -echo "*******************************************************************" -exit $result diff --git a/bin/memory_stress_ng b/bin/memory_stress_ng deleted file mode 100755 index e717f12..0000000 --- a/bin/memory_stress_ng +++ /dev/null @@ -1,190 +0,0 @@ -#!/bin/bash - -# Script to perform memory stress tests -# -# Copyright (c) 2016 Canonical Ltd. -# -# Authors -# Rod Smith <rod.smith@canonical.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 3, -# as published by the Free Software Foundation. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. -# -# The purpose of this script is to run memory stress tests using the -# stress-ng program. It also happens to impose a heavy CPU load, but -# that's a side effect of the memory stressors, not their purpose. -# -# Usage: -# memory_stress_ng [ --base-time <time> ] [ --time-per-gig <time> ] -# -# Parameters: -# --base-time is the time in seconds to run each stressor. (The default -# is 300 seconds, or five minutes.) -# --time-per-gig is extra time given to SOME stressors, measured in a -# seconds per GiB way. (The default is 10 seconds per GiB.) -# -# There are a total of 22 constant-run-time stressors and 6 variable- -# run-time stressors. Given the defaults, this works out to a total -# expected default run time of 8400 seconds (145 minutes) plus 60 seconds -# per GiB of RAM -- so a system with 16 GiB should take 156 minutes; one -# with 32 GiB should take 172 minutes, and so on, using the default -# values. 
- - -get_params() { - base_time=300 - time_per_gig=10 - while [ $# -gt 0 ] ; do - case $1 in - --base-time) base_time="$2" - shift - ;; - --time-per-gig) time_per_gig="$2" - shift - ;; - *) echo "Usage: $0 [ --base-time <time> ] [ --time-per-gig <time> ]" - exit 1 - ;; - esac - shift - done - local extra_time=$(($time_per_gig * $total_mem_in_GiB)) - variable_time=$(($base_time + $extra_time )) -} # get_params() - - -# Run an individual stressor -# Input: -# $1 = stressor name (e.g., malloc, brk) -# $2 = run time -# Output: -# had_error -- sets to "1" if an error occurred -run_stressor() { - local runtime="$2" - # Double runtime; will forcefully terminate if stress-ng - # fails to return in that time. - end_time=$((runtime*2)) - echo "Running stress-ng $1 stressor for $2 seconds...." - logger -t "memory_stress_ng" "Running stress-ng $1 stressor for $2 seconds..." - # Use "timeout" command to launch stress-ng, to catch it should it go into la-la land - timeout -s 14 $end_time stress-ng -k --aggressive --verify --timeout $runtime --$1 0 - return_code="$?" - echo "return_code is $return_code" - if [ "$return_code" != "0" ] ; then - # - # a small grace period to allow stressors to terminate - # - sleep 10 - # - # still running? aggressively kill all stressors - # - pids=$(pidof stress-ng) - if [ -n "$pids" ]; then - kill -9 $pids - sleep 1 - kill -9 $pids - pids=$(pidof stress-ng) - if [ -n "$pids" ]; then - echo "Note: stress-ng (PIDS $pids) could not be killed" - fi - fi - had_error=1 - echo "*****************************************************************" - if [ $return_code = "124" ] ; then - echo "** stress-ng $stressor timed out and was forcefully " - "terminated! (Error $return_code)" - elif [ $return_code = "137" ] ; then - echo "** stress-ng memory test timed out and SIGKILL was used to " \ - "terminate the test case! 
(Error $return_code)" - else - echo "** Error $return_code reported on stressor $stressor!)" - fi - echo "*****************************************************************" - had_error=1 - result=$return_code - fi -} # run_stressor() - - -# -# Main program body.... -# - -swap_space=`cat /proc/meminfo | grep -i SwapTotal | tr -s " " | cut -f 2 -d " "` -if [ -z $swap_space ] || [ $swap_space = "0" ] ; then - echo "Swap space unavailable! Please activate swap space and re-run this test!" - exit 1 -fi - -# Total memory in KiB.... -total_mem_in_KiB=`cat /proc/meminfo | grep MemTotal | tr -s " " | cut -f 2 -d " "` -total_mem_in_GiB=$((($total_mem_in_KiB/1048576)+1)) -echo "Total memory is $total_mem_in_GiB GiB" - -get_params "$@" -echo "Constant run time is $base_time seconds per stressor" -echo "Variable run time is $variable_time seconds per stressor" - -had_error=0 - -command -v numactl >/dev/null 2>&1 -if [ $? == 0 ] ; then - numa_nodes=$(numactl --hardware | grep available | head -n 1 | cut -f 2 -d " ") -else - numa_nodes=1 -fi - -# NOTE: Specify stressors in two arrays rather than rely on stress-ng's -# --class memory,vm option for two reasons: -# 1. We want to run some stressors (those that exhaust all memory) -# for longer than others, so we need to specify different run -# times for different stressors. -# 2. stress-ng is constantly being updated with new tests. We don't -# want to run one set of tests on SUT 1 and a larger set of tests -# on SUT 2 if we happen to have updated stress-ng for some unrelated -# reason (like a bug fix); thus, we specify tests individually. - -# Constant-run-time stressors -- run them for the same length of time on all -# systems.... 
-crt_stressors=("bsearch" "context" "hsearch" "lsearch" "matrix" \ - "memcpy" "null" "pipe" "qsort" "stack" "str" "stream" \ - "tsearch" "vm-rw" "wcs" "zero" "mlock" "mmapfork" "mmapmany" \ - "mremap" "shm-sysv" "vm-splice") -if [ "$numa_nodes" -gt 1 ]; then - crt_stressors+=("numa") -fi -crt_runtime=$((${#crt_stressors[@]}*$base_time)) - -# Variable-run-time stressors -- run them longer on systems with more RAM.... -vrt_stressors=("malloc" "mincore" "vm" "bigheap" "brk" "mmap") -vrt_runtime=$((${#vrt_stressors[@]}*$variable_time)) - -total_runtime=$((($crt_runtime + $vrt_runtime) / 60)) -echo "Estimated total run time is $total_runtime minutes" -echo "" - -for stressor in ${crt_stressors[@]}; do - run_stressor $stressor $base_time -done - -for stressor in ${vrt_stressors[@]}; do - run_stressor $stressor $variable_time -done - -echo "*******************************************************************" -if [ $had_error = "0" ] ; then - echo "** stress-ng memory test passed!" -else - echo "** stress-ng memory test failed; most recent error was $result" -fi -echo "*******************************************************************" -exit $result diff --git a/bin/stress_ng_test b/bin/stress_ng_test new file mode 100755 index 0000000..ced8455 --- /dev/null +++ b/bin/stress_ng_test @@ -0,0 +1,594 @@ +#!/usr/bin/env python3 +""" +Copyright (C) 2020 Canonical Ltd. + +Authors + Rod Smith <rod.smith@canonical.com> + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License version 3, +as published by the Free Software Foundation. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +The purpose of this script is to run CPU, memory, and disk stress +tests using the stress-ng binary program. It replaces the older +cpu_stress, memory_stress_ng, and disk_stress_ng Bash scripts. +""" + + +from argparse import ( + ArgumentParser, + RawTextHelpFormatter +) +from subprocess import ( + CalledProcessError, + PIPE, + Popen, + STDOUT, + TimeoutExpired +) +import os +import psutil +import shlex +import shutil +import signal +import stat +import sys +import time +import uuid + +# 10GiB (smallest acceptable size for disk tests): +min_fs_size = 10 * 1024 * 1024 * 1024 +# Swap filename +my_swap = None + + +class stress_ng(): + """Interfaces with the external stress-ng binary -- accepts + test parameters, runs the test, and enables access to test + results.""" + + def __init__(self, + stressors=['str'], + wrapper_timeout=25, + sng_timeout=20, + test_dir="/tmp", + extra_options=""): + + self.stressors = stressors + self.wrapper_timeout = wrapper_timeout + self.sng_timeout = sng_timeout + self.test_dir = test_dir + self.extra_options = extra_options + self.results = "" + self.returncode = 0 + + def run(self): + """Run a stress-ng test, storing results in self.results.""" + + stressor_list = "--" + " 0 --".join(self.stressors) + command = "stress-ng --aggressive --verify --timeout {} {} {} 0". \ + format(int(self.sng_timeout), + self.extra_options, + stressor_list) + time_str = time.strftime("%d %b %H:%M", time.gmtime()) + if len(self.stressors) == 1: + print("{}: Running stress-ng {} stressor for {:.0f} seconds...". 
+ format(time_str, self.stressors[0], self.sng_timeout)) + else: + print("{}: Running multiple stress-ng ".format(time_str) + + "stressors in parallel for {:.0f}".format(self.sng_timeout)) + print("seconds...") + try: + run = Popen(shlex.split(command), stderr=STDOUT, stdout=PIPE) + local_results = run.communicate(timeout=self.wrapper_timeout)[0] + self.results = (local_results.decode(encoding="utf-8", + errors="ignore")) + self.returncode = run.returncode + if self.returncode != 0: + print("stress_ng exited with code {}".format(self.returncode)) + except CalledProcessError as err: + print("stress_ng exited with code {}".format(err.returncode)) + self.results = err.stdout + self.returncode = run.returncode + except TimeoutExpired: + print("stress_ng timed out!") + os.kill(run.pid, signal.SIGINT) + self.results = "" + # For consistency with old bash script & "timeout" wrapper... + self.returncode = 124 + except KeyboardInterrupt: + self.results = "" + self.returncode = 125 + return self.returncode + + def get_results(self): + return self.results + + def get_returncode(self): + return self.returncode + + +"""Define CPU-related functions...""" + + +def stress_cpu(args): + """Run stress-ng tests on CPUs.""" + + retval = 0 + stressors = ['bsearch', 'context', 'cpu', 'crypt', 'hsearch', 'longjmp', + 'lsearch', 'matrix', 'qsort', 'str', 'stream', 'tsearch', + 'vecmath', 'wcs'] + # Add 10% to runtime; will forcefully terminate if stress-ng + # fails to return in that time. + end_time = args.base_time * 11 / 10 + print("Estimated total run time is {:.0f} minutes\n". 
+ format(args.base_time/60)) + + test_object = stress_ng(stressors=stressors, + sng_timeout=args.base_time, + wrapper_timeout=end_time, + extra_options="--metrics-brief --tz --times") + retval = test_object.run() + print(test_object.get_results()) + return retval + + +"""Define memory-related functions...""" + + +def num_numa_nodes(): + """Return the number of NUMA nodes supported by the CPU.""" + + if shutil.which("numactl") is None: + return 1 + else: + command = "numactl --hardware" + numactl = Popen(shlex.split(command), stderr=STDOUT, stdout=PIPE) + local_results = numactl.communicate()[0].split() + # local_results[1] will sometimes hold the number of NUMA nodes; + # but "numactl --hardware" sometimes returns the error message + # "No NUMA available on this system", so if this (or some other) + # error message appears, assume one NUMA node.... + try: + num_nodes = int(local_results[1]) + except ValueError: + num_nodes = 1 + return num_nodes + + +def swap_space_ok(args): + """Check available swap space. If too small, add more. The minimum + acceptable mount is defined as the GREATER of the amount specified + by the command-line -s/--swap-space option OR the amount specified + by the STRESS_NG_MIN_SWAP_SIZE environment variable. Both values are + specified in gibibytes (GiB). If neither is specified, a value of 0 + (no swap required) is assumed. + Returns: + - True if OK (already or after adding more) + - False if insufficient swap space""" + + retval = 0 + all_ok = True + global my_swap + min_swap_space = 0 + if "STRESS_NG_MIN_SWAP_SIZE" in os.environ: + min_swap_space = int(os.environ['STRESS_NG_MIN_SWAP_SIZE']) \ + * 1024 * 1024 * 1024 + if args.swap_size > min_swap_space: + min_swap_space = args.swap_size * 1024 * 1024 * 1024 + print("Minimum swap space is set to {:.0f} GiB". + format(min_swap_space / 1024 / 1024 / 1024)) + swap = psutil.swap_memory() + if swap.total < min_swap_space: + print("Swap space too small! 
Attempting to add more (this may take " + + "a while)....") + my_swap = "/swap-{}.img".format(uuid.uuid1()) + # Create swap file 10KiB bigger than minimum because there's a 4KiB + # overhead in the file, so if it were exactly the minimum, it would + # still be too small.... + try: + with open(my_swap, "w+b") as f: + # Swap file zeroed out and increased in size in 1KiB chunks to + # avoid problems with sparse files and creating temporary RAM + # use that potentially exceeds available RAM.... + for i in range(int((min_swap_space + 10240) / 1024)): + f.write(b"\x00" * 1024) + f.close() + except OSError: + print("Unable to create temporary swap file! Aborting test!") + f.close() + os.remove(my_swap) + all_ok = False + if all_ok: + os.chmod(my_swap, stat.S_IRUSR | stat.S_IWUSR) + cmd = "mkswap {}".format(my_swap) + Popen(shlex.split(cmd), stderr=STDOUT, stdout=PIPE).communicate()[0] + cmd = "swapon {}".format(my_swap) + Popen(shlex.split(cmd), stderr=STDOUT, stdout=PIPE).communicate()[0] + else: + retval = False + swap = psutil.swap_memory() + if swap.total < min_swap_space: + retval = False + else: + retval = True + return retval + + +def stress_memory(args): + """Run stress-ng tests on memory.""" + + retval = 0 + if not swap_space_ok(args): + return 130 + + ram = psutil.virtual_memory() + total_mem_in_gb = ram.total / 1073741824 + vrt = args.base_time + total_mem_in_gb * args.time_per_gig + print("Total memory is {:.1f} GiB".format(total_mem_in_gb)) + print("Constant run time is {} seconds per stressor". + format(args.base_time)) + print("Variable run time is {:.0f} seconds per stressor".format(vrt)) + print("Number of NUMA nodes is {}".format(num_numa_nodes())) + + # Constant-run-time stressors -- run them for the same length of time on + # all systems.... 
+    crt_stressors = ['bsearch', 'context', 'hsearch', 'lsearch', 'matrix',
+                     'memcpy', 'null', 'pipe', 'qsort', 'stack', 'str',
+                     'stream', 'tsearch', 'vm-rw', 'wcs', 'zero', 'mlock',
+                     'mmapfork', 'mmapmany', 'mremap', 'shm-sysv',
+                     'vm-splice']
+    if num_numa_nodes() > 1:
+        crt_stressors.append('numa')
+
+    # Variable-run-time stressors -- run longer on systems with more RAM....
+    vrt_stressors = ['malloc', 'mincore', 'vm', 'bigheap', 'brk', 'mmap']
+
+    est_runtime = len(crt_stressors) * args.base_time + \
+        len(vrt_stressors) * vrt
+    print("Estimated total run time is {:.0f} minutes\n".
+          format(est_runtime/60))
+    for stressor in crt_stressors:
+        test_object = stress_ng(stressors=stressor.split(),
+                                sng_timeout=args.base_time,
+                                wrapper_timeout=args.base_time*2)
+        retval = retval | test_object.run()
+        print(test_object.get_results())
+    for stressor in vrt_stressors:
+        test_object = stress_ng(stressors=stressor.split(), sng_timeout=vrt,
+                                wrapper_timeout=vrt*2)
+        retval = retval | test_object.run()
+        print(test_object.get_results())
+    if my_swap is not None and args.keep_swap is False:
+        print("Deleting temporary swap file....")
+        cmd = "swapoff {}".format(my_swap)
+        Popen(shlex.split(cmd), stderr=STDOUT, stdout=PIPE).communicate()[0]
+        os.remove(my_swap)
+    return retval
+
+
+"""Define disk-related functions..."""
+
+
+def get_partition_data(file):
+    """Get partition details (size & type) on /dev/{file} & return in
+    dictionary.
+
+    Arguments:
+      file -- device name relative to /dev (e.g. "sda1")
+    Returns a dictionary with the keys:
+      'name'    -- the device name that was passed in
+      'size'    -- size in bytes as reported by blockdev, or 0 if blockdev
+                   did not print a number
+      'fs_type' -- value of the TYPE field reported by blkid, or "" if
+                   blkid reported none"""
+
+    part_data = {'name': file, 'size': 0, 'fs_type': ""}
+
+    # Get size of device, in bytes....
+    command = "blockdev --getsize64 /dev/{}".format(file)
+    run = Popen(shlex.split(command), stderr=STDOUT, stdout=PIPE)
+    try:
+        part_data['size'] = int(run.communicate()[0])
+    except ValueError:
+        # stderr is merged into stdout, so a blockdev failure hands int()
+        # an error message. Leave the size at 0 so that the device is
+        # never selected, rather than crashing the whole run.
+        print("Unable to determine the size of /dev/{}; ignoring it.".
+              format(file))
+
+    # Get filesystem type....
+    command = "blkid /dev/{} -o export".format(file)
+    run = Popen(shlex.split(command), stderr=STDOUT, stdout=PIPE)
+    blkid_output = run.communicate()[0].decode(encoding="utf-8",
+                                               errors="ignore")
+    # The export format is one KEY=value pair per line. Compare the key
+    # exactly: a substring test for "TYPE" would also accept PTTYPE= and
+    # SEC_TYPE= fields, or a label that happens to contain "TYPE".
+    for line in blkid_output.splitlines():
+        key, _, value = line.partition("=")
+        if key == "TYPE":
+            part_data['fs_type'] = value
+    return part_data
+
+
+def find_mount_point(file):
+    """Find the mount point of /dev/{file}.
+    Returns:
+    * None if unmounted (or if df failed or printed nothing)
+    * The mount point (as a string) if it's mounted."""
+
+    mount_point = None
+    command = "df /dev/{} --output=target".format(file)
+    run = Popen(shlex.split(command), stderr=STDOUT, stdout=PIPE)
+    output = run.communicate()[0].decode(encoding="utf-8", errors="ignore")
+    # Work on whole lines, not whitespace-separated words, so that a mount
+    # point containing spaces is returned intact.
+    lines = [line.strip() for line in output.splitlines() if line.strip()]
+    # stderr is merged into stdout, so on failure the last line is an error
+    # message, not a path; trust the output only when df succeeded.
+    if run.returncode == 0 and lines:
+        potential_mount_point = lines[-1]
+        # If df is fed a non-mounted-partition, it returns "/dev" as the
+        # mount point, so ignore that....
+        if potential_mount_point != "/dev":
+            mount_point = potential_mount_point
+    return mount_point
+
+
+class disk():
+    """Interfaces to disk device, to check device status, find largest
+    partition, mount it, etc."""
+
+    def __init__(self, device=""):
+        """Record the target device and catalog its partitions.
+
+        Arguments:
+          device -- path of the disk to test (e.g. "/dev/sda")
+        Sets:
+          self.all_parts -- list of dictionaries from get_partition_data(),
+                            each with an added 'part_type' key
+                            ("partition" or "lv")
+          self.test_dir -- "/tmp" until mount_filesystem() replaces it"""
+        self.device = device
+        self.all_parts = []
+        self.unsupported_fs = None
+        self.test_dir = "/tmp"
+        lvm_detected = False
+        sys_block = "/sys/class/block"
+        # Find final element of device name; for instance "sda" for "/dev/sda"
+        stripped_devname = self.device.split("/")[-1]
+
+        # Do first pass to collect data on partitions & software RAID
+        # devices (which we treat like partitions)....
+        # Accept only the device itself and the entries that sysfs nests
+        # beneath it. A substring match would also collect unrelated disks
+        # ("sda" is a substring of "sdaa1"), and an empty device name would
+        # match every block device on the system.
+        candidates = os.listdir(sys_block) if stripped_devname else []
+        for file in candidates:
+            is_child = os.path.isdir(
+                os.path.join(sys_block, stripped_devname, file))
+            if file == stripped_devname or is_child:
+                part_data = get_partition_data(file)
+                part_data['part_type'] = "partition"
+                if part_data['fs_type'] == "LVM2_member":
+                    lvm_detected = True
+                self.all_parts.append(part_data)
+
+        # Do another pass to collect data on logical volumes, if any exist
+        # on the target device....
+        # NOTE: This code ignores where an LVM exists; it could span multiple
+        # disks, or be on one other than the one being tested. Canonical
+        # certification specifies use of partitions, not LVMs, so this code
+        # exists mainly for software development using development systems,
+        # not on servers actually being tested.
+        if lvm_detected:
+            for file in os.listdir(sys_block):
+                if "dm-" in file:
+                    part_data = get_partition_data(file)
+                    part_data['part_type'] = "lv"
+                    self.all_parts.append(part_data)
+
+    def is_block_device(self):
+        """Return True if self.device exists and is a block device;
+        otherwise print the reason and return False."""
+        try:
+            mode = os.stat(self.device).st_mode
+            if not stat.S_ISBLK(mode):
+                print("{} is NOT a block device! Aborting!".
+                      format(self.device))
+                return False
+        except FileNotFoundError:
+            print("{} does not exist! Aborting!".format(self.device))
+            return False
+        return True
+
+    def find_largest_partition(self):
+        """Find the largest partition that holds a supported filesystem on
+        self.device. Sets:
+        self.largest_part -- Dictionary containing information on largest
+                             partition
+        self.unsupported_fs -- None or contains information about largest
+                               unsupported filesystem (of certain known types)
+                               found on disk
+        Returns self.largest_part; its 'name' is "" if nothing suitable
+        was found."""
+
+        self.largest_part = {'name': "",
+                             'size': 0,
+                             'part_type': "lv",
+                             'fs_type': ""}
+        self.unsupported_fs = None
+
+        # A filesystem can be supported for the test; unsupported but worth
+        # noting in an error message; or unsupported and not worth noting.
+        # The first two categories are enumerated in lists....
+        supported_filesystems = ['ext2', 'ext3', 'ext4', 'xfs', 'jfs', 'btrfs']
+        unsupported_filesystems = ['ntfs', 'vfat', 'hfs', 'LVM2_member']
+
+        for part in self.all_parts:
+            new_sz = int(part['size'])
+            old_sz = int(self.largest_part['size'])
+            new_lv = part['part_type'] == "lv"
+            old_lv = self.largest_part['part_type'] == "lv"
+            # A candidate replaces the current choice if it is the first
+            # non-empty one, if it reaches the minimum size when the current
+            # one does not, if it is larger than a current logical volume,
+            # or if it is a larger plain partition.
+            if (new_sz > 0 and old_sz == 0) or \
+               (new_sz > min_fs_size and old_sz < min_fs_size) or \
+               (new_sz > min_fs_size and new_sz > old_sz and old_lv) or \
+               (new_sz > old_sz and not new_lv):
+                if part['fs_type'] in supported_filesystems:
+                    self.largest_part = part
+                elif part['fs_type'] in unsupported_filesystems:
+                    # Make note of it if it might be an old filesystem
+                    # that was not properly re-allocated....
+                    self.unsupported_fs = part
+        return self.largest_part
+
+    def mount_filesystem(self, simulate):
+        """Select the partition to test and make sure it is mounted.
+
+        Arguments:
+          simulate -- if True, only report what would be done; nothing is
+                      mounted and no test directory is created
+        Sets:
+          self.test_dir -- a new uniquely named directory under tmp/ on the
+                           mounted filesystem (not in simulate mode)
+        Returns True if testing can proceed, False if there is no suitable
+        partition, it is too small, or it could not be mounted."""
+        print("Disk device is {}".format(self.device))
+        target_part = self.find_largest_partition()
+        if target_part['name'] == "":
+            if self.unsupported_fs is not None:
+                print("A filesystem of type {} was found, but is not "
+                      "supported by this test.".
+                      format(self.unsupported_fs['fs_type']))
+                print("A Linux-native filesystem (ext2/3/4fs, XFS, JFS, or "
+                      "Btrfs) is required.")
+            else:
+                print("No suitable partition found!")
+            return False
+
+        if target_part['size'] < min_fs_size:
+            print("Warning: {} is less than {:.0f} GiB in size!".
+                  format(target_part['name'], min_fs_size/1024/1024/1024))
+            print("Disk is too small to test. Aborting test!")
+            return False
+
+        full_device = "/dev/{}".format(target_part['name'])
+        print("Testing partition {}".format(full_device))
+        mount_point = find_mount_point(target_part['name'])
+        if simulate:
+            print("Run with --simulate, so not mounting filesystems.")
+            print("If run without --simulate, would mount {} to {}".
+                  format(full_device, mount_point))
+            print("(if not already mounted).")
+            return True
+
+        if not mount_point:
+            mount_point = "/mnt/{}".format(target_part['name'])
+            print("Trying to mount {} to {}...".
+                  format(full_device, mount_point))
+            os.makedirs(mount_point, exist_ok=True)
+            command = "mount {} {}".format(full_device, mount_point)
+            run = Popen(shlex.split(command), stderr=STDOUT, stdout=PIPE)
+            output = run.communicate()[0].decode(encoding="utf-8",
+                                                 errors="ignore")
+            print(output)
+            if run.returncode != 0:
+                # Without this check the test directory would be created
+                # on whatever filesystem holds the empty mount point, and
+                # the wrong disk would be stressed.
+                print("Unable to mount {}! Aborting test!".
+                      format(full_device))
+                return False
+        else:
+            print("{} is already mounted at {}".
+                  format(full_device, mount_point))
+        self.test_dir = "{}/tmp/stress-ng-{}".format(mount_point,
+                                                     uuid.uuid1())
+        os.makedirs(self.test_dir, exist_ok=True)
+        return True
+
+
+def stress_disk(args):
+    """Run stress-ng tests on disk.
+
+    Arguments:
+      args -- parsed command-line options; reads device, base_time and
+              simulate, and prepends "/dev/" to device when needed
+    Returns:
+      0 if every stressor passed (or nothing was run because of --simulate),
+      131 if the device is not a block device,
+      132 if no usable filesystem could be found or mounted,
+      otherwise the bitwise OR of the stressors' return values."""
+
+    disk_stressors = ['aio', 'aiol', 'chdir', 'chmod', 'chown', 'dentry',
+                      'dir', 'fallocate', 'fiemap', 'filename', 'flock',
+                      'fstat', 'hdd', 'ioprio', 'lease', 'locka', 'lockf',
+                      'lockofd', 'madvise', 'mknod', 'msync', 'readahead',
+                      'seal', 'seek', 'sync-file', 'xattr']
+
+    retval = 0
+    if "/dev" not in args.device and args.device != "":
+        args.device = "/dev/" + args.device
+
+    test_disk = disk(args.device)
+    if not test_disk.is_block_device():
+        return 131
+    if test_disk.mount_filesystem(args.simulate):
+        est_runtime = len(disk_stressors) * args.base_time
+        print("Using test directory: '{}'".format(test_disk.test_dir))
+        print("Estimated total run time is {:.0f} minutes\n".
+              format(est_runtime/60))
+        if not args.simulate:
+            # The options are the same for every stressor, so build them
+            # once outside the loop.
+            disk_options = "--temp-path {} ".format(test_disk.test_dir) + \
+                           "--hdd-opts dsync --readahead-bytes 16M -k"
+            for stressor in disk_stressors:
+                test_object = stress_ng(stressors=stressor.split(),
+                                        sng_timeout=args.base_time,
+                                        wrapper_timeout=args.base_time*5,
+                                        extra_options=disk_options)
+                retval = retval | test_object.run()
+                print(test_object.get_results())
+        # Never remove the "/tmp" default; only a directory that
+        # mount_filesystem() created is cleaned up.
+        if test_disk.test_dir != "/tmp" and not args.simulate:
+            shutil.rmtree(test_disk.test_dir, ignore_errors=True)
+    else:
+        retval = 132
+
+    return retval
+
+
+"""Main program body..."""
+
+
+def main():
+    """Run a stress_ng-based stress run.
+
+    Parses the "cpu", "memory" or "disk" subcommand and its options, checks
+    that stress-ng is installed and that the program runs as root, runs the
+    selected test and prints a summary of the result.
+
+    Returns 128 if stress-ng is not installed, 129 if not run as root, 1 if
+    no subcommand was given, otherwise the value returned by the selected
+    test function."""
+
+    parser = ArgumentParser(
+        description="Run tests based on stress-ng",
+        formatter_class=RawTextHelpFormatter)
+    subparsers = parser.add_subparsers()
+
+    # Main cli options
+    cpu_parser = subparsers.add_parser('cpu', help=("Run CPU tests"))
+    memory_parser = subparsers.add_parser('memory', help=("Run memory tests"))
+    disk_parser = subparsers.add_parser('disk', help=("Run disk tests"))
+
+    # Sub test options
+    # action = test_parser.add_mutually_exclusive_group()
+
+    # CPU parameters
+    cpu_parser.add_argument("-b", "--base-time", type=int, default=7200,
+                            help="Run time, in seconds (default=7200)")
+
+    # Memory parameters
+    memory_parser.add_argument("-b", "--base-time", type=int,
+                               help="Base time for each test, in seconds " +
+                               "(default=300)", default=300)
+    memory_parser.add_argument("-t", "--time-per-gig", type=int,
+                               help="Extra time per GiB for some stressors " +
+                               "(default=10)", default=10)
+    memory_parser.add_argument("-s", "--swap-size", type=int,
+                               help="swap size in GiB", default=0)
+    memory_parser.add_argument("-k", "--keep-swap", action="store_true",
+                               help="Keep swap file, if added by test")
+
+    # Disk parameters
+    disk_parser.add_argument("-d", "--device", type=str, required=True,
+                             help="Disk device (/dev/sda, etc.)")
+    disk_parser.add_argument("-b", "--base-time", type=int,
+                             help="Time for each test, in seconds " +
+                             "(default=240)", default=240)
+    disk_parser.add_argument("-s", "--simulate", action="store_true",
+                             help="Report disk info, but don't run tests")
+
+    # Each subcommand records the function that implements it; main()
+    # dispatches through args.func below.
+    cpu_parser.set_defaults(func=stress_cpu)
+    memory_parser.set_defaults(func=stress_memory)
+    disk_parser.set_defaults(func=stress_disk)
+
+    args = parser.parse_args()
+
+    # logging.basicConfig(level=logging.INFO)
+
+    if shutil.which("stress-ng") is None:
+        print("The stress-ng utility is not installed; exiting!")
+        return(128)
+    if not os.geteuid() == 0:
+        print("This program must be run as root (or via sudo); exiting!")
+        return(129)
+
+    retval = 1
+    # 'func' is absent when the program is run without a subcommand.
+    if 'func' not in args:
+        parser.print_help()
+    else:
+        retval = args.func(args)
+    print("**************************************************************")
+    if retval == 0:
+        print("* stress-ng test passed!")
+    elif retval == 124:  # Terminated by Python timeout
+        print("** stress-ng test timed out and was forcefully ")
+        print(" terminated (Error {})".format(retval))
+    elif retval == 125:  # Terminated by SIGINT
+        print("** stress-ng test timed out and SIGINT (Ctrl+C) " +
+              "was used to terminate")
+        print(" the test (Error {})!".format(retval))
+    elif retval == 130:  # Insufficient swap space for memory test
+        print("** Swap space unavailable! Please activate swap space " +
+              "and re-run this test!")
+        print(" (Error {})".format(retval))
+    elif retval == 131:  # Alleged disk device is not a device file
+        print("** {} is not a block device! Aborting!".format(args.device))
+        print(" (Error {})".format(retval))
+    elif retval == 132:  # Unable to find a partition for disk test
+        print("** Unable to find a suitable partition! Aborting!")
+        print(" (Error {})".format(retval))
+    elif retval == 137:  # Terminated by SIGKILL
+        print("** stress-ng test timed out and SIGKILL was used to ")
+        print(" terminate the test (Error {})!".format(retval))
+    else:
+        print("stress-ng test failed with return code: {}".format(retval))
+    print("**************************************************************")
+
+    return(retval)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/units/disk/jobs.pxu b/units/disk/jobs.pxu
index 3e0f568..431cb2e 100644
--- a/units/disk/jobs.pxu
+++ b/units/disk/jobs.pxu
@@ -94,10 +94,10 @@ command:
  if [ -n "$STRESS_NG_DISK_TIME" ]
  then
      echo "Found STRESS_NG_DISK_TIME env var, stress_ng disk running time is now: $STRESS_NG_DISK_TIME seconds"
-     disk_stress_ng {name} --base-time $STRESS_NG_DISK_TIME --really-run
+     stress_ng_test disk --device {name} --base-time $STRESS_NG_DISK_TIME
  else
      echo "STRESS_NG_DISK_TIME env var is not found, stress_ng disk running time is default value"
-     disk_stress_ng {name} --base-time 240 --really-run
+     stress_ng_test disk --device {name} --base-time 240
  fi
 
 unit: template
diff --git a/units/memory/jobs.pxu b/units/memory/jobs.pxu
index 48d543c..5fad4f4 100644
--- a/units/memory/jobs.pxu
+++ b/units/memory/jobs.pxu
@@ -44,9 +44,10 @@ category_id: com.canonical.plainbox::memory
 id: memory/memory_stress_ng
 estimated_duration: 11000.0
 user: root
+environ: STRESS_NG_MIN_SWAP_SIZE
 requires:
  package.name == 'stress-ng' or executable.name == 'stress-ng'
-command: memory_stress_ng
+command: stress_ng_test memory
 _summary: Stress test of system memory
 _description:
  Test to perform some basic stress and exercise of system memory via the
diff --git a/units/stress/jobs.pxu b/units/stress/jobs.pxu
index cf44c6c..0a1ce27 100644
--- a/units/stress/jobs.pxu
+++ b/units/stress/jobs.pxu
@@ -22,10 +22,10 @@ command:
  if [ -n "$STRESS_NG_CPU_TIME" ]
  then
      echo "Found STRESS_NG_CPU_TIME env var, stress_ng cpu running time is now: $STRESS_NG_CPU_TIME seconds"
-     cpu_stress --runtime $STRESS_NG_CPU_TIME
+     stress_ng_test cpu --base-time $STRESS_NG_CPU_TIME
  else
      echo STRESS_NG_CPU_TIME env var is not found, stress_ng cpu running time is default value
-     cpu_stress --runtime 7200
+     stress_ng_test cpu --base-time 7200
  fi
 _summary: Stress of CPUs (very long runtime)
|