Updating runtest.py to support printing coredump information (dotnet#21149)

* Updating runtest.py to support printing coredump information

* Adding some comments to the coredump collection methods in runtest.py
tannergooding committed Nov 26, 2018
1 parent 570bda3 commit 63cde8f
Showing 2 changed files with 224 additions and 2 deletions.
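For orientation before the diff, a minimal sketch of how the new switch is consumed. The add_argument call mirrors the patch below; the invocation and the surrounding flow are simplified and hypothetical, not part of the commit.

import argparse

# Sketch only: --limited_core_dumps is defined exactly as in the patch;
# everything else here is a simplified stand-in for runtest.py's real flow.
parser = argparse.ArgumentParser()
parser.add_argument("--limited_core_dumps", dest="limited_core_dumps", action="store_true", default=False)
args = parser.parse_args(["--limited_core_dumps"])  # hypothetical invocation

if args.limited_core_dumps:
    # In the patch, run_tests() calls setup_coredump_generation(host_os) before the
    # run and call_msbuild() calls inspect_and_delete_coredump_files(...) after it.
    print("limited core dumps enabled")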
223 changes: 221 additions & 2 deletions tests/runtest.py
@@ -41,6 +41,7 @@

import argparse
import datetime
import fnmatch
import json
import math
import os
@@ -126,6 +127,7 @@
parser.add_argument("--generate_layout_only", dest="generate_layout_only", action="store_true", default=False)
parser.add_argument("--analyze_results_only", dest="analyze_results_only", action="store_true", default=False)
parser.add_argument("--verbose", dest="verbose", action="store_true", default=False)
parser.add_argument("--limited_core_dumps", dest="limited_core_dumps", action="store_true", default=False)

# Only used on Unix
parser.add_argument("-test_native_bin_location", dest="test_native_bin_location", nargs='?', default=None)
@@ -137,6 +139,7 @@
g_verbose = False
gc_stress_c = False
gc_stress = False
coredump_pattern = ""
file_name_cache = defaultdict(lambda: None)

################################################################################
@@ -548,11 +551,13 @@ def get_environment(test_env=None):

def call_msbuild(coreclr_repo_location,
dotnetcli_location,
test_location,
host_os,
arch,
build_type,
is_illink=False,
sequential=False):
sequential=False,
limited_core_dumps=False):
""" Call msbuild to run the tests built.
Args:
@@ -591,6 +596,9 @@ def call_msbuild(coreclr_repo_location,
if is_illink:
command += ["/p:RunTestsViaIllink=true"]

if limited_core_dumps:
command += ["/p:LimitedCoreDumps=true"]

log_path = os.path.join(logs_dir, "TestRunResults_%s_%s_%s" % (host_os, arch, build_type))
build_log = log_path + ".log"
wrn_log = log_path + ".wrn"
@@ -622,6 +630,9 @@ def call_msbuild(coreclr_repo_location,
proc.kill()
sys.exit(1)

if limited_core_dumps:
inspect_and_delete_coredump_files(host_os, arch, test_location)

return proc.returncode

def running_in_ci():
@@ -705,6 +716,207 @@ def correct_line_endings(host_os, test_location, root=True):
with open(test_location, 'w') as file_handle:
file_handle.write(content)

def setup_coredump_generation(host_os):
""" Configures the environment so that the current process and any child
processes can generate coredumps.
Args:
host_os (String) : os
Notes:
This is only supported on OSX and Linux; it does nothing on Windows.
This will print a message if setting the rlimit fails but will otherwise
continue execution, as some systems will already be configured correctly
and it is not necessarily a failure to not collect coredumps.
"""
global coredump_pattern

if host_os == "OSX":
coredump_pattern = subprocess.check_output("sysctl -n kern.corefile", shell=True).rstrip()
elif host_os == "Linux":
with open("/proc/sys/kernel/core_pattern", "r") as f:
coredump_pattern = f.read().rstrip()
else:
print("CoreDump generation not enabled due to unsupported OS: %s" % host_os)
return

# resource is only available on Unix platforms
import resource

if coredump_pattern != "core" and coredump_pattern != "core.%P":
print("CoreDump generation not enabled due to unsupported coredump pattern: %s" % coredump_pattern)
return
else:
print("CoreDump pattern: %s" % coredump_pattern)

# We specify 'shell=True' as the command may otherwise fail (some systems will
# complain that the executable cannot be found in the current directory).
rlimit_core = subprocess.check_output("ulimit -c", shell=True).rstrip()

if rlimit_core != "unlimited":
try:
# This can fail on certain platforms. ARM64 in particular gives: "ValueError: not allowed to raise maximum limit"
resource.setrlimit(resource.RLIMIT_CORE, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))
except:
print("Failed to enable CoreDump generation. rlimit_core: %s" % rlimit_core)
return

rlimit_core = subprocess.check_output("ulimit -c", shell=True).rstrip()

if rlimit_core != "unlimited":
print("Failed to enable CoreDump generation. rlimit_core: %s" % rlimit_core)
return

print("CoreDump generation enabled")

if host_os == "Linux" and os.path.isfile("/proc/self/coredump_filter"):
# Include memory in private and shared file-backed mappings in the dump.
# This ensures that we can see disassembly from our shared libraries when
# inspecting the contents of the dump. See 'man core' for details.
with open("/proc/self/coredump_filter", "w") as f:
f.write("0x3F")

def print_info_from_coredump_file(host_os, arch, coredump_name, executable_name):
""" Prints information from the specified coredump to the console
Args:
host_os (String) : os
arch (String) : architecture
coredump_name (String) : name of the coredump to print
executable_name (String) : name of the executable that generated the coredump
Notes:
This is only supported on OSX and Linux; it does nothing on Windows.
This defaults to lldb on OSX and gdb on Linux.
For both lldb and gdb, it backtraces all threads. For gdb, it also prints local
information for every frame. This option is not available as a built-in for lldb.
"""
if not os.path.isfile(executable_name):
print("Not printing coredump due to missing executable: %s" % executable_name)
return

if not os.path.isfile(coredump_name):
print("Not printing coredump due to missing coredump: %s" % coredump_name)
return

command = ""

if host_os == "OSX":
command = "lldb -c %s -b -o 'bt all' -o 'disassemble -b -p'" % coredump_name
elif host_os == "Linux":
command = "gdb --batch -ex \"thread apply all bt full\" -ex \"disassemble /r $pc\" -ex \"quit\" %s %s" % (executable_name, coredump_name)
else:
print("Not printing coredump due to unsupported OS: %s" % host_os)
return

print("Printing info from coredump: %s" % coredump_name)

proc_failed = False

try:
sys.stdout.flush() # flush output before creating sub-process

# We specify 'shell=True' as the command may otherwise fail (some systems will
# complain that the executable cannot be found in the current directory).
proc = subprocess.Popen(command, shell=True)
proc.communicate()

if proc.returncode != 0:
proc_failed = True
except:
proc_failed = True

if proc_failed:
print("Failed to print coredump: %s" % coredump_name)

def preserve_coredump_file(coredump_name, root_storage_location="/tmp/coredumps_coreclr"):
""" Copies the specified coredump to a new randomly named temporary directory under
root_storage_location to ensure it is accessible after the workspace is cleaned.
Args:
coredump_name (String) : name of the coredump to preserve
root_storage_location (String) : the directory under which to copy coredump_name
Notes:
root_storage_location defaults to a folder under /tmp to ensure that it is cleaned
up on next reboot (or after the OS configured time elapses for the folder).
"""
if not os.path.exists(root_storage_location):
os.mkdir(root_storage_location)

# This creates a temporary directory under `root_storage_location` to ensure it doesn't
# conflict with any coredumps from past runs.
storage_location = tempfile.mkdtemp('', '', root_storage_location)

# Only preserve the dump if the directory is empty. Otherwise, do nothing.
# This is a way to prevent us from storing/uploading too many dumps.
if os.path.isfile(coredump_name) and not os.listdir(storage_location):
print("Copying coredump file %s to %s" % (coredump_name, storage_location))
shutil.copy2(coredump_name, storage_location)
# TODO: Support uploading to dumpling

def inspect_and_delete_coredump_file(host_os, arch, coredump_name):
""" Prints information from the specified coredump and creates a backup of it
Args:
host_os (String) : os
arch (String) : architecture
coredump_name (String) : name of the coredump to inspect
"""
print_info_from_coredump_file(host_os, arch, coredump_name, "%s/corerun" % os.environ["CORE_ROOT"])
preserve_coredump_file(coredump_name)
os.remove(coredump_name)

def inspect_and_delete_coredump_files(host_os, arch, test_location):
""" Finds all coredumps under test_location, prints some basic information about them
to the console, and creates a backup of the dumps for further investigation
Args:
host_os (String) : os
arch (String) : architecture
test_location (String) : the folder under which to search for coredumps
"""
# This function prints some basic information from core files found under
# test_location and deletes them immediately. Based on the state of the system, it may
# also upload a core file to the dumpling service.
# (see preserve_coredump_file).

# Depending on distro/configuration, the core files may either be named "core"
# or "core.<PID>" by default. We will read /proc/sys/kernel/core_uses_pid to
# determine which one it is.
# On OS X/macOS, we checked the kern.corefile value before enabling core dump
# generation, so we know it always includes the PID.
coredump_name_uses_pid=False

print("Looking for coredumps...")

if "%P" in coredump_pattern:
coredump_name_uses_pid=True
elif host_os == "Linux" and os.path.isfile("/proc/sys/kernel/core_uses_pid"):
with open("/proc/sys/kernel/core_uses_pid", "r") as f:
if f.read().rstrip() == "1":
coredump_name_uses_pid=True

filter_pattern = ""
regex_pattern = ""
matched_file_count = 0

if coredump_name_uses_pid:
filter_pattern = "core.*"
regex_pattern = "core.[0-9]+"
else:
filter_pattern = "core"
regex_pattern = "core"

for dir_path, dir_names, file_names in os.walk(test_location):
for file_name in fnmatch.filter(file_names, filter_pattern):
if re.match(regex_pattern, file_name):
print("Found coredump: %s in %s" % (file_name, dir_path))
matched_file_count += 1
inspect_and_delete_coredump_file(host_os, arch, os.path.join(dir_path, file_name))

print("Found %s coredumps." % matched_file_count)

def run_tests(host_os,
arch,
build_type,
@@ -719,7 +931,8 @@ def run_tests(host_os,
is_ilasm=False,
is_illink=False,
run_crossgen_tests=False,
run_sequential=False):
run_sequential=False,
limited_core_dumps=False):
""" Run the coreclr tests
Args:
@@ -766,6 +979,9 @@ def run_tests(host_os,
print("Running GCStress, extending timeout to 120 minutes.")
os.environ["__TestTimeout"] = str(120*60*1000) # 7,200,000 ms

if limited_core_dumps:
setup_coredump_generation(host_os)

# Set Core_Root
print("Setting CORE_ROOT=%s" % core_root)
os.environ["CORE_ROOT"] = core_root
@@ -823,10 +1039,12 @@ def run_tests(host_os,
# Call msbuild.
return call_msbuild(coreclr_repo_location,
dotnetcli_location,
test_location,
host_os,
arch,
build_type,
is_illink=is_illink,
limited_core_dumps=limited_core_dumps,
sequential=run_sequential)

def setup_args(args):
@@ -2044,6 +2262,7 @@ def do_setup(host_os,
is_gcsimulator=unprocessed_args.gcsimulator,
is_jitdasm=unprocessed_args.jitdisasm,
is_ilasm=unprocessed_args.ilasmroundtrip,
limited_core_dumps=unprocessed_args.limited_core_dumps,
run_sequential=unprocessed_args.sequential,
run_crossgen_tests=unprocessed_args.run_crossgen_tests,
test_env=test_env)
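As a standalone illustration of the dump-discovery walk added above: a hedged sketch that reuses the same core vs. core.<PID> decision and fnmatch/regex filtering as inspect_and_delete_coredump_files. The search path and pattern are placeholders, and the regex is anchored a little more strictly than in the patch.

import fnmatch
import os
import re

def find_coredumps(test_location, coredump_pattern, core_uses_pid=False):
    # Dump files are named either "core" or "core.<PID>" depending on the
    # configured core pattern / kernel.core_uses_pid setting (as in the patch).
    uses_pid = ("%P" in coredump_pattern) or core_uses_pid
    filter_pattern = "core.*" if uses_pid else "core"
    regex_pattern = r"^core\.[0-9]+$" if uses_pid else r"^core$"
    for dir_path, _, file_names in os.walk(test_location):
        for file_name in fnmatch.filter(file_names, filter_pattern):
            if re.match(regex_pattern, file_name):
                yield os.path.join(dir_path, file_name)

# Hypothetical usage; in the patch each match is handed to inspect_and_delete_coredump_file.
for dump in find_coredumps("/tmp/tests", "core.%P"):
    print("Found coredump:", dump)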
3 changes: 3 additions & 0 deletions tests/runtest.sh
@@ -554,6 +554,9 @@ if (($doCrossgen!=0)); then
runtestPyArguments+=("--precompile_core_root")
fi

if [ "$limitedCoreDumps" == "ON" ]; then
runtestPyArguments+=("--limited_core_dumps")
fi

# Default to python3 if it is installed
__Python=python
