Add stacktrace printing using llvm-dwarfdump

We call llvm-dwarfdump at build time from CMake (as a post-build step), parse its output, post-process it, and save the result in two files: a text file with the list of source filenames (lines.txt) and a binary file (lines.dat) which contains a list of addresses and the corresponding line numbers and filename indexes. LFortran then reads these files when printing a stacktrace. On macOS this makes stacktrace printing over 100x faster. There is no other dependency besides llvm-dwarfdump, which is part of LLVM.
suhanigarg29 · Mar 15, 2021 · 6ac0af0 · 6ac0af0
1 parent fa0417a
commit 6ac0af0
Show file tree

Hide file tree

Showing 6 changed files with 273 additions and 17 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -154,6 +154,8 @@ set(WITH_UNWIND no
     CACHE BOOL "Build with unwind support")
 set(WITH_BFD no
     CACHE BOOL "Build with BFD support")
+set(WITH_DWARFDUMP no
+    CACHE BOOL "Build with dwarfdump support")
 set(WITH_LINKH no
     CACHE BOOL "Build with link.h support")
 set(WITH_MACHO no
@@ -162,18 +164,24 @@ set(WITH_STACKTRACE no
     CACHE BOOL "Build with stacktrace support (requires binutils-dev)")
 if (WITH_STACKTRACE)
     set(WITH_UNWIND yes)
-    set(WITH_BFD yes)
     if (APPLE)
         set(WITH_MACHO yes)
+        if (NOT WITH_BFD)
+            set(WITH_DWARFDUMP yes)
+        endif()
     else()
         set(WITH_LINKH yes)
+        set(WITH_BFD yes)
     endif()
     set(HAVE_LFORTRAN_STACKTRACE yes)
 endif()
 if (WITH_BFD)
     find_package(BFD REQUIRED)
     set(HAVE_LFORTRAN_BFD yes)
 endif()
+if (WITH_DWARFDUMP)
+    set(HAVE_LFORTRAN_DWARFDUMP yes)
+endif()
 if (WITH_LINKH)
     find_package(LINKH REQUIRED)
     set(HAVE_LFORTRAN_LINK yes)
@@ -209,6 +217,7 @@ message("LFORTRAN_STATIC_BIN: ${LFORTRAN_STATIC_BIN}")
 message("WITH_STACKTRACE: ${WITH_STACKTRACE}")
 message("WITH_UNWIND: ${WITH_UNWIND}")
 message("WITH_BFD: ${WITH_BFD}")
+message("WITH_DWARFDUMP: ${WITH_DWARFDUMP}")
 message("WITH_LINKH: ${WITH_LINKH}")
 message("WITH_MACHO: ${WITH_MACHO}")
 message("HAVE_LFORTRAN_DEMANGLE: ${HAVE_LFORTRAN_DEMANGLE}")

diff --git a/src/bin/CMakeLists.txt b/src/bin/CMakeLists.txt
@@ -23,6 +23,18 @@ if (WITH_STACKTRACE AND APPLE AND CMAKE_CXX_COMPILER_ID MATCHES Clang)
         POST_BUILD
         COMMAND dsymutil lfortran
     )
+    if (WITH_DWARFDUMP)
+        add_custom_command(
+            TARGET lfortran
+            POST_BUILD
+            COMMAND llvm-dwarfdump --debug-line lfortran.dSYM > lfortran.dSYM/raw.txt
+        )
+        add_custom_command(
+            TARGET lfortran
+            POST_BUILD
+            COMMAND ./dwarf_convert.py lfortran.dSYM/raw.txt lfortran.dSYM/lines.txt lfortran.dSYM/lines.dat
+        )
+    endif()
 endif()
 
 # Ensure "Release" is not appended to the path on Windows:

diff --git a/src/bin/dwarf_convert.py b/src/bin/dwarf_convert.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+
+"""
+This script converts the output of dwarfdump into files that are easy to load
+into lfortran to look up filenames and line numbers for a given address.
+Here is how to use it:
+
+        cd src/bin
+        llvm-dwarfdump --debug-line lfortran.dSYM > lfortran.dSYM/symbols.txt
+        ./dwarf_convert.py lfortran.dSYM/symbols.txt lfortran.dSYM/lines.txt lfortran.dSYM/lines.dat
+
+This is meant to be executed at build time.
+
+A better solution would be to use the `dwarf` library directly from C++ and
+generate the same output directly. Here is the source code of llvm-dwarfdump:
+
+https://github.com/llvm/llvm-project/blob/91a6ad5ad887a16e361338303d4ff3d29dba5e10/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+
+We just have to do exactly what it does, but generate the output in the format
+of lines.txt and lines.dat
+
+"""
+
+from collections import namedtuple
+from glob import glob
+import os
+import re
+from struct import pack
+import sys
+
+DebugLines = namedtuple("DebugLines", ["lines"])
+DebugLine = namedtuple("DebugLine", ["include_directories", "file_names", "addresses"])
+IncludeDirectory = namedtuple("IncludeDirectory", ["id", "path"])
+FileName = namedtuple("FileName", ["id", "filename", "dir_idx"])
+
+ASRDebugLines = namedtuple("ASRDebugLines", ["filenames", "addresses"])
+
+class Parser:
+    """
+    Parser for the output generated by dwarfdump.
+
+    On macOS (both Intel and ARM based):
+
+        dwarfdump --debug-line src/bin/lfortran.dSYM > symbols.txt
+
+    Then parse it using:
+
+        p = Parser()
+        ast = p.parse_file("symbols.txt")
+    """
+
+    def parse_file(self, filename):
+        self.file = open(filename)
+        self.line = self.file.readline()
+        while not self.line.startswith(".debug_line contents:"):
+            self.line = self.file.readline()
+
+        self.line = self.file.readline()
+        lines = []
+        while self.line.startswith("debug_line"):
+            d = self.parse_debug_line()
+            lines.append(d)
+        return DebugLines(lines)
+
+    def parse_debug_line(self):
+        self.line = self.file.readline()
+        while not self.line.startswith("include_directories"):
+            self.line = self.file.readline()
+
+        include_directories = []
+        while self.line.startswith("include_directories"):
+            n, path = re.compile(r"include_directories\[[ ]*(\d+)\] = \"([^\"]+)\"").findall(self.line)[0]
+            n = int(n)
+            include_directories.append(IncludeDirectory(n, path))
+            self.line = self.file.readline()
+
+        file_names = []
+        while self.line.startswith("file_names"):
+            n = re.compile(r"file_names\[[ ]*(\d+)\]:").findall(self.line)[0]
+            n = int(n)
+
+            self.line = self.file.readline()
+            filename = re.compile(r"name: \"([^\"]+)\"").findall(self.line)[0]
+
+            self.line = self.file.readline()
+            dir_idx = re.compile(r"dir_index: (\d+)").findall(self.line)[0]
+            dir_idx = int(dir_idx)
+
+            self.line = self.file.readline()
+            self.line = self.file.readline()
+
+            file_names.append(FileName(n, filename, dir_idx))
+
+            self.line = self.file.readline()
+
+        self.line = self.file.readline()
+        self.line = self.file.readline()
+        self.line = self.file.readline()
+
+        addresses = []
+        while self.line.startswith("0x"):
+            address, line, column, file_id = self.line.split()[:4]
+            address = int(address, base=16)
+            line = int(line)
+            column = int(column)
+            file_id = int(file_id)
+            addresses.append([address, line, column, file_id])
+            self.line = self.file.readline()
+
+        self.line = self.file.readline()
+
+        d = DebugLine(include_directories, file_names, addresses)
+        return d
+
+def ast_to_asr(ast):
+    local_files = glob("../../**/*.cpp", recursive=True) + \
+                  glob("../../**/*.h", recursive=True)
+    for i in range(len(local_files)):
+        local_files[i] = os.path.abspath(local_files[i])
+    def make_abs(end_path):
+        if end_path[0] != "/":
+            for f in local_files:
+                if f.endswith(end_path):
+                    return f
+        return end_path
+    lines = []
+    last_address = -1
+    global_filename_id = 0
+    global_filenames = []
+    global_addresses = []
+    for line in ast.lines:
+        include_dirs = {}
+        for inc in line.include_directories:
+            include_dirs[inc.id] = inc.path
+        filenames = {}
+        for filename in line.file_names:
+            prefix = ""
+            if filename.dir_idx != 0:
+                prefix = include_dirs[filename.dir_idx] + "/"
+            filenames[filename.id] = global_filename_id
+            global_filenames.append(make_abs(prefix+filename.filename))
+            global_filename_id += 1
+        for address, line_num, column, file_id in line.addresses:
+            filename = global_filenames[filenames[file_id]]
+            assert last_address <= address
+            last_address = address
+            if line_num != 0:
+                global_addresses.append([address, line_num, filenames[file_id]])
+    return ASRDebugLines(global_filenames, global_addresses)
+
+
+p = Parser()
+ast = p.parse_file(sys.argv[1])
+asr = ast_to_asr(ast)
+with open(sys.argv[2], "w") as f:
+    f.write(str(len(asr.filenames)) + "\n")
+    for filename in asr.filenames:
+        f.write(filename + "\n")
+    f.write(str(len(asr.addresses)) + "\n")
+with open(sys.argv[3], "wb") as f:
+    for addr, line, fileid in asr.addresses:
+        f.write(pack("3Q", addr, line, fileid))
diff --git a/src/lfortran/config.h.in b/src/lfortran/config.h.in
@@ -13,6 +13,7 @@
 /* Define if stacktrace is enabled */
 #cmakedefine HAVE_LFORTRAN_STACKTRACE
 #cmakedefine HAVE_LFORTRAN_BFD
+#cmakedefine HAVE_LFORTRAN_DWARFDUMP
 #cmakedefine HAVE_LFORTRAN_LINK
 #cmakedefine HAVE_LFORTRAN_MACHO
 #cmakedefine HAVE_LFORTRAN_UNWIND

diff --git a/src/lfortran/stacktrace.cpp b/src/lfortran/stacktrace.cpp
@@ -310,7 +310,7 @@ int load_symbol_table(bfd *abfd, line_data *data)
   return 0;
 }
 
-void get_symbol_info(std::string binary_filename, uintptr_t addr,
+void get_symbol_info_bfd(std::string binary_filename, uintptr_t addr,
   std::string &source_filename, std::string &function_name,
   int &line_number)
 {
@@ -438,15 +438,29 @@ std::string addr2str(const StacktraceItem &i)
         s << "  File unknown, absolute address: " << (void*) i.pc;
         s << color(style::reset);
     } else {
-        s << color(style::dim);
-        s << "  Binary file \"";
-        s << color(style::reset);
-        s << color(style::bold) << color(fg::magenta);
-        s << i.binary_filename;
-        s << color(fg::reset) << color(style::reset);
-        s << color(style::dim);
-        s << "\", local address: " << (void*) i.local_pc;
-        s << color(style::reset);
+        if (i.source_filename == "") {
+            s << color(style::dim);
+            s << "  Binary file \"";
+            s << color(style::reset);
+            s << color(style::bold) << color(fg::magenta);
+            s << i.binary_filename;
+            s << color(fg::reset) << color(style::reset);
+            s << color(style::dim);
+            s << "\", local address: " << (void*) i.local_pc;
+            s << color(style::reset);
+        } else {
+          // Nicely format the filename + line
+          s << color(style::dim) << "  File \"" << color(style::reset)
+            << color(style::bold) << color(fg::magenta) << i.source_filename
+            << color(fg::reset) << color(style::reset)
+            << color(style::dim) << "\", line " << i.line_number
+            << color(style::reset);
+          const std::string line_text = remove_leading_whitespace(
+            read_line_from_file(i.source_filename, i.line_number));
+          if (line_text != "") {
+            s << "\n    " << line_text;
+          }
+        }
     }
   } else if (i.source_filename == "") {
       // The file is unknown (and data.line == 0 in this case), so the
@@ -542,18 +556,76 @@ void get_local_addresses(std::vector<StacktraceItem> &d)
   }
 }
 
+void address_to_line_number(const std::vector<std::string> &filenames,
+          const std::vector<uint64_t> &addresses,
+          uintptr_t address,
+          std::string &filename,
+          int &line_number) {
+    int n = addresses.size() / 3;
+    // TODO: Using a bisection would be a lot faster: O(log(n)) instead of O(n)
+    for (int i=0; i < n; i++) {
+      uint64_t addr, line, fileid;
+      addr = addresses[3*i+0];
+      line = addresses[3*i+1];
+      fileid = addresses[3*i+2];
+      if (addr > (address-8)) {
+        filename = filenames[fileid];
+        line_number = line;
+        return;
+      }
+    }
+    filename = "";
+    line_number = -1;
+}
+
+void get_local_info_dwarfdump(std::vector<StacktraceItem> &d)
+{
+  std::vector<std::string> filenames;
+  std::vector<uint64_t> addresses;
+  {
+    std::string filename = binary_executable_path + ".dSYM/lines.txt";
+    std::ifstream in;
+    in.open(filename);
+    if (!in.is_open()) {
+        return;
+    }
+    std::string s;
+    std::getline(in, s);
+    int n = std::stoi(s);
+    for (int i=0; i < n; i++) {
+      std::getline(in, s);
+      filenames.push_back(s);
+    }
+    std::getline(in, s);
+    n = std::stoi(s);
+
+    filename = binary_executable_path + ".dSYM/lines.dat";
+    std::ifstream in2;
+    in2.open(filename, std::ios::binary);
+    addresses.resize(3*n);
+    in2.read((char*)&addresses[0], 3*n*sizeof(uint64_t));
+  }
+  for (size_t i=0; i < d.size(); i++) {
+    address_to_line_number(filenames, addresses, d[i].local_pc,
+      d[i].source_filename, d[i].line_number);
+  }
+}
 
 void get_local_info(std::vector<StacktraceItem> &d)
 {
-#ifdef HAVE_LFORTRAN_BFD
+#ifdef HAVE_LFORTRAN_DWARFDUMP
+  get_local_info_dwarfdump(d);
+#else
+#  ifdef HAVE_LFORTRAN_BFD
   bfd_init();
-#endif
+#  endif
   for (size_t i=0; i < d.size(); i++) {
-#ifdef HAVE_LFORTRAN_BFD
-    get_symbol_info(d[i].binary_filename, d[i].local_pc,
+#  ifdef HAVE_LFORTRAN_BFD
+    get_symbol_info_bfd(d[i].binary_filename, d[i].local_pc,
       d[i].source_filename, d[i].function_name, d[i].line_number);
-#endif
+#  endif
   }
+#endif
 }
 
 } // namespace LFortran
diff --git a/src/lfortran/stacktrace.h b/src/lfortran/stacktrace.h
@@ -32,7 +32,7 @@ struct StacktraceItem
   uintptr_t local_pc=0; // 0 if not found
   std::string binary_filename; // "" if not found
 
-  // Sometimes this is found, but the next two are not
+  // This can be found or not
   std::string function_name; // "" if not found
 
   // The following two are either both found, or not found