diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f394bd403..39944553b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,7 +61,7 @@ jobs: if: matrix.regression-tests with: path: tests/regression/test_data - key: test-data-llvm${{ matrix.llvm }}-${{ hashFiles('tests/regression/test_specs/*') }} + key: test-data-llvm${{ matrix.llvm }}-bc-${{ hashFiles('tests/regression/test_specs/*') }} # Download kernel sources: # - if test_data is cached, we only need the kernel for unit tests @@ -113,4 +113,4 @@ jobs: uses: actions/cache@v4 with: path: tests/regression/test_data - key: test-data-llvm${{ matrix.llvm }}-${{ hashFiles('tests/regression/test_specs/*') }} + key: test-data-llvm${{ matrix.llvm }}-bc-${{ hashFiles('tests/regression/test_specs/*') }} diff --git a/.gitignore b/.gitignore index 6bde2226b..c46f59067 100644 --- a/.gitignore +++ b/.gitignore @@ -105,7 +105,7 @@ cmake-build-debug/ .vscode/ # Temporary files from regression tests. Used for debugging purposes only. 
-*.ll +*.bc *.smt2 *.pdf diff --git a/diffkemp/building/cc_wrapper/cc_wrapper.py b/diffkemp/building/cc_wrapper/cc_wrapper.py index 2eecb9edb..ef8b88248 100755 --- a/diffkemp/building/cc_wrapper/cc_wrapper.py +++ b/diffkemp/building/cc_wrapper/cc_wrapper.py @@ -134,15 +134,15 @@ def wrapper(argv): contains_source = contains_source or is_source_file if index > 1 and argv[index - 1] == "-o": if is_object_file and not linking: - # Compiling to object file: swap .o with .ll - arg = arg.rsplit(".", 1)[0] + ".ll" + # Compiling to object file: swap .o with .bc + arg = arg.rsplit(".", 1)[0] + ".bc" if not is_object_file and linking: - # Linking: add a .llw suffix (LLVM IR whole) - arg = arg + ".llw" + # Linking: add a .bcw suffix (LLVM IR whole) + arg = arg + ".bcw" output_file = arg elif is_object_file and linking: - # Input to linking phase: change suffix to .ll - arg = arg.rsplit(".", 1)[0] + ".ll" + # Input to linking phase: change suffix to .bc + arg = arg.rsplit(".", 1)[0] + ".bc" clang = llvm_link elif is_source_file and linking: # Mark as linking with sources to detect hybrid mode @@ -153,16 +153,16 @@ def wrapper(argv): # Compile/link mode with object files detected # Drop object files and revert to normal compile/link mode clang = old_clang - clang_argv = [arg for arg in clang_argv if not arg.endswith(".ll")] + clang_argv = [arg for arg in clang_argv if not arg.endswith(".bc")] - # Do not continue if output is not .ll or .llw + # Do not continue if output is not .bc or .bcw # Note: this means that this is neither compilation nor linking - if (output_file is not None and not output_file.endswith(".ll") and - not output_file.endswith(".llw")): + if (output_file is not None and not output_file.endswith(".bc") and + not output_file.endswith(".bcw")): return 0 # Do not run clang on conftest files - if output_file in ["conftest.ll", "conftest.llw"] or "conftest.c" in argv: + if output_file in ["conftest.bc", "conftest.bcw"] or "conftest.c" in argv: return 0 # Not 
compiling C source file @@ -176,7 +176,7 @@ def wrapper(argv): elif not linking: # Compiling to default output file db.extend(["o:" + os.path.join(os.getcwd(), - arg.rsplit(".", 1)[0] + ".ll") + arg.rsplit(".", 1)[0] + ".bc") for arg in clang_argv if not arg.endswith(".c")]) # Analyze and modify parameters for clang (phase 2) @@ -192,7 +192,7 @@ def wrapper(argv): else: # Keep only arguments with input files (and llvm-link itself) clang_argv = [arg for arg in clang_argv if arg == clang or - arg.endswith(".ll") or arg.endswith(".llw") or + arg.endswith(".bc") or arg.endswith(".bcw") or arg == "-o"] # Remove non-existent files # Note: these might have been e.g. generated from assembly diff --git a/diffkemp/llvm_ir/compiler.py b/diffkemp/llvm_ir/compiler.py index 8e351d271..e67e35a85 100644 --- a/diffkemp/llvm_ir/compiler.py +++ b/diffkemp/llvm_ir/compiler.py @@ -8,7 +8,7 @@ def get_clang_default_options(default_optim=True): """Returns clang options for compiling c files to LLVM IR. :param default_optim: By default adds also optimization flags.""" - opts = ["-S", "-emit-llvm", "-g", "-fdebug-macro", "-Wno-format-security"] + opts = ["-c", "-emit-llvm", "-g", "-fdebug-macro", "-Wno-format-security"] if default_optim: opts.extend(["-O1", "-Xclang", "-disable-llvm-passes"]) return opts diff --git a/diffkemp/llvm_ir/kernel_llvm_source_builder.py b/diffkemp/llvm_ir/kernel_llvm_source_builder.py index 6999ff751..7ff1099cb 100644 --- a/diffkemp/llvm_ir/kernel_llvm_source_builder.py +++ b/diffkemp/llvm_ir/kernel_llvm_source_builder.py @@ -70,7 +70,7 @@ def find_llvm_with_symbol_def(self, symbol): llvm_filename = self._build_source_to_llvm(source_path) if os.path.isfile(llvm_filename): mod = LlvmModule(llvm_filename) - if mod.has_function(symbol) or mod.has_global(symbol): + if mod.has_definition(symbol): break except BuildException: pass @@ -395,9 +395,9 @@ def _gcc_to_llvm(gcc_command): if param.startswith('-D"DEBUG_HASH2='): param = '-D"DEBUG_HASH2=1"' - # Output name is 
given by replacing .c by .ll in source name + # Output name is given by replacing .c by .bc in source name if param.endswith(".c"): - output_file = "{}.ll".format(param[:-2]) + output_file = "{}.bc".format(param[:-2]) command.append(KernelLlvmSourceBuilder._strip_bash_quotes(param)) if output_file is None: @@ -414,10 +414,10 @@ def _ld_to_llvm(ld_command): :param ld_command: Command to convert :return Corresponding llvm-link command. """ - command = ["llvm-link", "-S"] + command = ["llvm-link"] for param in ld_command.split(): if param.endswith(".o"): - command.append("{}.ll".format(param[:-2])) + command.append("{}.bc".format(param[:-2])) elif param == "-o": command.append(param) return command @@ -486,8 +486,12 @@ def _get_build_object(command): @staticmethod def _get_build_source(command): - """Get name of the object file built by the command.""" - return command[command.index("-c") + 1] + """ + Get name of the source file built by the command. + The commands start with 'clang -c -emit-llvm' to avoid linking + and the source file follows '-c' later in the command. 
+ """ + return command[command.index("-c", 3) + 1] def _kbuild_object_command(self, object_file): """ @@ -568,7 +572,7 @@ def _build_source_to_llvm(self, source_file): :param source_file: C source to build :return: Created LLVM IR file """ - llvm_file = "{}.ll".format(source_file[:-2]) + llvm_file = "{}.bc".format(source_file[:-2]) if (not os.path.isfile(llvm_file) or os.path.getmtime(llvm_file) < os.path.getmtime(source_file)): cwd = os.getcwd() @@ -627,7 +631,7 @@ def _build_kernel_mod_to_llvm(self, mod_dir, mod_name): obj = self._get_build_object(c) if not os.path.isfile(obj) or built: check_call(c, stderr=stderr) - llvm_file = os.path.join(mod_dir, "{}.ll".format(file_name)) + llvm_file = os.path.join(mod_dir, "{}.bc".format(file_name)) opt_llvm(llvm_file) return llvm_file except CalledProcessError: diff --git a/diffkemp/llvm_ir/llvm_module.py b/diffkemp/llvm_ir/llvm_module.py index 01d0b45f4..bc24b7819 100644 --- a/diffkemp/llvm_ir/llvm_module.py +++ b/diffkemp/llvm_ir/llvm_module.py @@ -9,7 +9,9 @@ import os import re import shutil -from subprocess import check_call, CalledProcessError +from subprocess import check_call, check_output +from subprocess import CalledProcessError, PIPE, run + # Set of standard functions that are supported, so they should not be # included in function collecting. 
@@ -76,10 +78,10 @@ def link_modules(self, modules): return False if "-linked" not in self.llvm: - new_llvm = "{}-linked.ll".format(self.llvm[:-3]) + new_llvm = "{}-linked.bc".format(self.llvm[:-3]) else: new_llvm = self.llvm - link_command = ["llvm-link", "-S", self.llvm] + link_command = ["llvm-link", self.llvm] link_command.extend([m.llvm for m in link_llvm_modules]) link_command.extend(["-o", new_llvm]) opt_command = get_opt_command([("constmerge", "module")], new_llvm) @@ -112,17 +114,30 @@ def find_param_var(self, param): name = self.llvm_module.find_param_var(param) return LlvmParam(name, []) if name is not None else None + def module_has(self, symtype_pattern, symbol): + """ + Check if a module contains matches for a pattern. + Used in has_function, has_global and has_definition. + """ + command = ["llvm-nm", self.llvm] + source_dir = os.path.dirname(self.llvm) + nm_out = check_output(command, cwd=source_dir) + pattern = re.compile(rf"{symtype_pattern} {re.escape(symbol)}", + re.MULTILINE) + match = pattern.search(nm_out.decode()) + return match is not None + def has_function(self, fun): """Check if module contains a function definition.""" - pattern = re.compile(r"^define.*@{}\(".format(fun), flags=re.MULTILINE) - with open(self.llvm, "r") as llvm_file: - return pattern.search(llvm_file.read()) is not None + return self.module_has("[T|t]", fun) def has_global(self, glob): """Check if module contains a global variable with the given name.""" - pattern = re.compile(r"^@{}\s*=".format(glob), flags=re.MULTILINE) - with open(self.llvm, "r") as llvm_file: - return pattern.search(llvm_file.read()) is not None + return self.module_has("[DdBbCU]", glob) + + def has_definition(self, symbol): + """Check if module contains a given symbol definition.""" + return self.module_has("[DdBbCTt]", symbol) def is_declaration(self, fun): """ @@ -156,16 +171,21 @@ def move_to_other_root_dir(self, old_root, new_root): if self.llvm.startswith(old_root): dest_llvm = 
os.path.join(new_root, os.path.relpath(self.llvm, old_root)) - # Copy the .ll file and replace all occurrences of the old root by - # the new root. There are usually in debug info. - with open(self.llvm, "r") as llvm: - with open(dest_llvm, "w") as llvm_new: - for line in llvm.readlines(): - if "constant" not in line: - llvm_new.write(line.replace(old_root.strip("/"), - new_root.strip("/"))) - else: - llvm_new.write(line) + # Copy the .bc file and replace all occurrences of the old root by + # the new root. These are usually in debug info. Use textual + # LLVM IR to find the paths. + command = ["llvm-dis", self.llvm, "-o", "-"] + source_dir = os.path.dirname(self.llvm) + output = check_output(command, cwd=source_dir).decode() + new_lines = [] + for line in output.splitlines(): + if "constant" not in line: + new_lines.append(line.replace(old_root.strip("/"), + new_root.strip("/"))) + else: + new_lines.append(line) + run(["llvm-as", "-o", dest_llvm], input="\n".join(new_lines), + stdout=PIPE, stderr=PIPE, text=True, check=True) self.llvm = dest_llvm if self.source and self.source.startswith(old_root): @@ -178,18 +198,42 @@ def move_to_other_root_dir(self, old_root, new_root): def get_included_sources(self): """ Get the list of source files that this module includes. - Requires debugging information. + Sources are extracted from the llvm-bcanalyzer output (with --dump). + They are located in the first METADATA_BLOCK as STRINGS. + The first string is the file name, second is project directory, + the includes follow. """ - # Search for all .h files mentioned in the debug info. 
- pattern = re.compile(r"filename:\s*\"([^\"]*)\", " - r"directory:\s*\"([^\"]*)\"") + source_dir = ''.join(os.path.split(self.llvm)[0]) + command = ["llvm-bcanalyzer", self.llvm, "-dump"] + bc_out = check_output(command, cwd=source_dir) result = set() - with open(self.llvm, "r") as llvm: - for line in llvm.readlines(): - s = pattern.search(line) - if (s and (s.group(1).endswith(".h") or - s.group(1).endswith(".c"))): - result.add(os.path.join(s.group(2), s.group(1))) + in_metadata = False + in_strings = False + root_dir = "" + source_file = "" + for line in bc_out.decode().splitlines(): + line = line.strip() + if line.startswith("