From 948243737c02839be4e01848e3e8fb7f0962acaf Mon Sep 17 00:00:00 2001 From: xtrnov01 Date: Tue, 3 Jun 2025 11:16:49 +0200 Subject: [PATCH 01/10] Adapt tests for bitcode files Test for non-existing function fixed --- tests/regression/mock_source_tree.py | 6 +++--- tests/testing_projects/make_based/Makefile | 2 +- tests/unit_tests/build_test.py | 19 +++++++++++-------- .../kernel_llvm_source_builder_test.py | 8 ++++---- tests/unit_tests/kernel_module_test.py | 15 +++++++++------ tests/unit_tests/kernel_source_tree_test.py | 4 ++-- tests/unit_tests/source_tree_test.py | 14 +++++++------- 7 files changed, 37 insertions(+), 31 deletions(-) diff --git a/tests/regression/mock_source_tree.py b/tests/regression/mock_source_tree.py index b650a3ae2..3f9e29d79 100644 --- a/tests/regression/mock_source_tree.py +++ b/tests/regression/mock_source_tree.py @@ -22,7 +22,7 @@ def __init__(self, source_dir, real_source_tree): self.real_source_tree = real_source_tree def get_module_for_symbol(self, symbol, created_before=None): - llvm_file = os.path.join(self.source_dir, "{}.ll".format(symbol)) + llvm_file = os.path.join(self.source_dir, "{}.bc".format(symbol)) src_file = os.path.join(self.source_dir, "{}.c".format(symbol)) if not os.path.exists(llvm_file) and self.real_source_tree is not None: @@ -35,7 +35,7 @@ def get_module_for_symbol(self, symbol, created_before=None): return LlvmModule(llvm_file, src_file) def get_kernel_module(self, mod_dir, mod_name): - llvm_file = os.path.join(self.source_dir, "{}.ll".format(mod_name)) + llvm_file = os.path.join(self.source_dir, "{}.bc".format(mod_name)) if not os.path.exists(llvm_file): assert self.real_source_tree is not None @@ -45,7 +45,7 @@ def get_kernel_module(self, mod_dir, mod_name): return LlvmModule(llvm_file) def get_sysctl_module(self, sysctl): - llvm_file = os.path.join(self.source_dir, "{}.ll".format(sysctl)) + llvm_file = os.path.join(self.source_dir, "{}.bc".format(sysctl)) table_file = os.path.join(self.source_dir, "table") 
if not os.path.exists(llvm_file): diff --git a/tests/testing_projects/make_based/Makefile b/tests/testing_projects/make_based/Makefile index 962343e78..1cb252688 100644 --- a/tests/testing_projects/make_based/Makefile +++ b/tests/testing_projects/make_based/Makefile @@ -23,4 +23,4 @@ file.so: file.o $(CC) $(CFLAGS) -o $@ $^ -shared clean: - rm -f *.o *.ll *.so + rm -f *.o *.ll *.so *.bc diff --git a/tests/unit_tests/build_test.py b/tests/unit_tests/build_test.py index abd0d0569..751b409f2 100644 --- a/tests/unit_tests/build_test.py +++ b/tests/unit_tests/build_test.py @@ -5,6 +5,8 @@ import pytest import re import yaml +from subprocess import Popen, PIPE + SINGLE_C_FILE = os.path.abspath("tests/testing_projects/make_based/file.c") MAKE_BASED_PROJECT_DIR = os.path.abspath("tests/testing_projects/make_based") @@ -41,11 +43,12 @@ def get_db_file_content(snapshot_dir): def get_llvm_fun_body(llvm_file, fun_name): """Returns body of fun_name function from llvm_file.""" - with open(llvm_file, "r") as file: - content = file.read() - match = re.search(r"define.*@" + re.escape(fun_name) + r"[ (][^}]*", - content, re.MULTILINE) - return match.group(0) + cat_out = Popen(["cat", llvm_file], stdout=PIPE, text=True) + dis_out = Popen(["llvm-dis"], stdin=cat_out.stdout, stdout=PIPE, text=True) + output, error = dis_out.communicate() + match = re.search(r"define.*@" + re.escape(fun_name) + r"[ (][^}]*", + output, re.MULTILINE) + return match.group(0) @pytest.mark.parametrize("source", @@ -60,11 +63,11 @@ def test_build_command(source, tmp_path): # Snapshot should contain source file, LLVM file and file with metadata. output_files = os.listdir(output_dir) assert "file.c" in output_files - assert "file.ll" in output_files + assert "file.bc" in output_files assert "snapshot.yaml" in output_files # Functions from the source file should be also in the llvm file. 
- llvm_file_path = os.path.join(output_dir, "file.ll") + llvm_file_path = os.path.join(output_dir, "file.bc") llvm_fun_list = get_functions_from_llvm([llvm_file_path]) assert "add" in llvm_fun_list assert "mul" in llvm_fun_list @@ -122,6 +125,6 @@ def test_build_no_opt_override(tmp_path): # With --no-opt-override the optimization level which is written # in Makefile should be used, because it is -O2 the `add` function # should not be called from the `mul` function` (should be "inlined"). - llvm_file_path = os.path.join(output_dir, "file.ll") + llvm_file_path = os.path.join(output_dir, "file.bc") body = get_llvm_fun_body(llvm_file_path, "mul") assert re.search(r"call.*@add", body) is None diff --git a/tests/unit_tests/kernel_llvm_source_builder_test.py b/tests/unit_tests/kernel_llvm_source_builder_test.py index 232025d46..bf079af7d 100644 --- a/tests/unit_tests/kernel_llvm_source_builder_test.py +++ b/tests/unit_tests/kernel_llvm_source_builder_test.py @@ -38,13 +38,13 @@ def test_find_llvm_with_symbol_def(builder): Test building LLVM module from a source containing a function definition. 
""" llvm_file = builder.find_llvm_with_symbol_def("__alloc_workqueue_key") - assert llvm_file == os.path.join(builder.source_dir, "kernel/workqueue.ll") + assert llvm_file == os.path.join(builder.source_dir, "kernel/workqueue.bc") def test_find_llvm_with_symbol_use(builder): """Test finding sources using a global variable.""" srcs = builder.find_llvm_with_symbol_use("net_ratelimit_state") - assert srcs == {os.path.join(builder.source_dir, "net/core/utils.ll")} + assert srcs == {os.path.join(builder.source_dir, "net/core/utils.bc")} def test_build_cscope_database(builder): @@ -97,7 +97,7 @@ def test_kbuild_module_commands(builder): def test_build_src_to_llvm(builder): """Building single object into LLVM.""" llvm_file = builder._build_source_to_llvm("sound/core/init.c") - assert (llvm_file == "sound/core/init.ll") + assert (llvm_file == "sound/core/init.bc") assert os.path.isfile(os.path.join(builder.source_dir, llvm_file)) @@ -110,7 +110,7 @@ def test_build_src_to_llvm_fail(builder): def test_build_mod_to_llvm(builder): """Test building kernel module into LLVM""" mod_file = os.path.join(builder.source_dir, - "drivers/firewire/firewire-sbp2.ll") + "drivers/firewire/firewire-sbp2.bc") if os.path.isfile(mod_file): os.unlink(mod_file) builder._build_kernel_mod_to_llvm("drivers/firewire", "firewire-sbp2") diff --git a/tests/unit_tests/kernel_module_test.py b/tests/unit_tests/kernel_module_test.py index 3f7857ccb..84964abc9 100644 --- a/tests/unit_tests/kernel_module_test.py +++ b/tests/unit_tests/kernel_module_test.py @@ -10,6 +10,7 @@ import pytest import shutil import tempfile +from subprocess import Popen, PIPE @pytest.fixture @@ -84,14 +85,16 @@ def test_move_to_other_root_dir(source): # Check that source (C and LLVM) files have been moved. 
mod.move_to_other_root_dir(os.path.abspath("kernel/linux-3.10.0-957.el7"), tmp) - assert os.path.isfile(os.path.join(tmp, "sound/core/init.ll")) + assert os.path.isfile(os.path.join(tmp, "sound/core/init.bc")) # Check that the llvm file does not contain the original directory. - assert mod.llvm == os.path.join(tmp, "sound/core/init.ll") - with open(mod.llvm, "r") as llvm: - for line in llvm.readlines(): - assert ("constant" in line or - "kernel/linux-3.10.0-957.el7" not in line) + assert mod.llvm == os.path.join(tmp, "sound/core/init.bc") + cat_out = Popen(["cat", mod.llvm], stdout=PIPE, text=True) + dis_out = Popen(["llvm-dis"], stdin=cat_out.stdout, stdout=PIPE, text=True) + output, error = dis_out.communicate() + for line in output.splitlines(): + assert ("constant" in line or + "kernel/linux-3.10.0-957.el7" not in line) shutil.rmtree(tmp) diff --git a/tests/unit_tests/kernel_source_tree_test.py b/tests/unit_tests/kernel_source_tree_test.py index 2677ac4e0..74da09746 100644 --- a/tests/unit_tests/kernel_source_tree_test.py +++ b/tests/unit_tests/kernel_source_tree_test.py @@ -18,10 +18,10 @@ def source(): @pytest.mark.parametrize("name, llvm_file, table", [ ("net.core.message_burst", - "net/core/sysctl_net_core.ll", + "net/core/sysctl_net_core.bc", "net_core_table"), ("kernel.usermodehelper.bset", - "kernel/kmod.ll", + "kernel/kmod.bc", "usermodehelper_table") ]) def test_get_sysctl_module(source, name, llvm_file, table): diff --git a/tests/unit_tests/source_tree_test.py b/tests/unit_tests/source_tree_test.py index 099f704fd..1ebd8f0ec 100644 --- a/tests/unit_tests/source_tree_test.py +++ b/tests/unit_tests/source_tree_test.py @@ -30,7 +30,7 @@ def test_get_module_for_symbol(source): """ mod = source.get_module_for_symbol("__alloc_workqueue_key") assert mod is not None - assert mod.llvm == os.path.join(source.source_dir, "kernel/workqueue.ll") + assert mod.llvm == os.path.join(source.source_dir, "kernel/workqueue.bc") assert 
mod.has_function("__alloc_workqueue_key") @@ -41,7 +41,7 @@ def test_get_module_for_symbol_built_after(source): The LLVM file should not be retrieved. """ before_time = datetime.datetime.now() - datetime.timedelta(minutes=1) - llvm_file = os.path.join(source.source_dir, "kernel/workqueue.ll") + llvm_file = os.path.join(source.source_dir, "kernel/workqueue.bc") # Temporarily change mtime of the LLVM IR file to now stat = os.stat(llvm_file) @@ -60,7 +60,7 @@ def test_get_module_for_symbol_built_after(source): def test_get_module_for_symbol_fail(source): """Test getting LLVM module for a non-existing function definition.""" with pytest.raises(SourceNotFoundException): - source.get_module_for_symbol("__get_user_2") + source.get_module_for_symbol("__get_user_3") def test_get_modules_using_symbol(source): @@ -71,8 +71,8 @@ def test_get_modules_using_symbol(source): assert len(mods) == 2 assert set([m.llvm for m in mods]) == \ set([os.path.join(source.source_dir, f) - for f in ["net/core/sock.ll", - "net/netfilter/ipvs/ip_vs_sync.ll"]]) + for f in ["net/core/sock.bc", + "net/netfilter/ipvs/ip_vs_sync.bc"]]) def test_copy_source_files(source): @@ -87,8 +87,8 @@ def test_copy_source_files(source): assert os.path.isdir(os.path.join(tmp_source.source_dir, d)) # Check that files were successfully copied. 
- for f in ["net/core/utils.c", "net/core/utils.ll", "kernel/workqueue.c", - "kernel/workqueue.ll", "include/linux/module.h", + for f in ["net/core/utils.c", "net/core/utils.bc", "kernel/workqueue.c", + "kernel/workqueue.bc", "include/linux/module.h", "include/linux/kernel.h"]: assert os.path.isfile(os.path.join(tmp_source.source_dir, f)) From 33f3e7e418bb1f8a253be474fcc57f4de3fb66b1 Mon Sep 17 00:00:00 2001 From: xtrnov01 Date: Tue, 3 Jun 2025 10:35:49 +0200 Subject: [PATCH 02/10] Use bitcode (.bc) files instead of .ll files in snapshots Added has_definition function --- diffkemp/building/cc_wrapper.py | 14 ++-- diffkemp/llvm_ir/compiler.py | 2 +- .../llvm_ir/kernel_llvm_source_builder.py | 14 ++-- diffkemp/llvm_ir/llvm_module.py | 79 +++++++++++++------ diffkemp/llvm_ir/single_c_builder.py | 2 +- diffkemp/utils.py | 16 ++-- 6 files changed, 78 insertions(+), 49 deletions(-) diff --git a/diffkemp/building/cc_wrapper.py b/diffkemp/building/cc_wrapper.py index 2eecb9edb..3348d52ef 100755 --- a/diffkemp/building/cc_wrapper.py +++ b/diffkemp/building/cc_wrapper.py @@ -135,14 +135,14 @@ def wrapper(argv): if index > 1 and argv[index - 1] == "-o": if is_object_file and not linking: # Compiling to object file: swap .o with .ll - arg = arg.rsplit(".", 1)[0] + ".ll" + arg = arg.rsplit(".", 1)[0] + ".bc" if not is_object_file and linking: # Linking: add a .llw suffix (LLVM IR whole) arg = arg + ".llw" output_file = arg elif is_object_file and linking: # Input to linking phase: change suffix to .ll - arg = arg.rsplit(".", 1)[0] + ".ll" + arg = arg.rsplit(".", 1)[0] + ".bc" clang = llvm_link elif is_source_file and linking: # Mark as linking with sources to detect hybrid mode @@ -153,16 +153,16 @@ def wrapper(argv): # Compile/link mode with object files detected # Drop object files and revert to normal compile/link mode clang = old_clang - clang_argv = [arg for arg in clang_argv if not arg.endswith(".ll")] + clang_argv = [arg for arg in clang_argv if not arg.endswith(".bc")] 
# Do not continue if output is not .ll or .llw # Note: this means that this is neither compilation nor linking - if (output_file is not None and not output_file.endswith(".ll") and + if (output_file is not None and not output_file.endswith(".bc") and not output_file.endswith(".llw")): return 0 # Do not run clang on conftest files - if output_file in ["conftest.ll", "conftest.llw"] or "conftest.c" in argv: + if output_file in ["conftest.bc", "conftest.llw"] or "conftest.c" in argv: return 0 # Not compiling C source file @@ -176,7 +176,7 @@ def wrapper(argv): elif not linking: # Compiling to default output file db.extend(["o:" + os.path.join(os.getcwd(), - arg.rsplit(".", 1)[0] + ".ll") + arg.rsplit(".", 1)[0] + ".bc") for arg in clang_argv if not arg.endswith(".c")]) # Analyze and modify parameters for clang (phase 2) @@ -192,7 +192,7 @@ def wrapper(argv): else: # Keep only arguments with input files (and llvm-link itself) clang_argv = [arg for arg in clang_argv if arg == clang or - arg.endswith(".ll") or arg.endswith(".llw") or + arg.endswith(".bc") or arg.endswith(".llw") or arg == "-o"] # Remove non-existent files # Note: these might have been e.g. generated from assembly diff --git a/diffkemp/llvm_ir/compiler.py b/diffkemp/llvm_ir/compiler.py index 8e351d271..e67e35a85 100644 --- a/diffkemp/llvm_ir/compiler.py +++ b/diffkemp/llvm_ir/compiler.py @@ -8,7 +8,7 @@ def get_clang_default_options(default_optim=True): """Returns clang options for compiling c files to LLVM IR. 
:param default_optim: By default adds also optimization flags.""" - opts = ["-S", "-emit-llvm", "-g", "-fdebug-macro", "-Wno-format-security"] + opts = ["-c", "-emit-llvm", "-g", "-fdebug-macro", "-Wno-format-security"] if default_optim: opts.extend(["-O1", "-Xclang", "-disable-llvm-passes"]) return opts diff --git a/diffkemp/llvm_ir/kernel_llvm_source_builder.py b/diffkemp/llvm_ir/kernel_llvm_source_builder.py index 6999ff751..66f50d7ee 100644 --- a/diffkemp/llvm_ir/kernel_llvm_source_builder.py +++ b/diffkemp/llvm_ir/kernel_llvm_source_builder.py @@ -70,7 +70,7 @@ def find_llvm_with_symbol_def(self, symbol): llvm_filename = self._build_source_to_llvm(source_path) if os.path.isfile(llvm_filename): mod = LlvmModule(llvm_filename) - if mod.has_function(symbol) or mod.has_global(symbol): + if mod.has_definition(symbol): break except BuildException: pass @@ -397,7 +397,7 @@ def _gcc_to_llvm(gcc_command): # Output name is given by replacing .c by .ll in source name if param.endswith(".c"): - output_file = "{}.ll".format(param[:-2]) + output_file = "{}.bc".format(param[:-2]) command.append(KernelLlvmSourceBuilder._strip_bash_quotes(param)) if output_file is None: @@ -414,10 +414,10 @@ def _ld_to_llvm(ld_command): :param ld_command: Command to convert :return Corresponding llvm-link command. 
""" - command = ["llvm-link", "-S"] + command = ["llvm-link"] for param in ld_command.split(): if param.endswith(".o"): - command.append("{}.ll".format(param[:-2])) + command.append("{}.bc".format(param[:-2])) elif param == "-o": command.append(param) return command @@ -487,7 +487,7 @@ def _get_build_object(command): @staticmethod def _get_build_source(command): """Get name of the object file built by the command.""" - return command[command.index("-c") + 1] + return command[-1] def _kbuild_object_command(self, object_file): """ @@ -568,7 +568,7 @@ def _build_source_to_llvm(self, source_file): :param source_file: C source to build :return: Created LLVM IR file """ - llvm_file = "{}.ll".format(source_file[:-2]) + llvm_file = "{}.bc".format(source_file[:-2]) if (not os.path.isfile(llvm_file) or os.path.getmtime(llvm_file) < os.path.getmtime(source_file)): cwd = os.getcwd() @@ -627,7 +627,7 @@ def _build_kernel_mod_to_llvm(self, mod_dir, mod_name): obj = self._get_build_object(c) if not os.path.isfile(obj) or built: check_call(c, stderr=stderr) - llvm_file = os.path.join(mod_dir, "{}.ll".format(file_name)) + llvm_file = os.path.join(mod_dir, "{}.bc".format(file_name)) opt_llvm(llvm_file) return llvm_file except CalledProcessError: diff --git a/diffkemp/llvm_ir/llvm_module.py b/diffkemp/llvm_ir/llvm_module.py index 01d0b45f4..8e1f570d7 100644 --- a/diffkemp/llvm_ir/llvm_module.py +++ b/diffkemp/llvm_ir/llvm_module.py @@ -9,7 +9,9 @@ import os import re import shutil -from subprocess import check_call, CalledProcessError +from subprocess import check_call, check_output +from subprocess import CalledProcessError, Popen, PIPE + # Set of standard functions that are supported, so they should not be # included in function collecting. 
@@ -76,10 +78,10 @@ def link_modules(self, modules): return False if "-linked" not in self.llvm: - new_llvm = "{}-linked.ll".format(self.llvm[:-3]) + new_llvm = "{}-linked.bc".format(self.llvm[:-3]) else: new_llvm = self.llvm - link_command = ["llvm-link", "-S", self.llvm] + link_command = ["llvm-link", self.llvm] link_command.extend([m.llvm for m in link_llvm_modules]) link_command.extend(["-o", new_llvm]) opt_command = get_opt_command([("constmerge", "module")], new_llvm) @@ -114,15 +116,30 @@ def find_param_var(self, param): def has_function(self, fun): """Check if module contains a function definition.""" - pattern = re.compile(r"^define.*@{}\(".format(fun), flags=re.MULTILINE) - with open(self.llvm, "r") as llvm_file: - return pattern.search(llvm_file.read()) is not None + command = ["llvm-nm", self.llvm] + source_dir = os.path.dirname(self.llvm) + nm_out = check_output(command, cwd=source_dir) + pattern = re.compile(rf"[T|t] {re.escape(fun)}$", re.MULTILINE) + match = pattern.search(nm_out.decode()) + return match is not None def has_global(self, glob): """Check if module contains a global variable with the given name.""" - pattern = re.compile(r"^@{}\s*=".format(glob), flags=re.MULTILINE) - with open(self.llvm, "r") as llvm_file: - return pattern.search(llvm_file.read()) is not None + command = ["llvm-nm", self.llvm] + source_dir = os.path.dirname(self.llvm) + nm_out = check_output(command, cwd=source_dir) + pattern = re.compile(rf"[DdBbCU] {re.escape(glob)}$", re.MULTILINE) + match = pattern.search(nm_out.decode()) + return match is not None + + def has_definition(self, symbol): + """Check if module contains a given symbol definition.""" + command = ["llvm-nm", self.llvm] + source_dir = os.path.dirname(self.llvm) + nm_out = check_output(command, cwd=source_dir) + pattern = re.compile(rf"[DdBbCTt] {re.escape(symbol)}$", re.MULTILINE) + match = pattern.search(nm_out.decode()) + return match is not None def is_declaration(self, fun): """ @@ -158,14 +175,21 @@ 
def move_to_other_root_dir(self, old_root, new_root): os.path.relpath(self.llvm, old_root)) # Copy the .ll file and replace all occurrences of the old root by # the new root. There are usually in debug info. - with open(self.llvm, "r") as llvm: - with open(dest_llvm, "w") as llvm_new: - for line in llvm.readlines(): - if "constant" not in line: - llvm_new.write(line.replace(old_root.strip("/"), - new_root.strip("/"))) - else: - llvm_new.write(line) + cat_out = Popen(["cat", self.llvm], stdout=PIPE, text=True) + dis_out = Popen(["llvm-dis"], stdin=cat_out.stdout, stdout=PIPE, + text=True) + output, _ = dis_out.communicate() + new_file = [] + for line in output.splitlines(): + if "constant" not in line: + new_file.append(line.replace(old_root.strip("/"), + new_root.strip("/"))) + else: + new_file.append(line) + as_out = Popen(["llvm-as", "-o", dest_llvm], stdin=PIPE, + stdout=PIPE, stderr=PIPE, text=True) + as_out.communicate(input="\n".join(new_file)) + output, _ = as_out.communicate() self.llvm = dest_llvm if self.source and self.source.startswith(old_root): @@ -180,16 +204,19 @@ def get_included_sources(self): Get the list of source files that this module includes. Requires debugging information. """ - # Search for all .h files mentioned in the debug info. 
- pattern = re.compile(r"filename:\s*\"([^\"]*)\", " - r"directory:\s*\"([^\"]*)\"") + source_dir = ''.join(os.path.split(self.llvm)[0]) + command = ["llvm-bcanalyzer", self.llvm, "-dump"] + source_dir = os.path.dirname(self.llvm) + bc_out = check_output(command, cwd=source_dir) + root_dir = re.search(r"^\s*'/.*'$", bc_out.decode(), + flags=re.MULTILINE) + root_dir = root_dir.group(0).replace(' ', '').replace("'", '') + matches = set(re.findall(r"^\s*'.*\.(?:h|c)'", bc_out.decode(), + flags=re.MULTILINE)) result = set() - with open(self.llvm, "r") as llvm: - for line in llvm.readlines(): - s = pattern.search(line) - if (s and (s.group(1).endswith(".h") or - s.group(1).endswith(".c"))): - result.add(os.path.join(s.group(2), s.group(1))) + for m in matches: + match = m.replace(' ', '').replace("'", '') + result.add(os.path.join(root_dir, match)) return result def get_functions_using_param(self, param): diff --git a/diffkemp/llvm_ir/single_c_builder.py b/diffkemp/llvm_ir/single_c_builder.py index 001ae1398..00d5efe44 100644 --- a/diffkemp/llvm_ir/single_c_builder.py +++ b/diffkemp/llvm_ir/single_c_builder.py @@ -22,7 +22,7 @@ def __init__(self, source_dir, c_file_name, clang="clang", :param default_optim: use default optimalisations flags and run LLVM IR simplification passes """ - llvm_file_name = os.path.splitext(c_file_name)[0] + ".ll" + llvm_file_name = os.path.splitext(c_file_name)[0] + ".bc" SingleLlvmFinder.__init__(self, source_dir, llvm_file_name) self.c_file_name = c_file_name diff --git a/diffkemp/utils.py b/diffkemp/utils.py index 2c193da2e..7c1e15af2 100644 --- a/diffkemp/utils.py +++ b/diffkemp/utils.py @@ -2,8 +2,9 @@ import subprocess import re import sys +from subprocess import check_output -LLVM_FUNCTION_REGEX = re.compile(r"^define.*@(\w+)\(", flags=re.MULTILINE) +LLVM_FUNCTION_REGEX = re.compile(r"^.{16} [T|t] (\w+)", flags=re.MULTILINE) def get_simpll_build_dir(): @@ -40,7 +41,7 @@ def get_opt_command(passes, llvm_file, overwrite=True): pass_names 
= map(lambda p: p[0], passes) opt_command.extend(map(lambda pass_name: f"-{pass_name}", pass_names)) if overwrite: - opt_command.extend(["-S", "-o", llvm_file]) + opt_command.extend(["-o", llvm_file]) return opt_command @@ -94,9 +95,10 @@ def get_functions_from_llvm(llvm_files): sys.stderr.write( f"Warning: llvm file '{llvm_filename}' does not exist\n") continue - with open(llvm_filename, 'r') as llvm_file: - llvm_file_content = llvm_file.read() - matches = LLVM_FUNCTION_REGEX.findall(llvm_file_content) - for match in matches: - functions[match] = llvm_filename + command = ["llvm-nm", llvm_filename] + source_dir = ''.join(os.path.split(llvm_filename)[0]) + nm_out = check_output(command, cwd=source_dir) + matches = LLVM_FUNCTION_REGEX.findall(nm_out.decode()) + for match in matches: + functions[match] = llvm_filename return functions From f610f11025ab1c557a301a8d145819a65eb49405 Mon Sep 17 00:00:00 2001 From: xtrnov01 Date: Thu, 10 Jul 2025 22:56:05 +0200 Subject: [PATCH 03/10] Remove unnecessary pipes, update regex, formatting --- diffkemp/building/cc_wrapper.py | 6 +-- .../llvm_ir/kernel_llvm_source_builder.py | 2 +- diffkemp/llvm_ir/llvm_module.py | 50 +++++++++---------- diffkemp/utils.py | 3 +- tests/unit_tests/build_test.py | 8 +-- tests/unit_tests/kernel_module_test.py | 8 +-- 6 files changed, 38 insertions(+), 39 deletions(-) diff --git a/diffkemp/building/cc_wrapper.py b/diffkemp/building/cc_wrapper.py index 3348d52ef..adc34fd4c 100755 --- a/diffkemp/building/cc_wrapper.py +++ b/diffkemp/building/cc_wrapper.py @@ -134,14 +134,14 @@ def wrapper(argv): contains_source = contains_source or is_source_file if index > 1 and argv[index - 1] == "-o": if is_object_file and not linking: - # Compiling to object file: swap .o with .ll + # Compiling to object file: swap .o with .bc arg = arg.rsplit(".", 1)[0] + ".bc" if not is_object_file and linking: # Linking: add a .llw suffix (LLVM IR whole) arg = arg + ".llw" output_file = arg elif is_object_file and linking: - # 
Input to linking phase: change suffix to .ll + # Input to linking phase: change suffix to .bc arg = arg.rsplit(".", 1)[0] + ".bc" clang = llvm_link elif is_source_file and linking: @@ -155,7 +155,7 @@ def wrapper(argv): clang = old_clang clang_argv = [arg for arg in clang_argv if not arg.endswith(".bc")] - # Do not continue if output is not .ll or .llw + # Do not continue if output is not .bc or .llw # Note: this means that this is neither compilation nor linking if (output_file is not None and not output_file.endswith(".bc") and not output_file.endswith(".llw")): diff --git a/diffkemp/llvm_ir/kernel_llvm_source_builder.py b/diffkemp/llvm_ir/kernel_llvm_source_builder.py index 66f50d7ee..cfdeb50ee 100644 --- a/diffkemp/llvm_ir/kernel_llvm_source_builder.py +++ b/diffkemp/llvm_ir/kernel_llvm_source_builder.py @@ -395,7 +395,7 @@ def _gcc_to_llvm(gcc_command): if param.startswith('-D"DEBUG_HASH2='): param = '-D"DEBUG_HASH2=1"' - # Output name is given by replacing .c by .ll in source name + # Output name is given by replacing .c by .bc in source name if param.endswith(".c"): output_file = "{}.bc".format(param[:-2]) diff --git a/diffkemp/llvm_ir/llvm_module.py b/diffkemp/llvm_ir/llvm_module.py index 8e1f570d7..9039df999 100644 --- a/diffkemp/llvm_ir/llvm_module.py +++ b/diffkemp/llvm_ir/llvm_module.py @@ -10,7 +10,7 @@ import re import shutil from subprocess import check_call, check_output -from subprocess import CalledProcessError, Popen, PIPE +from subprocess import CalledProcessError, PIPE, run # Set of standard functions that are supported, so they should not be @@ -114,32 +114,31 @@ def find_param_var(self, param): name = self.llvm_module.find_param_var(param) return LlvmParam(name, []) if name is not None else None - def has_function(self, fun): - """Check if module contains a function definition.""" + def module_has(self, pattern): + """ + Check if a module contains matches for a pattern. + Used in has_function, has_global and has_definition. 
+ """ command = ["llvm-nm", self.llvm] source_dir = os.path.dirname(self.llvm) nm_out = check_output(command, cwd=source_dir) - pattern = re.compile(rf"[T|t] {re.escape(fun)}$", re.MULTILINE) match = pattern.search(nm_out.decode()) return match is not None + def has_function(self, fun): + """Check if module contains a function definition.""" + pattern = re.compile(rf"[T|t] {re.escape(fun)}$", re.MULTILINE) + return self.module_has(pattern) + def has_global(self, glob): """Check if module contains a global variable with the given name.""" - command = ["llvm-nm", self.llvm] - source_dir = os.path.dirname(self.llvm) - nm_out = check_output(command, cwd=source_dir) pattern = re.compile(rf"[DdBbCU] {re.escape(glob)}$", re.MULTILINE) - match = pattern.search(nm_out.decode()) - return match is not None + return self.module_has(pattern) def has_definition(self, symbol): """Check if module contains a given symbol definition.""" - command = ["llvm-nm", self.llvm] - source_dir = os.path.dirname(self.llvm) - nm_out = check_output(command, cwd=source_dir) pattern = re.compile(rf"[DdBbCTt] {re.escape(symbol)}$", re.MULTILINE) - match = pattern.search(nm_out.decode()) - return match is not None + return self.module_has(pattern) def is_declaration(self, fun): """ @@ -173,12 +172,12 @@ def move_to_other_root_dir(self, old_root, new_root): if self.llvm.startswith(old_root): dest_llvm = os.path.join(new_root, os.path.relpath(self.llvm, old_root)) - # Copy the .ll file and replace all occurrences of the old root by - # the new root. There are usually in debug info. - cat_out = Popen(["cat", self.llvm], stdout=PIPE, text=True) - dis_out = Popen(["llvm-dis"], stdin=cat_out.stdout, stdout=PIPE, - text=True) - output, _ = dis_out.communicate() + # Copy the .bc file and replace all occurrences of the old root by + # the new root. There are usually in debug info. Use textual + # LLVM IR to find the paths. 
+ command = ["llvm-dis", self.llvm, "-o", "-"] + source_dir = os.path.dirname(self.llvm) + output = check_output(command, cwd=source_dir).decode() new_file = [] for line in output.splitlines(): if "constant" not in line: @@ -186,10 +185,8 @@ def move_to_other_root_dir(self, old_root, new_root): new_root.strip("/"))) else: new_file.append(line) - as_out = Popen(["llvm-as", "-o", dest_llvm], stdin=PIPE, - stdout=PIPE, stderr=PIPE, text=True) - as_out.communicate(input="\n".join(new_file)) - output, _ = as_out.communicate() + run(["llvm-as", "-o", dest_llvm], input="\n".join(new_file), + stdout=PIPE, stderr=PIPE, text=True, check=True) self.llvm = dest_llvm if self.source and self.source.startswith(old_root): @@ -210,12 +207,13 @@ def get_included_sources(self): bc_out = check_output(command, cwd=source_dir) root_dir = re.search(r"^\s*'/.*'$", bc_out.decode(), flags=re.MULTILINE) - root_dir = root_dir.group(0).replace(' ', '').replace("'", '') + root_dir = root_dir.group(0).strip()[1:-1] matches = set(re.findall(r"^\s*'.*\.(?:h|c)'", bc_out.decode(), flags=re.MULTILINE)) + result = set() for m in matches: - match = m.replace(' ', '').replace("'", '') + match = m.strip()[1:-1] result.add(os.path.join(root_dir, match)) return result diff --git a/diffkemp/utils.py b/diffkemp/utils.py index 7c1e15af2..436fa85a0 100644 --- a/diffkemp/utils.py +++ b/diffkemp/utils.py @@ -4,7 +4,8 @@ import sys from subprocess import check_output -LLVM_FUNCTION_REGEX = re.compile(r"^.{16} [T|t] (\w+)", flags=re.MULTILINE) +LLVM_FUNCTION_REGEX = re.compile(r"^.* [T|t] ([\w|\.|\$]+)", + flags=re.MULTILINE) def get_simpll_build_dir(): diff --git a/tests/unit_tests/build_test.py b/tests/unit_tests/build_test.py index 751b409f2..0135a7887 100644 --- a/tests/unit_tests/build_test.py +++ b/tests/unit_tests/build_test.py @@ -5,7 +5,7 @@ import pytest import re import yaml -from subprocess import Popen, PIPE +from subprocess import check_output SINGLE_C_FILE = 
os.path.abspath("tests/testing_projects/make_based/file.c") @@ -43,9 +43,9 @@ def get_db_file_content(snapshot_dir): def get_llvm_fun_body(llvm_file, fun_name): """Returns body of fun_name function from llvm_file.""" - cat_out = Popen(["cat", llvm_file], stdout=PIPE, text=True) - dis_out = Popen(["llvm-dis"], stdin=cat_out.stdout, stdout=PIPE, text=True) - output, error = dis_out.communicate() + command = ["llvm-dis", llvm_file, "-o", "-"] + source_dir = os.path.dirname(llvm_file) + output = check_output(command, cwd=source_dir).decode() match = re.search(r"define.*@" + re.escape(fun_name) + r"[ (][^}]*", output, re.MULTILINE) return match.group(0) diff --git a/tests/unit_tests/kernel_module_test.py b/tests/unit_tests/kernel_module_test.py index 84964abc9..5150ae254 100644 --- a/tests/unit_tests/kernel_module_test.py +++ b/tests/unit_tests/kernel_module_test.py @@ -10,7 +10,7 @@ import pytest import shutil import tempfile -from subprocess import Popen, PIPE +from subprocess import check_output @pytest.fixture @@ -89,9 +89,9 @@ def test_move_to_other_root_dir(source): # Check that the llvm file does not contain the original directory. 
assert mod.llvm == os.path.join(tmp, "sound/core/init.bc") - cat_out = Popen(["cat", mod.llvm], stdout=PIPE, text=True) - dis_out = Popen(["llvm-dis"], stdin=cat_out.stdout, stdout=PIPE, text=True) - output, error = dis_out.communicate() + command = ["llvm-dis", mod.llvm, "-o", "-"] + source_dir = os.path.dirname(mod.llvm) + output = check_output(command, cwd=source_dir).decode() for line in output.splitlines(): assert ("constant" in line or "kernel/linux-3.10.0-957.el7" not in line) From 158aa40a6ec819724e5c0bd6e6dc33fbda158a03 Mon Sep 17 00:00:00 2001 From: xtrnov01 Date: Thu, 24 Jul 2025 14:40:09 +0200 Subject: [PATCH 04/10] Modify search for included sources --- diffkemp/llvm_ir/llvm_module.py | 36 ++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/diffkemp/llvm_ir/llvm_module.py b/diffkemp/llvm_ir/llvm_module.py index 9039df999..830f4f6fd 100644 --- a/diffkemp/llvm_ir/llvm_module.py +++ b/diffkemp/llvm_ir/llvm_module.py @@ -205,16 +205,34 @@ def get_included_sources(self): command = ["llvm-bcanalyzer", self.llvm, "-dump"] source_dir = os.path.dirname(self.llvm) bc_out = check_output(command, cwd=source_dir) - root_dir = re.search(r"^\s*'/.*'$", bc_out.decode(), - flags=re.MULTILINE) - root_dir = root_dir.group(0).strip()[1:-1] - matches = set(re.findall(r"^\s*'.*\.(?:h|c)'", bc_out.decode(), - flags=re.MULTILINE)) - result = set() - for m in matches: - match = m.strip()[1:-1] - result.add(os.path.join(root_dir, match)) + in_metadata = False + in_strings = False + root_dir = "" + source_file = "" + for line in bc_out.decode().splitlines(): + line = line.strip() + if line.startswith(" Date: Wed, 13 Aug 2025 09:43:27 +0200 Subject: [PATCH 05/10] Stop creating .llw files --- .gitignore | 2 +- diffkemp/building/cc_wrapper.py | 13 ++--- tests/testing_projects/.gitignore | 2 +- tests/testing_projects/make_based/Makefile | 7 +-- tests/testing_projects/make_based/mod.S | 2 +- tests/testing_projects/make_based/sub.s | 2 +- 
tests/unit_tests/build_test.py | 24 ++-------- tests/unit_tests/caching_test.py | 22 ++++----- tests/unit_tests/snapshot_test.py | 56 +++++++++++----------- 9 files changed, 54 insertions(+), 76 deletions(-) diff --git a/.gitignore b/.gitignore index 73e4afbba..fa0a20577 100644 --- a/.gitignore +++ b/.gitignore @@ -102,7 +102,7 @@ cmake-build-debug/ *.swp # Temporary files from regression tests. Used for debugging purposes only. -*.ll +*.bc *.smt2 *.pdf diff --git a/diffkemp/building/cc_wrapper.py b/diffkemp/building/cc_wrapper.py index adc34fd4c..d6279f368 100755 --- a/diffkemp/building/cc_wrapper.py +++ b/diffkemp/building/cc_wrapper.py @@ -136,9 +136,6 @@ def wrapper(argv): if is_object_file and not linking: # Compiling to object file: swap .o with .bc arg = arg.rsplit(".", 1)[0] + ".bc" - if not is_object_file and linking: - # Linking: add a .llw suffix (LLVM IR whole) - arg = arg + ".llw" output_file = arg elif is_object_file and linking: # Input to linking phase: change suffix to .bc @@ -155,14 +152,13 @@ def wrapper(argv): clang = old_clang clang_argv = [arg for arg in clang_argv if not arg.endswith(".bc")] - # Do not continue if output is not .bc or .llw + # Do not continue if output is not .bc # Note: this means that this is neither compilation nor linking - if (output_file is not None and not output_file.endswith(".bc") and - not output_file.endswith(".llw")): + if (output_file is not None and not output_file.endswith(".bc")): return 0 # Do not run clang on conftest files - if output_file in ["conftest.bc", "conftest.llw"] or "conftest.c" in argv: + if output_file == "conftest.bc" or "conftest.c" in argv: return 0 # Not compiling C source file @@ -192,8 +188,7 @@ def wrapper(argv): else: # Keep only arguments with input files (and llvm-link itself) clang_argv = [arg for arg in clang_argv if arg == clang or - arg.endswith(".bc") or arg.endswith(".llw") or - arg == "-o"] + arg.endswith(".bc") or arg == "-o"] # Remove non-existent files # Note: these might 
have been e.g. generated from assembly new_clang_argv = [clang_argv[0], "-S"] diff --git a/tests/testing_projects/.gitignore b/tests/testing_projects/.gitignore index 0011d19ad..30a937537 100644 --- a/tests/testing_projects/.gitignore +++ b/tests/testing_projects/.gitignore @@ -1,5 +1,5 @@ *.o # Files created by the build command -*.ll +*.bc function_list diff --git a/tests/testing_projects/make_based/Makefile b/tests/testing_projects/make_based/Makefile index 1cb252688..fb03a0385 100644 --- a/tests/testing_projects/make_based/Makefile +++ b/tests/testing_projects/make_based/Makefile @@ -6,12 +6,9 @@ CFLAGS = -O2 -std=c99 default: file.o -# Assembly files should not be compiled to `.ll` +# Assembly files should not be compiled to `.bc` with-assembly: default mod.o sub.o -# Compilation with linking, should be compiled to `.llw` -with-linking: default file.so - # Note: mod.o (mod.S) is implicitly compiled using CC, but sub.o (sub.s) # is implicilty compiled using AS therefore for sub.o is neccessary to # add explicit target to use CC (for testing cc_wrapper behaviour). @@ -23,4 +20,4 @@ file.so: file.o $(CC) $(CFLAGS) -o $@ $^ -shared clean: - rm -f *.o *.ll *.so *.bc + rm -f *.o *.so *.bc diff --git a/tests/testing_projects/make_based/mod.S b/tests/testing_projects/make_based/mod.S index f54e4f34c..e7aa0d8e2 100644 --- a/tests/testing_projects/make_based/mod.S +++ b/tests/testing_projects/make_based/mod.S @@ -1,5 +1,5 @@ // File for testing of building of snapshots, -// this file should NOT be compiled to .ll and added to db file. +// this file should NOT be compiled to .bc and added to db file. // Note: File was compiled from a C file, trimmed and edited. 
.text .p2align 4 diff --git a/tests/testing_projects/make_based/sub.s b/tests/testing_projects/make_based/sub.s index 8354d684c..14e65a601 100644 --- a/tests/testing_projects/make_based/sub.s +++ b/tests/testing_projects/make_based/sub.s @@ -1,5 +1,5 @@ # File for testing of building of snapshots, -# this file should NOT be compiled to .ll and added to db file. +# this file should NOT be compiled to .bc and added to db file. # Note: File was compiled from a C file and trimmed. .text .p2align 4 diff --git a/tests/unit_tests/build_test.py b/tests/unit_tests/build_test.py index 0135a7887..6db9cd55d 100644 --- a/tests/unit_tests/build_test.py +++ b/tests/unit_tests/build_test.py @@ -92,28 +92,14 @@ def test_make_based_with_assembly(tmp_path): args = Arguments(MAKE_BASED_PROJECT_DIR, output_dir, target=["with-assembly"]) build(args) - # .s, .S files should not be compiled to .ll + # .s, .S files should not be compiled to .bc src_dir_files = os.listdir(MAKE_BASED_PROJECT_DIR) - assert "mod.ll" not in src_dir_files - assert "sub.ll" not in src_dir_files + assert "mod.bc" not in src_dir_files + assert "sub.bc" not in src_dir_files # and they should not be added to db file db_file_content = get_db_file_content(output_dir) - assert "mod.ll" not in db_file_content - assert "sub.ll" not in db_file_content - - -def test_make_based_with_linking(tmp_path): - """Tests make based project which contains linking of file(s).""" - output_dir = str(tmp_path) - args = Arguments(MAKE_BASED_PROJECT_DIR, output_dir, - target=["with-linking"]) - build(args) - # When linking *.o files, appropriate .ll files - # should be linked with llvm-link and saved as `.llw`. - # Note: The .llw is not copied to snapshot dir. 
- assert "file.so.llw" in os.listdir(MAKE_BASED_PROJECT_DIR) - db_file_content = get_db_file_content(output_dir) - assert "file.so.llw" in db_file_content + assert "mod.bc" not in db_file_content + assert "sub.bc" not in db_file_content def test_build_no_opt_override(tmp_path): diff --git a/tests/unit_tests/caching_test.py b/tests/unit_tests/caching_test.py index ae6789ef1..4d7d30545 100644 --- a/tests/unit_tests/caching_test.py +++ b/tests/unit_tests/caching_test.py @@ -269,14 +269,14 @@ def test_cachability_reset_after_absorb(graph_uncachable): @pytest.fixture def cache_file(): - yield SimpLLCache.CacheFile(mkdtemp(), "/test/f1/1.ll", "/test/f2/2.ll") + yield SimpLLCache.CacheFile(mkdtemp(), "/test/f1/1.bc", "/test/f2/2.bc") def test_cache_file_init(cache_file): """Tests the constructor of the SimpLLCache.CacheFile class.""" - assert cache_file.left_module == "/test/f1/1.ll" - assert cache_file.right_module == "/test/f2/2.ll" - assert cache_file.filename.endswith("..$f1$1.ll:..$f2$2.ll") + assert cache_file.left_module == "/test/f1/1.bc" + assert cache_file.right_module == "/test/f2/2.bc" + assert cache_file.filename.endswith("..$f1$1.bc:..$f2$2.bc") def test_cache_file_add_function_pairs(cache_file): @@ -302,27 +302,27 @@ def simpll_cache(): def vertices(): yield [ComparisonGraph.Vertex(dup("f"), Result.Kind.EQUAL, - ("/test/f1/1.ll", "/test/f2/2.ll")), + ("/test/f1/1.bc", "/test/f2/2.bc")), ComparisonGraph.Vertex(dup("h"), Result.Kind.NOT_EQUAL, - ("/test/f1/1.ll", "/test/f2/3.ll")), + ("/test/f1/1.bc", "/test/f2/3.bc")), ComparisonGraph.Vertex(dup("g"), Result.Kind.NOT_EQUAL, - ("/test/f1/1.ll", "/test/f2/2.ll"))] + ("/test/f1/1.bc", "/test/f2/2.bc"))] def test_simpll_cache_update(simpll_cache, vertices): """Tests updating of a SimpLL cache with vertices from a graph.""" simpll_cache.update(vertices) assert os.path.exists(os.path.join(simpll_cache.directory, - "..$f1$1.ll:..$f2$2.ll")) + "..$f1$1.bc:..$f2$2.bc")) assert 
os.path.exists(os.path.join(simpll_cache.directory, - "..$f1$1.ll:..$f2$3.ll")) + "..$f1$1.bc:..$f2$3.bc")) with open(os.path.join(simpll_cache.directory, - "..$f1$1.ll:..$f2$2.ll"), "r") as file: + "..$f1$1.bc:..$f2$2.bc"), "r") as file: assert file.readlines() == ["f:f\n", "g:g\n"] with open(os.path.join(simpll_cache.directory, - "..$f1$1.ll:..$f2$3.ll"), "r") as file: + "..$f1$1.bc:..$f2$3.bc"), "r") as file: assert file.readlines() == ["h:h\n"] diff --git a/tests/unit_tests/snapshot_test.py b/tests/unit_tests/snapshot_test.py index fe8304167..d556281d0 100644 --- a/tests/unit_tests/snapshot_test.py +++ b/tests/unit_tests/snapshot_test.py @@ -51,11 +51,11 @@ def test_load_snapshot_from_dir_functions(): list_kind: function list: - glob_var: null - llvm: net/core/skbuff.ll + llvm: net/core/skbuff.bc name: ___pskb_trim tag: null - glob_var: null - llvm: mm/page_alloc.ll + llvm: mm/page_alloc.bc name: __alloc_pages_nodemask tag: null llvm_source_finder: @@ -87,10 +87,10 @@ def test_load_snapshot_from_dir_functions(): assert f.tag is None if name == "___pskb_trim": assert os.path.abspath(f.mod.llvm) == snap_dir + \ - "/net/core/skbuff.ll" + "/net/core/skbuff.bc" elif name == "__alloc_pages_nodemask": assert os.path.abspath(f.mod.llvm) == snap_dir + \ - "/mm/page_alloc.ll" + "/mm/page_alloc.bc" def test_load_snapshot_from_dir_sysctls(): @@ -111,13 +111,13 @@ def test_load_snapshot_from_dir_sysctls(): list: - functions: - glob_var: null - llvm: kernel/sched/fair.ll + llvm: kernel/sched/fair.bc name: sched_proc_update_handler tag: proc handler sysctl: kernel.sched_latency_ns - functions: - glob_var: null - llvm: kernel/sysctl.ll + llvm: kernel/sysctl.bc name: proc_dointvec_minmax tag: proc handler sysctl: kernel.timer_migration @@ -145,12 +145,12 @@ def test_load_snapshot_from_dir_sysctls(): assert g.functions.keys() == {"sched_proc_update_handler"} f = g.functions["sched_proc_update_handler"] assert os.path.abspath(f.mod.llvm) == snap_dir + \ - "/kernel/sched/fair.ll" 
+ "/kernel/sched/fair.bc" elif name == "kernel.timer_migration": assert g.functions.keys() == {"proc_dointvec_minmax"} f = g.functions["proc_dointvec_minmax"] assert os.path.abspath(f.mod.llvm) == snap_dir + \ - "/kernel/sysctl.ll" + "/kernel/sysctl.bc" assert f.tag == "proc handler" assert f.glob_var is None @@ -163,7 +163,7 @@ def test_add_fun_none_group(): snap = Snapshot.create_from_source(source, output_dir, "function", True, False) - mod = LlvmModule("net/core/skbuff.ll") + mod = LlvmModule("net/core/skbuff.bc") snap.add_fun("___pskb_trim", mod) assert "___pskb_trim" in snap.fun_groups[None].functions @@ -181,7 +181,7 @@ def test_add_fun_sysctl_group(): snap = Snapshot.create_from_source(source, output_dir, "sysctl", True, False) - mod = LlvmModule("kernel/sched/debug.ll") + mod = LlvmModule("kernel/sched/debug.bc") snap.add_fun("sched_debug_header", mod, "sysctl_sched_latency", @@ -210,16 +210,16 @@ def test_get_modules(): "sysctl", True, False) snap.add_fun("sched_proc_update_handler", - LlvmModule("kernel/sched/fair.ll"), None, + LlvmModule("kernel/sched/fair.bc"), None, "proc_handler", "kernel.sched_latency_ns") - snap.add_fun("proc_dointvec_minmax", LlvmModule("kernel/sysctl.ll"), + snap.add_fun("proc_dointvec_minmax", LlvmModule("kernel/sysctl.bc"), None, "proc_handler", "kernel.timer_migration") modules = snap.modules() assert len(modules) == 2 - assert set([m.llvm for m in modules]) == {"kernel/sched/fair.ll", - "kernel/sysctl.ll"} + assert set([m.llvm for m in modules]) == {"kernel/sched/fair.bc", + "kernel/sysctl.bc"} def test_get_by_name_functions(): @@ -230,8 +230,8 @@ def test_get_by_name_functions(): snap = Snapshot.create_from_source(source, output_dir, "function", True, False) - mod_buff = LlvmModule("net/core/skbuff.ll") - mod_alloc = LlvmModule("mm/page_alloc.ll") + mod_buff = LlvmModule("net/core/skbuff.bc") + mod_alloc = LlvmModule("mm/page_alloc.bc") snap.add_fun("___pskb_trim", mod_buff) snap.add_fun("__alloc_pages_nodemask", mod_alloc) 
@@ -250,9 +250,9 @@ def test_get_by_name_sysctls(): "sysctl", True, False) mod_fair = LlvmModule( - "snapshots-sysctl/linux-3.10.0-957.el7/kernel/sched/fair.ll") + "snapshots-sysctl/linux-3.10.0-957.el7/kernel/sched/fair.bc") mod_sysctl = LlvmModule( - "snapshots-sysctl/linux-3.10.0-957.el7/kernel/sysctl.ll") + "snapshots-sysctl/linux-3.10.0-957.el7/kernel/sysctl.bc") snap.add_fun("sched_proc_update_handler", mod_fair, None, "proc handler", "kernel.sched_latency_ns") snap.add_fun("proc_dointvec_minmax", mod_sysctl, None, "proc handler", @@ -273,9 +273,9 @@ def test_filter(): snap = Snapshot.create_from_source(source, output_dir, "function", True, False) - snap.add_fun("___pskb_trim", LlvmModule("net/core/skbuff.ll")) + snap.add_fun("___pskb_trim", LlvmModule("net/core/skbuff.bc")) snap.add_fun("__alloc_pages_nodemask", - LlvmModule("mm/page_alloc.ll")) + LlvmModule("mm/page_alloc.bc")) snap.filter(["__alloc_pages_nodemask"]) assert len(snap.fun_groups[None].functions) == 1 @@ -299,9 +299,9 @@ def test_to_yaml_functions(): "function", True, False) snap.add_fun("___pskb_trim", LlvmModule( - "snapshots/linux-3.10.0-957.el7/net/core/skbuff.ll")) + "snapshots/linux-3.10.0-957.el7/net/core/skbuff.bc")) snap.add_fun("__alloc_pages_nodemask", LlvmModule( - "snapshots/linux-3.10.0-957.el7/mm/page_alloc.ll")) + "snapshots/linux-3.10.0-957.el7/mm/page_alloc.bc")) yaml_str = snap.to_yaml() yaml_snap = yaml.safe_load(yaml_str) @@ -318,9 +318,9 @@ def test_to_yaml_functions(): for f in yaml_dict["list"]: if f["name"] == "___pskb_trim": - assert f["llvm"] == "net/core/skbuff.ll" + assert f["llvm"] == "net/core/skbuff.bc" elif f["name"] == "__alloc_pages_nodemask": - assert f["llvm"] == "mm/page_alloc.ll" + assert f["llvm"] == "mm/page_alloc.bc" def test_to_yaml_sysctls(): @@ -341,11 +341,11 @@ def test_to_yaml_sysctls(): snap.add_fun("sched_proc_update_handler", LlvmModule( "snapshots-sysctl/linux-3.10.0-957.el7/" - "kernel/sched/fair.ll"), + "kernel/sched/fair.bc"), None, "proc 
handler", "kernel.sched_latency_ns") snap.add_fun("proc_dointvec_minmax", LlvmModule( - "snapshots-sysctl/linux-3.10.0-957.el7/kernel/sysctl.ll"), + "snapshots-sysctl/linux-3.10.0-957.el7/kernel/sysctl.bc"), None, "proc handler", "kernel.timer_migration") yaml_str = snap.to_yaml() @@ -365,14 +365,14 @@ def test_to_yaml_sysctls(): if g["sysctl"] == "kernel.sched_latency_ns": assert g["functions"][0] == { "name": "sched_proc_update_handler", - "llvm": "kernel/sched/fair.ll", + "llvm": "kernel/sched/fair.bc", "glob_var": None, "tag": "proc handler" } elif g["sysctl"] == "kernel.timer_migration": assert g["functions"][0] == { "name": "proc_dointvec_minmax", - "llvm": "kernel/sysctl.ll", + "llvm": "kernel/sysctl.bc", "glob_var": None, "tag": "proc handler" } From c0bebfd1b1cd17e272da44d00b2b2ffa3e3a412c Mon Sep 17 00:00:00 2001 From: xtrnov01 Date: Wed, 13 Aug 2025 10:56:39 +0200 Subject: [PATCH 06/10] Avoid caching to run tests correctly --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f394bd403..39944553b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,7 +61,7 @@ jobs: if: matrix.regression-tests with: path: tests/regression/test_data - key: test-data-llvm${{ matrix.llvm }}-${{ hashFiles('tests/regression/test_specs/*') }} + key: test-data-llvm${{ matrix.llvm }}-bc-${{ hashFiles('tests/regression/test_specs/*') }} # Download kernel sources: # - if test_data is cached, we only need the kernel for unit tests @@ -113,4 +113,4 @@ jobs: uses: actions/cache@v4 with: path: tests/regression/test_data - key: test-data-llvm${{ matrix.llvm }}-${{ hashFiles('tests/regression/test_specs/*') }} + key: test-data-llvm${{ matrix.llvm }}-bc-${{ hashFiles('tests/regression/test_specs/*') }} From 9abffbbf3149c6774e0f065eae7258867397c6dd Mon Sep 17 00:00:00 2001 From: xtrnov01 Date: Fri, 22 Aug 2025 15:54:52 +0200 Subject: [PATCH 07/10] Change matching 
loop --- diffkemp/llvm_ir/llvm_module.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/diffkemp/llvm_ir/llvm_module.py b/diffkemp/llvm_ir/llvm_module.py index 830f4f6fd..73093b705 100644 --- a/diffkemp/llvm_ir/llvm_module.py +++ b/diffkemp/llvm_ir/llvm_module.py @@ -224,15 +224,15 @@ def get_included_sources(self): # Extract paths: 1st path is source file name, # 2nd is project directory string = line[1:-1] - if source_file and (string.endswith(".h") or string.endswith(".c") - ) and not string.startswith("/"): - result.add(os.path.join(root_dir, string)) if not source_file: source_file = string elif not root_dir: root_dir = string - # Add source file when project directory is known - result.add(os.path.join(root_dir, source_file)) + # Add source file when project directory is known + result.add(os.path.join(root_dir, source_file)) + elif (string.endswith(".h") or string.endswith(".c") + ) and not string.startswith("/"): + result.add(os.path.join(root_dir, string)) return result def get_functions_using_param(self, param): From 0ab8168246ef2f8a3eac992c8dc2d751d01a38a8 Mon Sep 17 00:00:00 2001 From: xtrnov01 Date: Tue, 2 Sep 2025 20:26:08 +0200 Subject: [PATCH 08/10] Simplify code and improve docstrings --- diffkemp/llvm_ir/llvm_module.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/diffkemp/llvm_ir/llvm_module.py b/diffkemp/llvm_ir/llvm_module.py index 73093b705..bc24b7819 100644 --- a/diffkemp/llvm_ir/llvm_module.py +++ b/diffkemp/llvm_ir/llvm_module.py @@ -114,7 +114,7 @@ def find_param_var(self, param): name = self.llvm_module.find_param_var(param) return LlvmParam(name, []) if name is not None else None - def module_has(self, pattern): + def module_has(self, symtype_pattern, symbol): """ Check if a module contains matches for a pattern. Used in has_function, has_global and has_definition. 
@@ -122,23 +122,22 @@ def module_has(self, pattern): command = ["llvm-nm", self.llvm] source_dir = os.path.dirname(self.llvm) nm_out = check_output(command, cwd=source_dir) + pattern = re.compile(rf"{symtype_pattern} {re.escape(symbol)}$", + re.MULTILINE) match = pattern.search(nm_out.decode()) return match is not None def has_function(self, fun): """Check if module contains a function definition.""" - pattern = re.compile(rf"[T|t] {re.escape(fun)}$", re.MULTILINE) - return self.module_has(pattern) + return self.module_has("[T|t]", fun) def has_global(self, glob): """Check if module contains a global variable with the given name.""" - pattern = re.compile(rf"[DdBbCU] {re.escape(glob)}$", re.MULTILINE) - return self.module_has(pattern) + return self.module_has("[DdBbCU]", glob) def has_definition(self, symbol): """Check if module contains a given symbol definition.""" - pattern = re.compile(rf"[DdBbCTt] {re.escape(symbol)}$", re.MULTILINE) - return self.module_has(pattern) + return self.module_has("[DdBbCTt]", symbol) def is_declaration(self, fun): """ @@ -178,14 +177,14 @@ def move_to_other_root_dir(self, old_root, new_root): command = ["llvm-dis", self.llvm, "-o", "-"] source_dir = os.path.dirname(self.llvm) output = check_output(command, cwd=source_dir).decode() - new_file = [] + new_lines = [] for line in output.splitlines(): if "constant" not in line: - new_file.append(line.replace(old_root.strip("/"), - new_root.strip("/"))) + new_lines.append(line.replace(old_root.strip("/"), + new_root.strip("/"))) else: - new_file.append(line) - run(["llvm-as", "-o", dest_llvm], input="\n".join(new_file), + new_lines.append(line) + run(["llvm-as", "-o", dest_llvm], input="\n".join(new_lines), stdout=PIPE, stderr=PIPE, text=True, check=True) self.llvm = dest_llvm @@ -199,11 +198,13 @@ def move_to_other_root_dir(self, old_root, new_root): def get_included_sources(self): """ Get the list of source files that this module includes. - Requires debugging information. 
+ Sources are extracted from the llvm-bcanalyzer output (with -dump). + They are located in the first METADATA_BLOCK as STRINGS. + The first string is the file name, second is project directory, + the includes follow. """ source_dir = ''.join(os.path.split(self.llvm)[0]) command = ["llvm-bcanalyzer", self.llvm, "-dump"] - source_dir = os.path.dirname(self.llvm) bc_out = check_output(command, cwd=source_dir) result = set() in_metadata = False From 535138441b907165ae7d78e3c1339ad3623c0348 Mon Sep 17 00:00:00 2001 From: xtrnov01 Date: Tue, 2 Sep 2025 20:41:42 +0200 Subject: [PATCH 09/10] Use .bcw files --- diffkemp/building/cc_wrapper.py | 11 ++++++++--- tests/testing_projects/make_based/Makefile | 3 +++ tests/unit_tests/build_test.py | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/diffkemp/building/cc_wrapper.py b/diffkemp/building/cc_wrapper.py index d6279f368..a1080332c 100755 --- a/diffkemp/building/cc_wrapper.py +++ b/diffkemp/building/cc_wrapper.py @@ -136,6 +136,9 @@ def wrapper(argv): if is_object_file and not linking: # Compiling to object file: swap .o with .bc arg = arg.rsplit(".", 1)[0] + ".bc" + if not is_object_file and linking: + # Linking: add a .bcw suffix (LLVM IR whole) + arg = arg + ".bcw" output_file = arg elif is_object_file and linking: # Input to linking phase: change suffix to .bc @@ -154,11 +157,12 @@ def wrapper(argv): # Do not continue if output is not .bc # Note: this means that this is neither compilation nor linking - if (output_file is not None and not output_file.endswith(".bc")): + if (output_file is not None and not output_file.endswith(".bc") and + not output_file.endswith(".bcw")): return 0 # Do not run clang on conftest files - if output_file == "conftest.bc" or "conftest.c" in argv: + if output_file in ["conftest.bc", "conftest.bcw"] or "conftest.c" in argv: return 0 # Not compiling C source file @@ -188,7 +192,8 @@ def wrapper(argv): else: # Keep only arguments with input files (and llvm-link
itself) clang_argv = [arg for arg in clang_argv if arg == clang or - arg.endswith(".bc") or arg == "-o"] + arg.endswith(".bc") or arg.endswith(".bcw") or + arg == "-o"] # Remove non-existent files # Note: these might have been e.g. generated from assembly new_clang_argv = [clang_argv[0], "-S"] diff --git a/tests/testing_projects/make_based/Makefile b/tests/testing_projects/make_based/Makefile index fb03a0385..257ebae38 100644 --- a/tests/testing_projects/make_based/Makefile +++ b/tests/testing_projects/make_based/Makefile @@ -9,6 +9,9 @@ default: file.o # Assembly files should not be compiled to `.bc` with-assembly: default mod.o sub.o +# Compilation with linking, should be compiled to `.bcw` +with-linking: default file.so + # Note: mod.o (mod.S) is implicitly compiled using CC, but sub.o (sub.s) # is implicilty compiled using AS therefore for sub.o is neccessary to # add explicit target to use CC (for testing cc_wrapper behaviour). diff --git a/tests/unit_tests/build_test.py b/tests/unit_tests/build_test.py index 6db9cd55d..186a5db10 100644 --- a/tests/unit_tests/build_test.py +++ b/tests/unit_tests/build_test.py @@ -102,6 +102,20 @@ def test_make_based_with_assembly(tmp_path): assert "sub.bc" not in db_file_content +def test_make_based_with_linking(tmp_path): + """Tests make based project which contains linking of file(s).""" + output_dir = str(tmp_path) + args = Arguments(MAKE_BASED_PROJECT_DIR, output_dir, + target=["with-linking"]) + build(args) + # When linking *.o files, appropriate .bc files + # should be linked with llvm-link and saved as `.bcw`. + # Note: The .bcw is not copied to snapshot dir. 
+ assert "file.so.bcw" in os.listdir(MAKE_BASED_PROJECT_DIR) + db_file_content = get_db_file_content(output_dir) + assert "file.so.bcw" in db_file_content + + def test_build_no_opt_override(tmp_path): """Testing 'build' --no-opt-override argument.""" output_dir = tmp_path From b121125639182ae284832354ca6c0ecff1858ef2 Mon Sep 17 00:00:00 2001 From: xtrnov01 Date: Thu, 11 Sep 2025 12:57:55 +0200 Subject: [PATCH 10/10] Add better descriptions --- diffkemp/building/cc_wrapper/cc_wrapper.py | 2 +- diffkemp/llvm_ir/kernel_llvm_source_builder.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/diffkemp/building/cc_wrapper/cc_wrapper.py b/diffkemp/building/cc_wrapper/cc_wrapper.py index a1080332c..ef8b88248 100755 --- a/diffkemp/building/cc_wrapper/cc_wrapper.py +++ b/diffkemp/building/cc_wrapper/cc_wrapper.py @@ -155,7 +155,7 @@ def wrapper(argv): clang = old_clang clang_argv = [arg for arg in clang_argv if not arg.endswith(".bc")] - # Do not continue if output is not .bc + # Do not continue if output is not .bc or .bcw # Note: this means that this is neither compilation nor linking if (output_file is not None and not output_file.endswith(".bc") and not output_file.endswith(".bcw")): diff --git a/diffkemp/llvm_ir/kernel_llvm_source_builder.py b/diffkemp/llvm_ir/kernel_llvm_source_builder.py index cfdeb50ee..7ff1099cb 100644 --- a/diffkemp/llvm_ir/kernel_llvm_source_builder.py +++ b/diffkemp/llvm_ir/kernel_llvm_source_builder.py @@ -486,8 +486,12 @@ def _get_build_object(command): @staticmethod def _get_build_source(command): - """Get name of the object file built by the command.""" - return command[-1] + """ + Get name of the source file built by the command. + The commands start with 'clang -c -emit-llvm' to avoid linking + and the source file follows '-c' later in the command. + """ + return command[command.index("-c", 3) + 1] def _kbuild_object_command(self, object_file): """