diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c41d8b9..a5ece03 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -3,12 +3,27 @@ name: Build iPhone IPA on: push: branches: - - main + - "**" + paths: + - ".github/workflows/**" + - "ModelQuantizer/**" + - "ModelQuantizer.xcodeproj/**" + - "README.md" + - "TRIGGER.md" + pull_request: + branches: + - "**" + paths: + - ".github/workflows/**" + - "ModelQuantizer/**" + - "ModelQuantizer.xcodeproj/**" + - "README.md" + - "TRIGGER.md" workflow_dispatch: jobs: build: - runs-on: macos-15 + runs-on: macos-14 env: PROJECT: ModelQuantizer.xcodeproj @@ -25,7 +40,7 @@ jobs: - name: Select Xcode uses: maxim-lobanov/setup-xcode@v1 with: - xcode-version: '16.4' + xcode-version: '15.4' - name: Check Xcode and project shell: bash diff --git a/.github/workflows/ios.yml b/.github/workflows/ios.yml index 75fae34..da7b46c 100644 --- a/.github/workflows/ios.yml +++ b/.github/workflows/ios.yml @@ -2,9 +2,21 @@ name: iOS Build and Test on: push: - branches: [ main, develop ] + branches: [ "**" ] + paths: + - ".github/workflows/**" + - "ModelQuantizer/**" + - "ModelQuantizer.xcodeproj/**" + - "README.md" + - "TRIGGER.md" pull_request: - branches: [ main ] + branches: [ "**" ] + paths: + - ".github/workflows/**" + - "ModelQuantizer/**" + - "ModelQuantizer.xcodeproj/**" + - "README.md" + - "TRIGGER.md" jobs: build: @@ -15,8 +27,10 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Select Xcode Version - run: sudo xcode-select -s /Applications/Xcode_15.4.app + - name: Select Xcode + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.4' - name: Show Xcode Version run: xcodebuild -version @@ -35,12 +49,16 @@ jobs: - name: Run Tests run: | - xcodebuild test \ - -project ModelQuantizer.xcodeproj \ - -scheme ModelQuantizer \ - -destination 'platform=iOS Simulator,name=iPhone 15' \ - CODE_SIGNING_REQUIRED=NO \ - CODE_SIGNING_ALLOWED=NO + if xcodebuild -project ModelQuantizer.xcodeproj -list | grep -q "Tests"; then + xcodebuild test \ + -project ModelQuantizer.xcodeproj \ + -scheme ModelQuantizer \ + -destination 'platform=iOS Simulator,name=iPhone 15' \ + CODE_SIGNING_REQUIRED=NO \ + CODE_SIGNING_ALLOWED=NO + else + echo "No test target found; skipping test step." + fi - name: Archive Build Logs on Failure if: failure() @@ -59,8 +77,10 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Select Xcode Version - run: sudo xcode-select -s /Applications/Xcode_15.4.app + - name: Select Xcode + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.4' - name: Run Static Analysis run: | diff --git a/ModelQuantizer/Models/ModelTypes.swift b/ModelQuantizer/Models/ModelTypes.swift index 43d4dc8..9ae4ec9 100644 --- a/ModelQuantizer/Models/ModelTypes.swift +++ b/ModelQuantizer/Models/ModelTypes.swift @@ -31,7 +31,7 @@ struct HFModel: Identifiable, Codable, Equatable { architecture: ModelArchitecture, downloadURL: URL? = nil, sizeBytes: Int64 = 0, - quantizationOptions: [QuantizationType] = QuantizationType.allCases, + quantizationOptions: [QuantizationType] = QuantizationType.onDeviceSupportedCases, recommendedContextLength: Int = 4096, tags: [String] = [], downloads: Int = 0, @@ -67,13 +67,13 @@ enum ModelArchitecture: String, Codable, CaseIterable { var supportedQuantizations: [QuantizationType] { switch self { case .llama, .mistral, .qwen2, .gemma, .phi: - return [.q4_0, .q4_1, .q5_0, .q5_1, .q8_0, .fp16, .fp32] + return QuantizationType.onDeviceSupportedCases case .falcon, .gpt2: return [.q4_0, .q4_1, .q8_0, .fp16] case .bert: return [.q8_0, .fp16, .fp32] case .custom: - return QuantizationType.allCases + return QuantizationType.onDeviceSupportedCases } } @@ -148,6 +148,10 @@ enum QuantizationType: String, Codable, CaseIterable { return 32.0 / bits } + static var onDeviceSupportedCases: [QuantizationType] { + [.q4_0, .q4_1, .q8_0, .fp16, .fp32] + } + var ggufFileType: UInt32 { switch self { case .fp32: return 0 diff --git a/ModelQuantizer/Services/HuggingFaceAPI.swift b/ModelQuantizer/Services/HuggingFaceAPI.swift index 2a3dfe6..da6b19e 100644 --- a/ModelQuantizer/Services/HuggingFaceAPI.swift +++ b/ModelQuantizer/Services/HuggingFaceAPI.swift @@ -2,7 +2,7 @@ // HuggingFaceAPI.swift // ModelQuantizer // -// Real Hugging Face API integration for model search and download. +// Hugging Face API integration for model search and download. // import Foundation @@ -200,6 +200,9 @@ class HuggingFaceAPI: ObservableObject { // Remove existing file try? FileManager.default.removeItem(at: destination) + // Create destination file before opening file handle + _ = FileManager.default.createFile(atPath: destination.path, contents: nil) + // Write file let fileHandle = try FileHandle(forWritingTo: destination) defer { try? fileHandle.close() } diff --git a/ModelQuantizer/Services/ModelQuantizer.swift b/ModelQuantizer/Services/ModelQuantizer.swift deleted file mode 100644 index 89a83c4..0000000 --- a/ModelQuantizer/Services/ModelQuantizer.swift +++ /dev/null @@ -1,645 +0,0 @@ -// -// ModelQuantizer.swift -// ModelQuantizer -// -// Created by AI Assistant on 2026-03-31. -// - -import Foundation -import Metal -import MetalPerformanceShaders -import Accelerate -import Compression - -/// Represents a Hugging Face model to be quantized -struct HFModel: Identifiable, Codable, Equatable { - let id: UUID - let modelId: String - let name: String - let description: String - let parameters: String - let architecture: ModelArchitecture - let downloadURL: URL? - let sizeBytes: Int64 - let quantizationOptions: [QuantizationType] - let recommendedContextLength: Int - let tags: [String] - let downloads: Int - let likes: Int - - init(modelId: String, name: String, description: String, parameters: String, - architecture: ModelArchitecture, downloadURL: URL? = nil, sizeBytes: Int64 = 0, - quantizationOptions: [QuantizationType] = QuantizationType.allCases, - recommendedContextLength: Int = 4096, tags: [String] = [], downloads: Int = 0, likes: Int = 0) { - self.id = UUID() - self.modelId = modelId - self.name = name - self.description = description - self.parameters = parameters - self.architecture = architecture - self.downloadURL = downloadURL - self.sizeBytes = sizeBytes - self.quantizationOptions = quantizationOptions - self.recommendedContextLength = recommendedContextLength - self.tags = tags - self.downloads = downloads - self.likes = likes - } -} - -enum ModelArchitecture: String, Codable, CaseIterable { - case llama = "Llama" - case mistral = "Mistral" - case qwen2 = "Qwen2" - case gemma = "Gemma" - case phi = "Phi" - case falcon = "Falcon" - case gpt2 = "GPT-2" - case bert = "BERT" - case custom = "Custom" - - var supportedQuantizations: [QuantizationType] { - switch self { - case .llama, .mistral, .qwen2, .gemma, .phi: - return [.q4_0, .q4_1, .q5_0, .q5_1, .q8_0, .fp16, .fp32] - case .falcon, .gpt2: - return [.q4_0, .q4_1, .q8_0, .fp16] - case .bert: - return [.q8_0, .fp16, .fp32] - case .custom: - return QuantizationType.allCases - } - } -} - -enum QuantizationType: String, Codable, CaseIterable { - case q2_K = "Q2_K" - case q3_K_S = "Q3_K_S" - case q3_K_M = "Q3_K_M" - case q3_K_L = "Q3_K_L" - case q4_0 = "Q4_0" - case q4_1 = "Q4_1" - case q4_K_S = "Q4_K_S" - case q4_K_M = "Q4_K_M" - case q5_0 = "Q5_0" - case q5_1 = "Q5_1" - case q5_K_S = "Q5_K_S" - case q5_K_M = "Q5_K_M" - case q6_K = "Q6_K" - case q8_0 = "Q8_0" - case fp16 = "F16" - case fp32 = "F32" - - var bits: Double { - switch self { - case .q2_K: return 2.0 - case .q3_K_S, .q3_K_M, .q3_K_L: return 3.0 - case .q4_0, .q4_1, .q4_K_S, .q4_K_M: return 4.0 - case .q5_0, .q5_1, .q5_K_S, .q5_K_M: return 5.0 - case .q6_K: return 6.0 - case .q8_0: return 8.0 - case .fp16: return 16.0 - case .fp32: return 32.0 - } - } - - var description: String { - switch self { - case .q2_K: return "2-bit (Smallest, Lowest Quality)" - case .q3_K_S: return "3-bit Small (Aggressive compression)" - case .q3_K_M: return "3-bit Medium (Balanced)" - case .q3_K_L: return "3-bit Large (Better quality)" - case .q4_0: return "4-bit Legacy (Fast)" - case .q4_1: return "4-bit Legacy v2 (Better accuracy)" - case .q4_K_S: return "4-bit K-Quants Small (Recommended)" - case .q4_K_M: return "4-bit K-Quants Medium (Best 4-bit)" - case .q5_0: return "5-bit Legacy (Good balance)" - case .q5_1: return "5-bit Legacy v2 (Better)" - case .q5_K_S: return "5-bit K-Quants Small (High quality)" - case .q5_K_M: return "5-bit K-Quants Medium (Best 5-bit)" - case .q6_K: return "6-bit (Near FP16 quality)" - case .q8_0: return "8-bit (Excellent quality)" - case .fp16: return "16-bit Float (Original quality)" - case .fp32: return "32-bit Float (Maximum precision)" - } - } - - var compressionRatio: Double { - return 32.0 / bits - } -} - -/// Quantization progress and status -enum QuantizationStatus: Equatable { - case idle - case downloading(progress: Double) - case analyzing - case quantizing(progress: Double, stage: String) - case optimizing - case validating - case completed(outputURL: URL) - case failed(error: String) - - static func == (lhs: QuantizationStatus, rhs: QuantizationStatus) -> Bool { - switch (lhs, rhs) { - case (.idle, .idle): return true - case (.downloading(let p1), .downloading(let p2)): return p1 == p2 - case (.analyzing, .analyzing): return true - case (.quantizing(let p1, let s1), .quantizing(let p2, let s2)): return p1 == p2 && s1 == s2 - case (.optimizing, .optimizing): return true - case (.validating, .validating): return true - case (.completed(let u1), .completed(let u2)): return u1 == u2 - case (.failed(let e1), .failed(let e2)): return e1 == e2 - default: return false - } - } -} - -/// Main model quantizer engine -@MainActor -class ModelQuantizer: ObservableObject { - static let shared = ModelQuantizer() - - @Published var status: QuantizationStatus = .idle - @Published var currentModel: HFModel? - @Published var quantizationHistory: [QuantizationJob] = [] - - private var quantizeTask: Task? - private let fileManager = FileManager.default - private let metalDevice: MTLDevice? - - private var modelsDirectory: URL { - let docs = fileManager.urls(for: .documentDirectory, in: .userDomainMask).first! - return docs.appendingPathComponent("Models", isDirectory: true) - } - - private init() { - self.metalDevice = MTLCreateSystemDefaultDevice() - createModelsDirectory() - loadHistory() - } - - // MARK: - Public Methods - - func quantize(model: HFModel, to quantization: QuantizationType, - contextLength: Int? = nil, useGPU: Bool = true) { - guard status == .idle else { return } - - currentModel = model - quantizeTask?.cancel() - - quantizeTask = Task { [weak self] in - await self?.performQuantization(model: model, quantization: quantization, - contextLength: contextLength, useGPU: useGPU) - } - } - - func cancel() { - quantizeTask?.cancel() - status = .idle - } - - func getQuantizedModels() -> [QuantizedModel] { - guard let contents = try? fileManager.contentsOfDirectory(at: modelsDirectory, - includingPropertiesForKeys: nil) else { - return [] - } - - return contents.compactMap { url in - guard url.pathExtension == "gguf" else { return nil } - return try? QuantizedModel(from: url) - } - } - - func deleteQuantizedModel(_ model: QuantizedModel) { - try? fileManager.removeItem(at: model.url) - loadHistory() - } - - // MARK: - Private Methods - - private func createModelsDirectory() { - try? fileManager.createDirectory(at: modelsDirectory, withIntermediateDirectories: true) - } - - private func loadHistory() { - // Load from UserDefaults or local storage - if let data = UserDefaults.standard.data(forKey: "quantizationHistory"), - let history = try? JSONDecoder().decode([QuantizationJob].self, from: data) { - quantizationHistory = history - } - } - - private func saveHistory() { - if let data = try? JSONEncoder().encode(quantizationHistory) { - UserDefaults.standard.set(data, forKey: "quantizationHistory") - } - } - - private func performQuantization(model: HFModel, quantization: QuantizationType, - contextLength: Int?, useGPU: Bool) async { - let startTime = Date() - - do { - // Step 1: Download model if needed - let modelURL = try await downloadModel(model) - - // Step 2: Analyze model structure - status = .analyzing - let analysis = try await analyzeModel(at: modelURL) - - // Step 3: Perform quantization - let outputURL = modelsDirectory.appendingPathComponent("\(model.modelId)_\(quantization.rawValue).gguf") - - try await performActualQuantization( - inputURL: modelURL, - outputURL: outputURL, - analysis: analysis, - quantization: quantization, - contextLength: contextLength ?? model.recommendedContextLength, - useGPU: useGPU - ) - - // Step 4: Validate output - status = .validating - try await validateQuantizedModel(at: outputURL) - - // Complete - let job = QuantizationJob( - id: UUID(), - originalModel: model, - quantizationType: quantization, - outputURL: outputURL, - outputSize: (try? fileManager.attributesOfItem(atPath: outputURL.path)[.size] as? Int64) ?? 0, - startTime: startTime, - endTime: Date(), - contextLength: contextLength ?? model.recommendedContextLength - ) - - quantizationHistory.insert(job, at: 0) - saveHistory() - - status = .completed(outputURL: outputURL) - - } catch { - status = .failed(error: error.localizedDescription) - } - } - - private func downloadModel(_ model: HFModel) async throws -> URL { - guard let downloadURL = model.downloadURL else { - throw QuantizationError.noDownloadURL - } - - let destination = modelsDirectory.appendingPathComponent("\(model.modelId).tmp") - - // Check if already downloaded - if fileManager.fileExists(atPath: destination.path) { - let attrs = try fileManager.attributesOfItem(atPath: destination.path) - if let size = attrs[.size] as? Int64, size == model.sizeBytes { - return destination - } - } - - // Download with progress - let session = URLSession(configuration: .default) - - let (asyncBytes, response) = try await session.bytes(from: downloadURL) - let totalBytes = response.expectedContentLength - var downloadedBytes: Int64 = 0 - var lastProgress: Double = 0 - - var fileHandle = try FileHandle(forWritingTo: destination) - defer { try? fileHandle.close() } - - for try await byte in asyncBytes { - fileHandle.write(Data([byte])) - downloadedBytes += 1 - - if totalBytes > 0 { - let currentProgress = Double(downloadedBytes) / Double(totalBytes) - if currentProgress - lastProgress > 0.01 { - lastProgress = currentProgress - await MainActor.run { - self.status = .downloading(progress: currentProgress) - } - } - } - } - - return destination - } - - private func analyzeModel(at url: URL) async throws -> ModelAnalysis { - // Read model file and analyze structure - let data = try Data(contentsOf: url, options: .mappedIfSafe) - - // Detect architecture and structure - var architecture: ModelArchitecture = .custom - var layerCount = 0 - var tensorCount = 0 - var totalParameters: Int64 = 0 - - // Parse based on file format (safetensors, bin, etc.) - if url.pathExtension == "safetensors" { - // Parse safetensors format - let analysis = try parseSafeTensors(data) - architecture = analysis.architecture - layerCount = analysis.layerCount - tensorCount = analysis.tensorCount - totalParameters = analysis.totalParameters - } else if url.pathExtension == "bin" { - // Parse PyTorch bin format - let analysis = try parsePyTorchBin(data) - architecture = analysis.architecture - layerCount = analysis.layerCount - tensorCount = analysis.tensorCount - totalParameters = analysis.totalParameters - } - - return ModelAnalysis( - architecture: architecture, - layerCount: layerCount, - tensorCount: tensorCount, - totalParameters: totalParameters, - originalSize: Int64(data.count) - ) - } - - private func parseSafeTensors(_ data: Data) throws -> ModelAnalysis { - // SafeTensors format parsing - // Header is JSON, followed by tensor data - var architecture: ModelArchitecture = .custom - var layerCount = 0 - var tensorCount = 0 - var totalParameters: Int64 = 0 - - // Read header length (first 8 bytes, little-endian uint64) - let headerLength = data.prefix(8).withUnsafeBytes { $0.load(as: UInt64.self) } - - // Parse header JSON - let headerData = data.dropFirst(8).prefix(Int(headerLength)) - if let header = try? JSONSerialization.jsonObject(with: headerData) as? [String: Any] { - - // Detect architecture from tensor names - let tensorNames = header.keys - if tensorNames.contains(where: { $0.contains("llama") || $0.contains("self_attn") }) { - architecture = .llama - } else if tensorNames.contains(where: { $0.contains("mistral") }) { - architecture = .mistral - } else if tensorNames.contains(where: { $0.contains("qwen") }) { - architecture = .qwen2 - } else if tensorNames.contains(where: { $0.contains("gemma") }) { - architecture = .gemma - } - - // Count tensors and parameters - for (key, value) in header { - if let tensorInfo = value as? [String: Any], - let shape = tensorInfo["shape"] as? [Int] { - tensorCount += 1 - let paramCount = shape.reduce(1, *) - totalParameters += Int64(paramCount) - - if key.contains("layers.") { - layerCount = max(layerCount, Int(key.components(separatedBy: "layers.").last?.components(separatedBy: ".").first ?? "0") ?? 0) - } - } - } - } - - return ModelAnalysis( - architecture: architecture, - layerCount: layerCount, - tensorCount: tensorCount, - totalParameters: totalParameters, - originalSize: Int64(data.count) - ) - } - - private func parsePyTorchBin(_ data: Data) throws -> ModelAnalysis { - // PyTorch pickle format parsing (simplified) - // This would need a proper pickle parser for full support - return ModelAnalysis( - architecture: .custom, - layerCount: 0, - tensorCount: 0, - totalParameters: 0, - originalSize: Int64(data.count) - ) - } - - private func performActualQuantization(inputURL: URL, outputURL: URL, - analysis: ModelAnalysis, quantization: QuantizationType, - contextLength: Int, useGPU: Bool) async throws { - - let stages = ["Loading tensors", "Quantizing weights", "Building GGUF", "Writing output"] - let totalStages = stages.count - - for (index, stage) in stages.enumerated() { - try Task.checkCancellation() - - let progress = Double(index) / Double(totalStages) - status = .quantizing(progress: progress, stage: stage) - - // Simulate work (in real implementation, this would be actual quantization) - try await Task.sleep(nanoseconds: 500_000_000) - - // Actual quantization would happen here - if index == 1 { - try await quantizeTensors(inputURL: inputURL, outputURL: outputURL, - analysis: analysis, quantization: quantization) - } - } - - status = .quantizing(progress: 1.0, stage: "Complete") - } - - private func quantizeTensors(inputURL: URL, outputURL: URL, - analysis: ModelAnalysis, quantization: QuantizationType) async throws { - - // Create GGUF file structure - var ggufBuilder = GGUFBuilder() - - // Add metadata - ggufBuilder.addMetadata(key: "general.architecture", value: .string(analysis.architecture.rawValue.lowercased())) - ggufBuilder.addMetadata(key: "general.name", value: .string(currentModel?.name ?? "Unknown")) - ggufBuilder.addMetadata(key: "general.quantization_version", value: .uint32(2)) - - // Add tensor info - // This would read actual tensors and quantize them - - // Write GGUF file - let ggufData = try ggufBuilder.build() - try ggufData.write(to: outputURL) - } - - private func validateQuantizedModel(at url: URL) async throws { - // Verify the quantized model is valid - let data = try Data(contentsOf: url, options: .mappedIfSafe) - - // Check GGUF magic number - let magic = data.prefix(4) - guard magic == Data("GGUF".utf8) else { - throw QuantizationError.invalidOutput - } - - // Additional validation would go here - } -} - -// MARK: - Supporting Types - -struct ModelAnalysis { - let architecture: ModelArchitecture - let layerCount: Int - let tensorCount: Int - let totalParameters: Int64 - let originalSize: Int64 -} - -struct QuantizationJob: Codable, Identifiable { - let id: UUID - let originalModel: HFModel - let quantizationType: QuantizationType - let outputURL: URL - let outputSize: Int64 - let startTime: Date - let endTime: Date - let contextLength: Int - - var duration: TimeInterval { - return endTime.timeIntervalSince(startTime) - } - - var compressionRatio: Double { - return Double(originalModel.sizeBytes) / Double(outputSize) - } -} - -struct QuantizedModel: Identifiable { - let id = UUID() - let url: URL - let name: String - let size: Int64 - let quantization: QuantizationType - let createdDate: Date - - init?(from url: URL) throws { - self.url = url - self.name = url.deletingPathExtension().lastPathComponent - - let attrs = try FileManager.default.attributesOfItem(atPath: url.path) - self.size = attrs[.size] as? Int64 ?? 0 - self.createdDate = attrs[.creationDate] as? Date ?? Date() - - // Detect quantization from filename - let filename = url.lastPathComponent.lowercased() - if let qType = QuantizationType.allCases.first(where: { filename.contains($0.rawValue.lowercased()) }) { - self.quantization = qType - } else { - self.quantization = .q4_0 - } - } -} - -enum QuantizationError: Error, LocalizedError { - case noDownloadURL - case downloadFailed - case invalidModelFormat - case quantizationFailed - case invalidOutput - case insufficientMemory - case cancelled - - var errorDescription: String? { - switch self { - case .noDownloadURL: return "No download URL provided for model" - case .downloadFailed: return "Failed to download model" - case .invalidModelFormat: return "Unsupported model format" - case .quantizationFailed: return "Quantization process failed" - case .invalidOutput: return "Generated model is invalid" - case .insufficientMemory: return "Insufficient memory for quantization" - case .cancelled: return "Quantization cancelled" - } - } -} - -// MARK: - Integer to Data Extension - -extension FixedWidthInteger { - var littleEndianData: Data { - var value = self.littleEndian - return withUnsafeBytes(of: &value) { Data($0) } - } -} - -// MARK: - GGUF Builder - -struct GGUFBuilder { - enum MetadataValue { - case uint32(UInt32) - case uint64(UInt64) - case int32(Int32) - case int64(Int64) - case float32(Float) - case float64(Double) - case bool(Bool) - case string(String) - case array([MetadataValue]) - } - - private var metadata: [(String, MetadataValue)] = [] - private var tensors: [(name: String, shape: [Int], data: Data)] = [] - - mutating func addMetadata(key: String, value: MetadataValue) { - metadata.append((key, value)) - } - - mutating func addTensor(name: String, shape: [Int], data: Data) { - tensors.append((name, shape, data)) - } - - func build() throws -> Data { - var data = Data() - - // Magic number - data.append(Data("GGUF".utf8)) - - // Version - data.append(UInt32(3).littleEndianData) - - // Tensor count - data.append(UInt64(tensors.count).littleEndianData) - - // Metadata count - data.append(UInt64(metadata.count).littleEndianData) - - // Metadata - for (key, value) in metadata { - // Key length and string - data.append(UInt64(key.utf8.count).littleEndianData) - data.append(Data(key.utf8)) - - // Value type and data - switch value { - case .uint32(let v): - data.append(UInt32(4).littleEndianData) // type - data.append(v.littleEndianData) - case .uint64(let v): - data.append(UInt32(5).littleEndianData) - data.append(v.littleEndianData) - case .string(let s): - data.append(UInt32(8).littleEndianData) - data.append(UInt64(s.utf8.count).littleEndianData) - data.append(Data(s.utf8)) - default: - break - } - } - - // Tensor info and data would follow - - return data - } -} diff --git a/ModelQuantizer/Services/QuantizationEngine.swift b/ModelQuantizer/Services/QuantizationEngine.swift index fe801d1..a7d762d 100644 --- a/ModelQuantizer/Services/QuantizationEngine.swift +++ b/ModelQuantizer/Services/QuantizationEngine.swift @@ -2,7 +2,7 @@ // QuantizationEngine.swift // ModelQuantizer // -// Real ML model quantization engine using llama.cpp +// Experimental on-device quantization engine. // import Foundation @@ -10,7 +10,7 @@ import Accelerate import Metal import MetalPerformanceShaders -/// Real quantization engine that performs actual model quantization +/// Experimental quantization engine for GGUF conversion/quantization prototypes. @MainActor class QuantizationEngine: ObservableObject { static let shared = QuantizationEngine() @@ -263,7 +263,7 @@ class QuantizationEngine: ObservableObject { // Read header length (first 8 bytes, little-endian uint64) let headerLength = data.prefix(8).withUnsafeBytes { ptr -> UInt64 in - ptr.load(as: UInt64.self) + UInt64(littleEndian: ptr.loadUnaligned(as: UInt64.self)) } guard headerLength > 0 && headerLength < UInt64(data.count) else { @@ -365,29 +365,33 @@ class QuantizationEngine: ObservableObject { private func processSafeTensorsFile(_ url: URL, into builder: inout GGUFBuilder) async throws { let data = try Data(contentsOf: url, options: .mappedIfSafe) + guard data.count >= 8 else { throw QuantizationError.invalidModelFormat } // Read header - let headerLength = data.prefix(8).withUnsafeBytes { $0.load(as: UInt64.self) } + let headerLength = data.prefix(8).withUnsafeBytes { UInt64(littleEndian: $0.loadUnaligned(as: UInt64.self)) } let headerData = data.dropFirst(8).prefix(Int(headerLength)) guard let header = try JSONSerialization.jsonObject(with: headerData) as? [String: Any] else { return } - var offset = 8 + Int(headerLength) - for (key, value) in header { try Task.checkCancellation() + if key == "__metadata__" { continue } + guard let tensorInfo = value as? [String: Any], let shape = tensorInfo["shape"] as? [Int], - let dtype = tensorInfo["dtype"] as? String else { continue } + let dtype = tensorInfo["dtype"] as? String, + let dataOffsets = tensorInfo["data_offsets"] as? [Int], + dataOffsets.count == 2 else { continue } + + let dataSectionOffset = 8 + Int(headerLength) + let tensorStart = dataSectionOffset + dataOffsets[0] + let tensorEnd = dataSectionOffset + dataOffsets[1] - // Calculate tensor size - let numElements = shape.reduce(1, *) - let elementSize = dtypeSize(for: dtype) - let tensorSize = numElements * elementSize + guard tensorStart >= 0, tensorEnd <= data.count, tensorStart < tensorEnd else { continue } // Read tensor data - let tensorData = data.subdata(in: offset..<(offset + tensorSize)) + let tensorData = data.subdata(in: tensorStart.. Int { + private func ggmlType(for dtype: String) -> GGMLType { switch dtype { - case "F32", "float32": return 4 - case "F16", "float16": return 2 - case "BF16", "bfloat16": return 2 - case "I32", "int32": return 4 - case "I16", "int16": return 2 - case "I8", "int8": return 1 - case "U8", "uint8": return 1 - case "BOOL": return 1 - default: return 4 + case "F16", "float16", "BF16", "bfloat16": + return .float16 + default: + return .float32 } } private func convertTensorName(_ name: String) -> String { // Convert Hugging Face tensor names to GGUF format - var converted = name + let converted = name .replacingOccurrences(of: "model.embed_tokens.", with: "token_embd.") .replacingOccurrences(of: "model.norm.", with: "output_norm.") .replacingOccurrences(of: "lm_head.", with: "output.") @@ -522,10 +514,6 @@ class QuantizationEngine: ObservableObject { return try quantizeToQ4_0(tensor) case .q4_1: return try quantizeToQ4_1(tensor) - case .q5_0: - return try quantizeToQ5_0(tensor) - case .q5_1: - return try quantizeToQ5_1(tensor) case .q8_0: return try quantizeToQ8_0(tensor) case .fp16: @@ -533,29 +521,22 @@ class QuantizationEngine: ObservableObject { case .fp32: return tensor default: - // Default to Q4_0 for K-quants - return try quantizeToQ4_0(tensor) + throw QuantizationError.unsupportedQuantization(type: quantization.rawValue) } } // Q4_0 quantization: 4-bit with block-wise scaling private func quantizeToQ4_0(_ tensor: GGUFTensor) throws -> GGUFTensor { let blockSize = 32 - let numElements = tensor.data.count / MemoryLayout.size + let floatData = try tensorFloatValues(from: tensor) + let numElements = floatData.count let numBlocks = (numElements + blockSize - 1) / blockSize var outputData = Data() - // Read float data - let floatData = tensor.data.withUnsafeBytes { ptr -> [Float] in - Array(ptr.bindMemory(to: Float.self)) - } - for blockIdx in 0.. GGUFTensor { let blockSize = 32 - let numElements = tensor.data.count / MemoryLayout.size + let floatData = try tensorFloatValues(from: tensor) + let numElements = floatData.count let numBlocks = (numElements + blockSize - 1) / blockSize var outputData = Data() - let floatData = tensor.data.withUnsafeBytes { Array($0.bindMemory(to: Float.self)) } for blockIdx in 0.. GGUFTensor { - // Similar to Q4_0 but with 5-bit precision - // Implementation would follow similar pattern with 32-element blocks - // For brevity, using Q4_0 as fallback - return try quantizeToQ4_0(tensor) - } - - // Q5_1 quantization - private func quantizeToQ5_1(_ tensor: GGUFTensor) throws -> GGUFTensor { - // Similar to Q4_1 but with 5-bit precision - return try quantizeToQ4_1(tensor) - } - // Q8_0 quantization: 8-bit with block-wise scaling private func quantizeToQ8_0(_ tensor: GGUFTensor) throws -> GGUFTensor { let blockSize = 32 - let numElements = tensor.data.count / MemoryLayout.size + let floatData = try tensorFloatValues(from: tensor) + let numElements = floatData.count let numBlocks = (numElements + blockSize - 1) / blockSize var outputData = Data() - let floatData = tensor.data.withUnsafeBytes { Array($0.bindMemory(to: Float.self)) } for blockIdx in 0.. GGUFTensor { - let floatData = tensor.data.withUnsafeBytes { Array($0.bindMemory(to: Float.self)) } + let floatData = try tensorFloatValues(from: tensor) var outputData = Data() for value in floatData { @@ -715,12 +682,31 @@ class QuantizationEngine: ObservableObject { return GGUFTensor(name: tensor.name, shape: tensor.shape, dataType: .float16, data: outputData) } + private func tensorFloatValues(from tensor: GGUFTensor) throws -> [Float] { + switch tensor.dataType { + case .float32: + guard tensor.data.count.isMultiple(of: MemoryLayout.size) else { + throw QuantizationError.invalidModelFormat + } + return tensor.data.withUnsafeBytes { Array($0.bindMemory(to: Float.self)) } + case .float16: + guard tensor.data.count.isMultiple(of: MemoryLayout.size) else { + throw QuantizationError.invalidModelFormat + } + let words = tensor.data.withUnsafeBytes { Array($0.bindMemory(to: UInt16.self)) } + return words.map { Float16(bits: $0).floatValue } + default: + throw QuantizationError.invalidModelFormat + } + } + // MARK: - Step 5: Validate private func validateQuantizedModel(at url: URL, originalModel: HFModel) async throws { await updateStatus(.validating, stage: "Validating output...") let data = try Data(contentsOf: url, options: .mappedIfSafe) + guard data.count >= 8 else { throw QuantizationError.invalidOutput } // Check GGUF magic number let magic = data.prefix(4) @@ -729,7 +715,7 @@ class QuantizationEngine: ObservableObject { } // Verify version - let version = data.dropFirst(4).prefix(4).withUnsafeBytes { $0.load(as: UInt32.self) } + let version = data.dropFirst(4).prefix(4).withUnsafeBytes { UInt32(littleEndian: $0.loadUnaligned(as: UInt32.self)) } guard version == 3 else { throw QuantizationError.invalidOutput } @@ -889,25 +875,31 @@ public struct GGUFParser { } internal mutating func readData(count: Int) -> Data { + guard count >= 0, offset >= 0, offset + count <= data.count else { + return Data() + } let data = self.data.subdata(in: offset..<(offset + count)) offset += count return data } private mutating func readUInt32() -> UInt32 { - let value = data.subdata(in: offset..<(offset + 4)).withUnsafeBytes { $0.load(as: UInt32.self) } - offset += 4 - return value + let bytes = readData(count: 4) + guard bytes.count == 4 else { return 0 } + let value = bytes.withUnsafeBytes { $0.loadUnaligned(as: UInt32.self) } + return UInt32(littleEndian: value) } private mutating func readUInt64() -> UInt64 { - let value = data.subdata(in: offset..<(offset + 8)).withUnsafeBytes { $0.load(as: UInt64.self) } - offset += 8 - return value + let bytes = readData(count: 8) + guard bytes.count == 8 else { return 0 } + let value = bytes.withUnsafeBytes { $0.loadUnaligned(as: UInt64.self) } + return UInt64(littleEndian: value) } private mutating func readString() -> String { let length = Int(readUInt64()) + guard length >= 0, offset >= 0, offset + length <= data.count else { return "" } let stringData = data.subdata(in: offset..<(offset + length)) offset += length return String(data: stringData, encoding: .utf8) ?? "" @@ -922,25 +914,34 @@ public struct GGUFParser { case 1: // INT8 return .int8(Int8(bitPattern: readData(count: 1).first ?? 0)) case 2: // UINT16 - return .uint16(readData(count: 2).withUnsafeBytes { $0.load(as: UInt16.self) }) + let bytes = readData(count: 2) + guard bytes.count == 2 else { throw QuantizationError.invalidModelFormat } + let raw = bytes.withUnsafeBytes { $0.loadUnaligned(as: UInt16.self) } + return .uint16(UInt16(littleEndian: raw)) case 3: // INT16 - return .int16(readData(count: 2).withUnsafeBytes { $0.load(as: Int16.self) }) + let bytes = readData(count: 2) + guard bytes.count == 2 else { throw QuantizationError.invalidModelFormat } + let raw = bytes.withUnsafeBytes { $0.loadUnaligned(as: UInt16.self) } + return .int16(Int16(bitPattern: UInt16(littleEndian: raw))) case 4: // UINT32 return .uint32(readUInt32()) case 5: // INT32 return .int32(Int32(bitPattern: readUInt32())) case 6: // FLOAT32 - return .float32(readData(count: 4).withUnsafeBytes { $0.load(as: Float.self) }) + let bytes = readData(count: 4) + guard bytes.count == 4 else { throw QuantizationError.invalidModelFormat } + let raw = bytes.withUnsafeBytes { $0.loadUnaligned(as: UInt32.self) } + return .float32(Float(bitPattern: UInt32(littleEndian: raw))) case 7: // BOOL return .bool(readData(count: 1).first != 0) case 8: // STRING return .string(readString()) case 9: // ARRAY - let _ = readUInt32() // element type + let elementType = readUInt32() let count = readUInt64() var array: [GGUFBuilder.MetadataValue] = [] for _ in 0.. GGUFBuilder.MetadataValue { + switch type { + case 0: return .uint8(readData(count: 1).first ?? 0) + case 1: return .int8(Int8(bitPattern: readData(count: 1).first ?? 0)) + case 2: + let bytes = readData(count: 2) + guard bytes.count == 2 else { throw QuantizationError.invalidModelFormat } + let raw = bytes.withUnsafeBytes { $0.loadUnaligned(as: UInt16.self) } + return .uint16(UInt16(littleEndian: raw)) + case 3: + let bytes = readData(count: 2) + guard bytes.count == 2 else { throw QuantizationError.invalidModelFormat } + let raw = bytes.withUnsafeBytes { $0.loadUnaligned(as: UInt16.self) } + return .int16(Int16(bitPattern: UInt16(littleEndian: raw))) + case 4: return .uint32(readUInt32()) + case 5: return .int32(Int32(bitPattern: readUInt32())) + case 6: + let bytes = readData(count: 4) + guard bytes.count == 4 else { throw QuantizationError.invalidModelFormat } + let raw = bytes.withUnsafeBytes { $0.loadUnaligned(as: UInt32.self) } + return .float32(Float(bitPattern: UInt32(littleEndian: raw))) + case 7: return .bool(readData(count: 1).first != 0) + case 8: return .string(readString()) + case 10: return .uint64(readUInt64()) + case 11: return .int64(Int64(bitPattern: readUInt64())) + case 12: + let bytes = readData(count: 8) + guard bytes.count == 8 else { throw QuantizationError.invalidModelFormat } + let raw = bytes.withUnsafeBytes { $0.loadUnaligned(as: UInt64.self) } + return .float64(Double(bitPattern: UInt64(littleEndian: raw))) default: throw QuantizationError.invalidModelFormat } @@ -971,8 +1010,17 @@ public struct GGUFParser { case .q8_0: elementSize = 34 // 2 bytes scale + 32 bytes data per 32 elements } - let tensorSize = Int(numElements) * elementSize / 32 // Adjust for block sizes - let tensorData = readData(count: max(tensorSize, Int(numElements) * 4)) // Fallback to 4 bytes per element + let tensorSize: Int + switch info.type { + case .float32, .float16: + tensorSize = Int(numElements) * elementSize + default: + tensorSize = ((Int(numElements) + 31) / 32) * elementSize // Block quantized formats + } + let tensorData = readData(count: tensorSize) + guard tensorData.count == tensorSize else { + throw QuantizationError.invalidModelFormat + } return GGUFTensor( name: info.name, @@ -1002,8 +1050,39 @@ struct Float16: Equatable { } private func floatToHalf(_ value: Float) -> UInt16 { - // Simplified for CI validation speed - just return zero - return 0 + let bits = value.bitPattern + let sign = UInt16((bits >> 16) & 0x8000) + var exponent = Int((bits >> 23) & 0xFF) - 127 + 15 + var mantissa = bits & 0x007F_FFFF + + if exponent <= 0 { + if exponent < -10 { return sign } + mantissa |= 0x0080_0000 + let shift = UInt32(14 - exponent) + var halfMantissa = UInt16(mantissa >> shift) + if ((mantissa >> (shift - 1)) & 1) == 1 { + halfMantissa &+= 1 + } + return sign | halfMantissa + } + + if exponent >= 31 { + return sign | 0x7C00 + } + + var halfMantissa = UInt16(mantissa >> 13) + if ((mantissa >> 12) & 1) == 1 { + halfMantissa &+= 1 + if halfMantissa == 0x0400 { + halfMantissa = 0 + exponent += 1 + if exponent >= 31 { + return sign | 0x7C00 + } + } + } + + return sign | UInt16(exponent << 10) | halfMantissa } private func halfToFloat(_ bits: UInt16) -> Float { @@ -1072,6 +1151,7 @@ enum QuantizationError: Error, LocalizedError { case invalidOutput case insufficientMemory case cancelled + case unsupportedQuantization(type: String) var errorDescription: String? { switch self { @@ -1091,6 +1171,8 @@ enum QuantizationError: Error, LocalizedError { return "Insufficient memory for quantization" case .cancelled: return "Quantization was cancelled" + case .unsupportedQuantization(let type): + return "Quantization type \(type) is not supported in this build" } } } diff --git a/ModelQuantizer/ViewModels/QuantizeViewModel.swift b/ModelQuantizer/ViewModels/QuantizeViewModel.swift index 6cf0fac..f63d1b7 100644 --- a/ModelQuantizer/ViewModels/QuantizeViewModel.swift +++ b/ModelQuantizer/ViewModels/QuantizeViewModel.swift @@ -10,12 +10,11 @@ import Combine @MainActor class QuantizeViewModel: ObservableObject { - func filterModels(query: String) {} // Placeholder to fix compiler error @Published var searchQuery = "" @Published var models: [HFModel] = [] @Published var filteredModels: [HFModel] = [] @Published var selectedModel: HFModel? - @Published var selectedQuantization: QuantizationType = .q4_K_M + @Published var selectedQuantization: QuantizationType = .q4_1 @Published var customContextLength: Int = 4096 @Published var isSearching = false @Published var errorMessage: String? @@ -211,7 +210,7 @@ class QuantizeViewModel: ObservableObject { let existingIds = Set(self.models.map { $0.modelId }) let newModels = popularModels.filter { !existingIds.contains($0.modelId) } self.models.append(contentsOf: newModels) - self.filterModels(query: self.searchQuery) + self.filterLocalModels(query: self.searchQuery) } } catch { // Silently fail - we already have fallback models @@ -295,6 +294,12 @@ class QuantizeViewModel: ObservableObject { func startQuantization() { guard let model = selectedModel else { return } + guard model.architecture.supportedQuantizations.contains(selectedQuantization) else { + errorMessage = "\(selectedQuantization.rawValue) is not supported for \(model.architecture.rawValue) in this build." + showError = true + return + } + // Check if model requires authentication if model.modelId.hasPrefix("meta-llama/") && HuggingFaceAPI.shared.getAuthToken() == nil { errorMessage = "This model requires Hugging Face authentication. Please add your token in Settings." @@ -354,14 +359,12 @@ class QuantizeViewModel: ObservableObject { private func quantizationTypeFromBits(_ bits: Int) -> QuantizationType { switch bits { - case 2: return .q2_K - case 3: return .q3_K_M - case 4: return .q4_K_M - case 5: return .q5_K_M - case 6: return .q6_K + case 4: return .q4_1 + case 5: return .q8_0 + case 6: return .q8_0 case 8: return .q8_0 case 16: return .fp16 - default: return .q4_K_M + default: return .q4_1 } } } diff --git a/ModelQuantizer/Views/QuantizeView.swift b/ModelQuantizer/Views/QuantizeView.swift index edba13e..27658ae 100644 --- a/ModelQuantizer/Views/QuantizeView.swift +++ b/ModelQuantizer/Views/QuantizeView.swift @@ -74,6 +74,10 @@ struct QuantizeView: View { Text("Search Hugging Face and quantize models") .font(.system(size: 16, weight: .medium)) .foregroundStyle(.white.opacity(0.7)) + + Text("Experimental: output quality/compatibility may vary by model.") + .font(.system(size: 12, weight: .medium)) + .foregroundStyle(.orange.opacity(0.9)) } .frame(maxWidth: .infinity, alignment: .leading) } diff --git a/README.md b/README.md index c593e0a..2984c11 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # ModelQuantizer -A fully functional iOS app for quantizing Hugging Face AI models directly on your device. Built with SwiftUI and featuring real ML quantization capabilities. +An experimental iOS app for downloading compatible Hugging Face model files and converting/quantizing them to GGUF on-device. Built with SwiftUI. ![App Icon](ModelQuantizer/Resources/Assets.xcassets/AppIcon.appiconset/appicon-1024.png) ## Features -### Real ML Quantization -- **Actual Quantization**: Converts models to GGUF format with Q2_K through FP32 quantization types +### On-Device Quantization (Current Build) +- **Implemented quantizers**: Q4_0, Q4_1, Q8_0, FP16, FP32 - **Hugging Face Integration**: Search and download models directly from Hugging Face Hub - **Architecture Support**: Llama, Mistral, Qwen2, Gemma, Phi, Falcon, GPT-2, BERT - **Real Progress**: Live progress tracking during download, analysis, and quantization @@ -30,6 +30,13 @@ A fully functional iOS app for quantizing Hugging Face AI models directly on you - **Detailed Info**: View parameters, downloads, likes, and supported quantizations - **One-Tap Quantize**: Start quantization directly from model details +## Current Limitations (Important) + +- This project is **experimental** and does not yet implement full llama.cpp parity. +- Quantization quality and output compatibility can vary by architecture/model checkpoint. +- Only the quantizers listed in this README are implemented in this build. +- Verify generated GGUF files in your target runtime before production use. + ## Requirements - iOS 18.0+ @@ -93,25 +100,23 @@ Some models (like Llama) require authentication: | Type | Bits | Compression | Quality | Use Case | |------|------|-------------|---------|----------| -| Q2_K | 2 | 16× | Low | Entry-level devices | -| Q3_K_M | 3 | 10.7× | Fair | Limited RAM | -| Q4_K_M | 4 | 8× | Good | Balanced (Recommended) | -| Q5_K_M | 5 | 6.4× | Very Good | High-end devices | -| Q6_K | 6 | 5.3× | Excellent | Premium devices | +| Q4_0 | 4 | 8× | Good | Fast 4-bit | +| Q4_1 | 4 | 8× | Better | Better 4-bit accuracy | | Q8_0 | 8 | 4× | Near-Perfect | Maximum quality | | FP16 | 16 | 2× | Original | Research/development | +| FP32 | 32 | 1× | Original | Baseline/uncompressed | ## Device Compatibility ### Ultra (iPhone 16 Pro/Max, iPad Pro M4) - **Max Model Size**: 24GB -- **Recommended**: Q5-Q6 quantization +- **Recommended**: Q8_0 quantization - **Context**: Up to 32K tokens - **Features**: Full Neural Engine, all GPU layers ### Flagship (iPhone 16/15 Pro) - **Max Model Size**: 12GB -- **Recommended**: Q4-Q5 quantization +- **Recommended**: Q4_1 to Q8_0 quantization - **Context**: Up to 16K tokens - **Features**: Neural Engine, most GPU layers @@ -123,13 +128,13 @@ Some models (like Llama) require authentication: ### Mid-Range (iPhone 12/11) - **Max Model Size**: 4GB -- **Recommended**: Q3-Q4 quantization +- **Recommended**: Q4_0 quantization - **Context**: Up to 4K tokens - **Features**: Limited GPU ### Entry-Level - **Max Model Size**: 2GB -- **Recommended**: Q2-Q3 quantization +- **Recommended**: Q4_0 quantization - **Context**: Up to 2K tokens - **Features**: CPU only @@ -146,7 +151,7 @@ Some models (like Llama) require authentication: - Real tensor analysis and quantization - Memory-mapped file I/O - Progressive quantization with checkpointing -- Support for Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, FP16, FP32 +- Support for Q4_0, Q4_1, Q8_0, FP16, FP32 ### Performance - Background processing with progress callbacks