import unittest, os, strutils, tables, sets
import ../src/nimpak/cas
import ../src/nip/types

# Tests for the deduplication metrics reported by `getDeduplicationStats`
# when chunks stored in one CAS are referenced by packages of different
# formats (NPK, NIP, NEXTER).
#
# Every test starts from a fresh CAS that holds three distinct 3-byte chunks;
# the tests differ only in which (format, package) pairs reference which chunk.

suite "Cross-Format Deduplication Metrics Tests":

  var
    cas: CasManager
    # `/` binds tighter than `&`, so the process id is appended to the last
    # path component: <tmp>/nip_dedup_test_<pid>.
    testRoot = getTempDir() / "nip_dedup_test_" & $getCurrentProcessId()

    # Test data
    chunk1 = @[1.byte, 2.byte, 3.byte] # 3 bytes
    chunk2 = @[4.byte, 5.byte, 6.byte] # 3 bytes
    chunk3 = @[7.byte, 8.byte, 9.byte] # 3 bytes

    # Hashes of the stored chunks, refreshed by `setup` before every test.
    hash1: string
    hash2: string
    hash3: string

  setup:
    # Fresh CAS rooted in a per-process temp directory.
    createDir(testRoot)
    cas = initCasManager(testRoot)

    # Store chunks
    let res1 = cas.storeObject(chunk1)
    let res2 = cas.storeObject(chunk2)
    let res3 = cas.storeObject(chunk3)

    # `get()` is called without a prior check: a failed store surfaces here
    # as an error in the test instead of propagating an empty hash.
    hash1 = res1.get().hash
    hash2 = res2.get().hash
    hash3 = res3.get().hash

  teardown:
    # Drop the whole CAS directory so no state leaks between tests.
    removeDir(testRoot)

  test "Basic Deduplication Stats":
    # Scenario:
    #   NPK    uses chunk1, chunk2
    #   NIP    uses chunk2, chunk3
    #   NEXTER uses chunk1, chunk3
    #
    #   chunk1: NPK, NEXTER (ref count 2)
    #   chunk2: NPK, NIP    (ref count 2)
    #   chunk3: NIP, NEXTER (ref count 2)

    discard cas.addReference(hash1, NPK, "pkg1")
    discard cas.addReference(hash2, NPK, "pkg1")

    discard cas.addReference(hash2, NIP, "pkg2")
    discard cas.addReference(hash3, NIP, "pkg2")

    discard cas.addReference(hash1, NEXTER, "pkg3")
    discard cas.addReference(hash3, NEXTER, "pkg3")

    let statsResult = cas.getDeduplicationStats()
    check statsResult.isOk

    let stats = statsResult.get()

    # Physical size: 3 chunks * 3 bytes = 9 bytes
    check stats.totalPhysicalSize == 9

    # Logical size:
    #   pkg1: 3 + 3 = 6
    #   pkg2: 3 + 3 = 6
    #   pkg3: 3 + 3 = 6
    #   Total: 18 bytes
    check stats.totalLogicalSize == 18

    # Deduplication ratio: 18 / 9 = 2.0 (exactly representable as a float)
    check stats.deduplicationRatio == 2.0

    # Shared chunks: all 3 are referenced by more than one format
    check stats.sharedChunks == 3

    # Savings: 18 - 9 = 9 bytes
    check stats.savings == 9

    # Format overlap, one entry per set of formats sharing a chunk.
    # The expected keys list the format names alphabetically, joined by "-":
    #   chunk1: NPK + NEXTER -> "NEXTER-NPK"
    #   chunk2: NPK + NIP    -> "NIP-NPK"
    #   chunk3: NIP + NEXTER -> "NEXTER-NIP"
    check stats.formatOverlap.hasKey("NEXTER-NPK")
    check stats.formatOverlap["NEXTER-NPK"] == 1

    check stats.formatOverlap.hasKey("NIP-NPK")
    check stats.formatOverlap["NIP-NPK"] == 1

    check stats.formatOverlap.hasKey("NEXTER-NIP")
    check stats.formatOverlap["NEXTER-NIP"] == 1

  test "No Deduplication":
    # Scenario: every format references its own, unique chunk.
    discard cas.addReference(hash1, NPK, "pkg1")
    discard cas.addReference(hash2, NIP, "pkg2")
    discard cas.addReference(hash3, NEXTER, "pkg3")

    let statsResult = cas.getDeduplicationStats()
    check statsResult.isOk
    let stats = statsResult.get()

    # Nothing is shared, so logical and physical sizes coincide.
    check stats.totalPhysicalSize == 9
    check stats.totalLogicalSize == 9
    check stats.deduplicationRatio == 1.0
    check stats.sharedChunks == 0
    check stats.savings == 0
    check stats.formatOverlap.len == 0

  test "High Redundancy":
    # Scenario: all three formats reference the same chunk.
    discard cas.addReference(hash1, NPK, "pkg1")
    discard cas.addReference(hash1, NIP, "pkg2")
    discard cas.addReference(hash1, NEXTER, "pkg3")

    let statsResult = cas.getDeduplicationStats()
    check statsResult.isOk
    let stats = statsResult.get()

    # chunk2 and chunk3 are stored in the CAS by `setup` but are not
    # referenced in this test. The expected values below rely on
    # unreferenced chunks being excluded from the stats (per the original
    # author's note, getDeduplicationStats iterates over `cas.formatRefs`),
    # so only chunk1 counts towards the physical size.
    check stats.totalPhysicalSize == 3
    check stats.totalLogicalSize == 9 # 3 refs * 3 bytes
    check stats.deduplicationRatio == 3.0
    check stats.sharedChunks == 1

    # Savings: 9 - 3 = 6 bytes (same definition as in the tests above)
    check stats.savings == 6

    # One chunk shared by all three formats -> a single three-way entry.
    check stats.formatOverlap.hasKey("NEXTER-NIP-NPK")
    check stats.formatOverlap["NEXTER-NIP-NPK"] == 1