From 5ba50c26a94b9c76d20b6c2bdffe7fac6b091cd2 Mon Sep 17 00:00:00 2001
From: Seth Hillbrand <seth@kipro-pcb.com>
Date: Mon, 19 Aug 2024 14:28:55 -0700
Subject: [PATCH] Update embedded files hash to use Murmur3

SHA256 is fine for one-offs but for large libraries where we might be
running the hash on hundreds of files, the speed difference is
appreciable.  We don't require cryptographic hashing, just a check that
the original file hasn't been corrupted so Murmur3 satisfies our basic
requirement.
---
 common/embedded_files.cpp                     | 33 ++++++++++++-------
 eeschema/sch_file_versions.h                  |  7 ++--
 eeschema/sch_screen.cpp                       |  2 +-
 include/embedded_files.h                      | 25 +++++++++++---
 libs/kimath/include/mmh3_hash.h               | 32 ++++++++++++++++++
 pcbnew/board.cpp                              |  2 +-
 .../pcb_io/kicad_sexpr/pcb_io_kicad_sexpr.h   |  3 +-
 .../common/test_embedded_file_compress.cpp    | 24 ++++++++++----
 8 files changed, 100 insertions(+), 28 deletions(-)

diff --git a/common/embedded_files.cpp b/common/embedded_files.cpp
index 630fca0fa6..c88326c04b 100644
--- a/common/embedded_files.cpp
+++ b/common/embedded_files.cpp
@@ -33,6 +33,7 @@
 
 #include <embedded_files.h>
 #include <kiid.h>
+#include <mmh3_hash.h>
 #include <paths.h>
 
 
@@ -199,7 +200,7 @@ void EMBEDDED_FILES::WriteEmbeddedFiles( OUTPUTFORMATTER& aOut, int aNestLevel,
             aOut.Print( aNestLevel + 2, ")\n" ); // Close data
         }
 
-        aOut.Print( aNestLevel + 2, "(checksum \"%s\")\n", file.data_sha.c_str() );
+        aOut.Print( aNestLevel + 2, "(checksum \"%s\")\n", file.data_hash.c_str() );
         aOut.Print( aNestLevel + 1, ")\n" ); // Close file
     }
 
@@ -232,7 +233,9 @@ EMBEDDED_FILES::RETURN_CODE EMBEDDED_FILES::CompressAndEncode( EMBEDDED_FILE& aF
         return RETURN_CODE::OUT_OF_MEMORY;
     }
 
-    picosha2::hash256_hex_string( aFile.decompressedData, aFile.data_sha );
+    MMH3_HASH hash( EMBEDDED_FILES::Seed() );
+    hash.add( aFile.decompressedData );
+    aFile.data_hash = hash.digest().ToString();
 
     return RETURN_CODE::OK;
 }
@@ -285,11 +288,19 @@ EMBEDDED_FILES::RETURN_CODE EMBEDDED_FILES::DecompressAndDecode( EMBEDDED_FILE&
     }
 
     aFile.decompressedData.resize( decompressedSize );
+    std::string test_hash;
+    std::string new_hash;
 
-    std::string new_sha;
-    picosha2::hash256_hex_string( aFile.decompressedData, new_sha );
+    MMH3_HASH hash( EMBEDDED_FILES::Seed() );
+    hash.add( aFile.decompressedData );
+    new_hash = hash.digest().ToString();
 
-    if( new_sha != aFile.data_sha )
+    if( aFile.data_hash.length() == 64 )
+        picosha2::hash256_hex_string( aFile.decompressedData, test_hash );
+    else
+        test_hash = new_hash;
+
+    if( test_hash != aFile.data_hash )
     {
         wxLogTrace( wxT( "KICAD_EMBED" ),
                     wxT( "%s:%s:%d\n * Checksum error in embedded file '%s'" ),
@@ -298,6 +309,8 @@ EMBEDDED_FILES::RETURN_CODE EMBEDDED_FILES::DecompressAndDecode( EMBEDDED_FILE&
         return RETURN_CODE::CHECKSUM_ERROR;
     }
 
+    aFile.data_hash = new_hash;
+
     return RETURN_CODE::OK;
 }
 
@@ -355,7 +368,7 @@ void EMBEDDED_FILES_PARSER::ParseEmbedded( EMBEDDED_FILES* aFiles )
                 if( !IsSymbol( token ) )
                     Expecting( "checksum data" );
 
-                file->data_sha = CurStr();
+                file->data_hash = CurStr();
                 NeedRIGHT();
                 break;
 
@@ -434,9 +447,7 @@ void EMBEDDED_FILES_PARSER::ParseEmbedded( EMBEDDED_FILES* aFiles )
     {
         if( !file->compressedEncodedData.empty() )
         {
-            EMBEDDED_FILES::DecompressAndDecode( *file );
-
-            if( !file->Validate() )
+            if( EMBEDDED_FILES::DecompressAndDecode( *file ) == EMBEDDED_FILES::RETURN_CODE::CHECKSUM_ERROR )
                 THROW_PARSE_ERROR( "Checksum error in embedded file " + file->name, CurSource(),
                                     CurLine(), CurLineNumber(), CurOffset() );
         }
@@ -469,9 +480,9 @@ wxFileName EMBEDDED_FILES::GetTemporaryFileName( const wxString& aName ) const
 
     wxFileName inputName( aName );
 
-    // Store the cache file name using the data SHA to allow for shared data between
+    // Store the cache file name using the data hash to allow for shared data between
     // multiple projects using the same files as well as deconflicting files with the same name
-    cacheFile.SetName( "kicad_embedded_" + it->second->data_sha );
+    cacheFile.SetName( "kicad_embedded_" + it->second->data_hash );
     cacheFile.SetExt( inputName.GetExt() );
 
     if( cacheFile.FileExists() && cacheFile.IsFileReadable() )
diff --git a/eeschema/sch_file_versions.h b/eeschema/sch_file_versions.h
index e764895f08..3fbb9e275f 100644
--- a/eeschema/sch_file_versions.h
+++ b/eeschema/sch_file_versions.h
@@ -50,8 +50,8 @@
 //#define SEXPR_SYMBOL_LIB_FILE_VERSION  20220914   // Don't save property ID
 //#define SEXPR_SYMBOL_LIB_FILE_VERSION  20230620   // ki_description -> Description Field
 //#define SEXPR_SYMBOL_LIB_FILE_VERSION  20231120   // generator_version; V8 cleanups
-#define  SEXPR_SYMBOL_LIB_FILE_VERSION  20240529   // Embedded Files
-
+//#define SEXPR_SYMBOL_LIB_FILE_VERSION  20240529   // Embedded Files
+#define  SEXPR_SYMBOL_LIB_FILE_VERSION  20240819    // Embedded Files - Update hash algorithm to Murmur3
 /**
  * Schematic file version.
  */
@@ -109,4 +109,5 @@
 //#define SEXPR_SCHEMATIC_FILE_VERSION 20240602  // Sheet attributes
 //#define SEXPR_SCHEMATIC_FILE_VERSION 20240620  // Embedded Files
 //#define SEXPR_SCHEMATIC_FILE_VERSION 20240716  // Multiple netclass assignments
-#define   SEXPR_SCHEMATIC_FILE_VERSION 20240812  // Netclass color highlighting
+//#define SEXPR_SCHEMATIC_FILE_VERSION 20240812  // Netclass color highlighting
+#define SEXPR_SCHEMATIC_FILE_VERSION 20240819    // Embedded Files - Update hash algorithm to Murmur3
diff --git a/eeschema/sch_screen.cpp b/eeschema/sch_screen.cpp
index b2b87eeeb0..b7a4535209 100644
--- a/eeschema/sch_screen.cpp
+++ b/eeschema/sch_screen.cpp
@@ -1502,7 +1502,7 @@ void SCH_SCREEN::FixupEmbeddedData()
             {
                 embeddedFile->compressedEncodedData = file->compressedEncodedData;
                 embeddedFile->decompressedData = file->decompressedData;
-                embeddedFile->data_sha = file->data_sha;
+                embeddedFile->data_hash = file->data_hash;
                 embeddedFile->is_valid = file->is_valid;
             }
         }
diff --git a/include/embedded_files.h b/include/embedded_files.h
index 4df886f67a..5d68fb09ff 100644
--- a/include/embedded_files.h
+++ b/include/embedded_files.h
@@ -26,9 +26,10 @@
 #include <wx/filename.h>
 
 #include <embedded_files_lexer.h>
-#include <wildcards_and_files_ext.h>
-#include <richio.h>
+#include <mmh3_hash.h>
 #include <picosha2.h>
+#include <richio.h>
+#include <wildcards_and_files_ext.h>
 
 class EMBEDDED_FILES
 {
@@ -50,11 +51,22 @@ public:
         {}
 
         bool Validate()
+        {
+            MMH3_HASH hash( EMBEDDED_FILES::Seed() );
+            hash.add( decompressedData );
+
+            is_valid = ( hash.digest().ToString() == data_hash );
+            return is_valid;
+        }
+
+        // This is the old way of validating the file.  It is deprecated and retained only
+        // to validate files that were previously embedded.
+        bool Validate_SHA256()
         {
             std::string new_sha;
             picosha2::hash256_hex_string( decompressedData, new_sha );
 
-            is_valid = ( new_sha == data_sha );
+            is_valid = ( new_sha == data_hash );
             return is_valid;
         }
 
@@ -68,7 +80,7 @@ public:
         bool              is_valid;
         std::string       compressedEncodedData;
         std::vector<char> decompressedData;
-        std::string       data_sha;
+        std::string       data_hash;
     };
 
     enum class RETURN_CODE : int
@@ -216,6 +228,11 @@ public:
         return m_embedFonts;
     }
 
+    static uint32_t Seed() // Fixed seed handed to MMH3_HASH when hashing embedded file data
+    {
+        return 0xABBA2345; // NOTE(review): presumably must never change, or stored checksums stop matching -- confirm
+    }
+
 private:
     std::map<wxString, EMBEDDED_FILE*> m_files;
     std::vector<wxString>              m_fontFiles;
diff --git a/libs/kimath/include/mmh3_hash.h b/libs/kimath/include/mmh3_hash.h
index 35d4e6a700..9f8fb9db53 100644
--- a/libs/kimath/include/mmh3_hash.h
+++ b/libs/kimath/include/mmh3_hash.h
@@ -70,6 +70,38 @@ public:
         len = 0;
     }
 
+    FORCE_INLINE void addData( const uint8_t* data, size_t length ) // Feed `length` raw bytes from `data` into the hash
+    {
+        size_t remaining = length;
+        // Full 16-byte chunks: copy each one into the block buffer and hash it immediately.
+        while( remaining >= 16 )
+        {
+            memcpy( blocks, data, 16 );
+            hashBlock();
+            data += 16;
+            remaining -= 16;
+            len += 16;
+        }
+        // Leftover (< 16 bytes): staged in the block buffer; hashBlock() is not called for it here.
+        if( remaining > 0 )
+        {
+            memcpy( blocks, data, remaining ); // NOTE(review): writes at offset 0; assumes len % 16 == 0 on entry -- confirm
+            size_t padding = 4 - ( remaining + 4 ) % 4; // 1..4 zero bytes; evaluates to 4 (not 0) when remaining % 4 == 0
+            memset( reinterpret_cast<uint8_t*>( blocks ) + remaining, 0, padding );
+            len += remaining + padding; // the zero padding is counted in len
+        }
+    }
+
+    FORCE_INLINE void add( const std::string& input ) // Hash the string's bytes by forwarding them to addData()
+    {
+        addData( reinterpret_cast<const uint8_t*>( input.data() ), input.length() );
+    }
+
+    FORCE_INLINE void add( const std::vector<char>& input ) // Hash the vector's bytes by forwarding them to addData()
+    {
+        addData( reinterpret_cast<const uint8_t*>( input.data() ), input.size() );
+    }
+
     FORCE_INLINE void add( int32_t input )
     {
         blocks[( len % 16 ) / 4] = input;
diff --git a/pcbnew/board.cpp b/pcbnew/board.cpp
index 23e262e9e5..b1461569d9 100644
--- a/pcbnew/board.cpp
+++ b/pcbnew/board.cpp
@@ -974,7 +974,7 @@ void BOARD::FixupEmbeddedData()
             {
                 embeddedFile->compressedEncodedData = file->compressedEncodedData;
                 embeddedFile->decompressedData = file->decompressedData;
-                embeddedFile->data_sha = file->data_sha;
+                embeddedFile->data_hash = file->data_hash;
                 embeddedFile->is_valid = file->is_valid;
             }
         }
diff --git a/pcbnew/pcb_io/kicad_sexpr/pcb_io_kicad_sexpr.h b/pcbnew/pcb_io/kicad_sexpr/pcb_io_kicad_sexpr.h
index 83d3baaf82..3ecfb1d3ec 100644
--- a/pcbnew/pcb_io/kicad_sexpr/pcb_io_kicad_sexpr.h
+++ b/pcbnew/pcb_io/kicad_sexpr/pcb_io_kicad_sexpr.h
@@ -160,7 +160,8 @@ class PCB_IO_KICAD_SEXPR;   // forward decl
 //#define SEXPR_BOARD_FILE_VERSION    20240609  // Add 'tenting' keyword
 //#define SEXPR_BOARD_FILE_VERSION    20240617  // Table angles
 //#define SEXPR_BOARD_FILE_VERSION    20240703  // User layer types
-#define SEXPR_BOARD_FILE_VERSION      20240706  // Embedded Files
+//#define SEXPR_BOARD_FILE_VERSION    20240706  // Embedded Files
+#define SEXPR_BOARD_FILE_VERSION      20240819  // Embedded Files - Update hash algorithm to Murmur3
 
 #define BOARD_FILE_HOST_VERSION       20200825  ///< Earlier files than this include the host tag
 #define LEGACY_ARC_FORMATTING         20210925  ///< These were the last to use old arc formatting
diff --git a/qa/tests/common/test_embedded_file_compress.cpp b/qa/tests/common/test_embedded_file_compress.cpp
index 56a1731bb1..6e51797545 100644
--- a/qa/tests/common/test_embedded_file_compress.cpp
+++ b/qa/tests/common/test_embedded_file_compress.cpp
@@ -19,7 +19,7 @@
 
 #include <magic_enum.hpp>
 #include <boost/test/unit_test.hpp>
-#include <picosha2.h>
+#include <mmh3_hash.h>
 #include <embedded_files.h>
 
 #include <random>
@@ -34,7 +34,9 @@ BOOST_AUTO_TEST_CASE( CompressAndEncode_OK )
     std::string data = "Hello, World!";
     file.decompressedData.assign(data.begin(), data.end());
 
-    picosha2::hash256_hex_string(file.decompressedData, file.data_sha);
+    MMH3_HASH hash( EMBEDDED_FILES::Seed() );
+    hash.add( file.decompressedData );
+    file.data_hash = hash.digest().ToString();
 
     EMBEDDED_FILES::RETURN_CODE result = EMBEDDED_FILES::CompressAndEncode(file);
     BOOST_CHECK_EQUAL(result, EMBEDDED_FILES::RETURN_CODE::OK);
@@ -47,7 +49,9 @@ BOOST_AUTO_TEST_CASE( DecompressAndDecode_OK )
     std::string data = "Hello, World!";
     file.decompressedData.assign( data.begin(), data.end() );
 
-    picosha2::hash256_hex_string( file.decompressedData, file.data_sha );
+    MMH3_HASH hash( EMBEDDED_FILES::Seed() );
+    hash.add( file.decompressedData );
+    file.data_hash = hash.digest().ToString();
 
     EMBEDDED_FILES::RETURN_CODE result = EMBEDDED_FILES::CompressAndEncode( file );
     BOOST_CHECK_EQUAL( result, EMBEDDED_FILES::RETURN_CODE::OK );
@@ -64,7 +68,9 @@ BOOST_AUTO_TEST_CASE( DecompressAndDecode_OK )
 
     file.decompressedData.assign( data.begin(), data.end() );
 
-    picosha2::hash256_hex_string( file.decompressedData, file.data_sha );
+    hash.reset();
+    hash.add( file.decompressedData );
+    file.data_hash = hash.digest().ToString();
 
     result = EMBEDDED_FILES::CompressAndEncode( file );
     BOOST_CHECK_EQUAL( result, EMBEDDED_FILES::RETURN_CODE::OK );
@@ -80,7 +86,9 @@ BOOST_AUTO_TEST_CASE( DecompressAndDecode_OK )
         data += static_cast<char>( i % 256 );
 
     file.decompressedData.assign( data.begin(), data.end() );
-    picosha2::hash256_hex_string( file.decompressedData, file.data_sha );
+    hash.reset();
+    hash.add( file.decompressedData );
+    file.data_hash = hash.digest().ToString();
 
     result = EMBEDDED_FILES::CompressAndEncode( file );
     BOOST_CHECK_EQUAL( result, EMBEDDED_FILES::RETURN_CODE::OK );
@@ -99,7 +107,9 @@ BOOST_AUTO_TEST_CASE( DecompressAndDecode_OK )
         data += static_cast<char>( rng() % 256 );
 
     file.decompressedData.assign( data.begin(), data.end() );
-    picosha2::hash256_hex_string( file.decompressedData, file.data_sha );
+    hash.reset();
+    hash.add( file.decompressedData );
+    file.data_hash = hash.digest().ToString();
 
     result = EMBEDDED_FILES::CompressAndEncode( file );
     BOOST_CHECK_EQUAL( result, EMBEDDED_FILES::RETURN_CODE::OK );
@@ -120,7 +130,7 @@ BOOST_AUTO_TEST_CASE( DecompressAndDecode_ChecksumError )
     BOOST_CHECK_EQUAL(result, EMBEDDED_FILES::RETURN_CODE::OK);
 
     // Modify the checksum
-    file.data_sha[0] = 'x';
+    file.data_hash[0] = 'x';
 
     result = EMBEDDED_FILES::DecompressAndDecode(file);
     BOOST_CHECK_EQUAL(result, EMBEDDED_FILES::RETURN_CODE::CHECKSUM_ERROR);