 using System.IO.Compression;
 using System.Linq;
 using System.Runtime.Intrinsics;
+#if IsARM64
+using System.Runtime.Intrinsics.Arm;
+#else
 using System.Runtime.Intrinsics.X86;
+#endif
 using System.Threading;
 using System.Threading.Tasks;
+#if UseZSTD
 using ZstdNet;
+#endif
 using static Hi3Helper.SharpHDiffPatch.StreamExtension;
 
 namespace Hi3Helper.SharpHDiffPatch
@@ -128,7 +134,9 @@ public void GetDecompressStreamPlugin(CompressionMode type, Stream sourceStream,
             decompStream = type switch
             {
                 CompressionMode.nocomp => rawStream,
-                CompressionMode.zstd => new DecompressionStream(rawStream, new DecompressionOptions(null, new Dictionary<ZSTD_dParameter, int>()
+                CompressionMode.zstd =>
+#if UseZSTD
+                new DecompressionStream(rawStream, new DecompressionOptions(null, new Dictionary<ZSTD_dParameter, int>()
                 {
                     /* HACK: The default window log max size is 30. This is unacceptable since the native HPatch implementation
                      *       always uses 31 as the size_t, which is 8 bytes long.
@@ -138,13 +146,16 @@ public void GetDecompressStreamPlugin(CompressionMode type, Stream sourceStream,
                      */
                     { ZSTD_dParameter.ZSTD_d_windowLogMax, 31 }
                 }), 0),
+#else
+                throw new NotSupportedException($"[PatchCore::GetDecompressStreamPlugin] Compression Type: zstd is not supported in this build of SharpHDiffPatch!"),
+#endif
                 CompressionMode.zlib => new DeflateStream(rawStream, System.IO.Compression.CompressionMode.Decompress, true),
                 CompressionMode.bz2 => new CBZip2InputStream(rawStream, false, true),
                 CompressionMode.pbz2 => new CBZip2InputStream(rawStream, true, true),
                 CompressionMode.lzma => CreateLzmaStream(rawStream),
                 CompressionMode.lzma2 => CreateLzmaStream(rawStream),
                 _ => throw new NotSupportedException($"[PatchCore::GetDecompressStreamPlugin] Compression Type: {type} is not supported")
             };
         }
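For reference, the zstd arm above raises ZstdNet's decompression window limit (default window log max of 30) to 31 so that frames produced by the native HPatch toolchain decode correctly, and the whole branch is now compiled only when the UseZSTD symbol is defined. A minimal standalone sketch of that setup, assuming the ZstdNet package is referenced; the helper name is illustrative only and not part of the patch:

```csharp
using System.Collections.Generic;
using System.IO;
using ZstdNet;

internal static class ZstdWindowLogExample
{
    // Illustrative helper (not from the patch): wrap a raw stream in a zstd
    // DecompressionStream with the window log limit raised from 30 to 31,
    // mirroring the ZSTD_d_windowLogMax setting used in the switch arm above.
    internal static Stream OpenZstdDecompressionStream(Stream rawStream)
    {
        var options = new DecompressionOptions(null, new Dictionary<ZSTD_dParameter, int>
        {
            { ZSTD_dParameter.ZSTD_d_windowLogMax, 31 }
        });

        return new DecompressionStream(rawStream, options, 0);
    }
}
```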
 
         private Stream CreateLzmaStream(Stream rawStream)
@@ -167,11 +178,11 @@ private Stream CreateLzmaStream(Stream rawStream)
             }
             else
             {
-                byte[] props = new byte[propLen];
-                rawStream.Read(props);
+                // byte[] props = new byte[propLen];
+                // rawStream.Read(props);
 
                 // return new LzmaDecoderStream(rawStream, props, long.MaxValue);
-                throw new NotSupportedException($"LZMA compression is not supported! only LZMA2 is currently supported!");
+                throw new NotSupportedException($"[PatchCore::CreateLzmaStream] LZMA compression is not supported! only LZMA2 is currently supported!");
             }
         }
 
@@ -472,7 +483,9 @@ internal void TBytesSetRleSingle(ref RLERefClipStruct rleLoader, Stream outCache
                     outCache.Read(sharedBuffer, 0, length);
                     outCache.Position = lastPos;
 
-                    do sharedBuffer[--length] += rleLoader.memSetValue; while (length > 0);
+                SetAddRLESingle:
+                    sharedBuffer[--length] += rleLoader.memSetValue;
+                    if (length > 0) goto SetAddRLESingle;
 
                     outCache.Write(sharedBuffer, 0, (int)memSetStep);
                 }
@@ -486,14 +499,97 @@ internal void TBytesSetRleSingle(ref RLERefClipStruct rleLoader, Stream outCache
             }
         }
 
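The TBytesSetRleSingle change above swaps the one-line do/while for a label and goto; behaviourally it is still "add rleLoader.memSetValue to every byte just read from outCache, then write the block back". A rough equivalent of that inner loop in conventional form, with buffer, count, and addValue standing in for sharedBuffer, length, and rleLoader.memSetValue (placeholder names, not from the patch):

```csharp
internal static class RleSingleAddExample
{
    // Plain-loop equivalent of the SetAddRLESingle label/goto loop above:
    // add the RLE "memset" value to each byte of the window, wrapping on overflow.
    internal static void AddRleValue(byte[] buffer, int count, byte addValue)
    {
        for (int i = 0; i < count; i++)
        {
            unchecked
            {
                buffer[i] += addValue;
            }
        }
    }
}
```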
+        internal unsafe void TBytesSetRleVectorV2(ref RLERefClipStruct rleLoader, Stream outCache, ref long copyLength, int decodeStep, byte* rlePtr, byte[] rleBuffer, int rleBufferIdx, byte* oldPtr)
+        {
+            int len = decodeStep;
+#if IsARM64
+            if (Vector128.IsHardwareAccelerated && len >= Vector128<byte>.Count)
+            {
+            AddVectorArm64_128:
+                len -= Vector128<byte>.Count;
+                Vector128<byte> resultVector = AdvSimd.Add(*(Vector128<byte>*)(rlePtr + len), *(Vector128<byte>*)(oldPtr + len));
+                AdvSimd.Store(rlePtr + len, resultVector);
+                if (len > Vector128<byte>.Count) goto AddVectorArm64_128;
+            }
+            else if (Vector64.IsHardwareAccelerated && len >= Vector64<byte>.Count)
+            {
+            AddVectorArm64_64:
+                len -= Vector64<byte>.Count;
+                Vector64<byte> resultVector = AdvSimd.Add(*(Vector64<byte>*)(rlePtr + len), *(Vector64<byte>*)(oldPtr + len));
+                AdvSimd.Store(rlePtr + len, resultVector);
+                if (len > Vector64<byte>.Count) goto AddVectorArm64_64;
+            }
+#else
+            if (Sse2.IsSupported && len >= Vector128<byte>.Count)
+            {
+            AddVectorSse2:
+                len -= Vector128<byte>.Count;
+                Vector128<byte> resultVector = Sse2.Add(*(Vector128<byte>*)(rlePtr + len), *(Vector128<byte>*)(oldPtr + len));
+                Sse2.Store(rlePtr + len, resultVector);
+                if (len > Vector128<byte>.Count) goto AddVectorSse2;
+            }
+#endif
+
+            if (len >= 4)
+            {
+            AddRemainsFourStep:
+                len -= 4;
+                *(rlePtr + len) += *(oldPtr + len);
+                *(rlePtr + 1 + len) += *(oldPtr + 1 + len);
+                *(rlePtr + 2 + len) += *(oldPtr + 2 + len);
+                *(rlePtr + 3 + len) += *(oldPtr + 3 + len);
+                if (len >= 4) goto AddRemainsFourStep;
+            }
+
+        AddRemainsVectorRLE:
+            if (len == 0) goto WriteAllVectorRLE;
+            *(rlePtr + --len) += *(oldPtr + len);
+            goto AddRemainsVectorRLE;
+
+        WriteAllVectorRLE:
+            outCache.Write(rleBuffer.AsSpan(rleBufferIdx, decodeStep));
+
+            rleLoader.memCopyLength -= decodeStep;
+            copyLength -= decodeStep;
+        }
+
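TBytesSetRleVectorV2 selects AdvSimd or Sse2 intrinsics at compile time through the IsARM64 symbol. As a point of comparison only (not part of this change), the same byte-wise add can be written once against the cross-platform Vector128 helpers available since .NET 7, letting the JIT lower it to SSE2 or AdvSimd; rlePtr, oldPtr, and length are stand-ins for the parameters used above:

```csharp
using System.Runtime.Intrinsics;

internal static class RleVectorAddSketch
{
    // Hypothetical cross-platform variant of the vector add loop above.
    // Vector128.Add/Load/Store require .NET 7+; the scalar tail handles
    // leftover bytes and the case where hardware acceleration is unavailable.
    internal static unsafe void AddOldIntoRle(byte* rlePtr, byte* oldPtr, int length)
    {
        int i = 0;

        if (Vector128.IsHardwareAccelerated)
        {
            for (; i + Vector128<byte>.Count <= length; i += Vector128<byte>.Count)
            {
                Vector128<byte> sum = Vector128.Add(Vector128.Load(rlePtr + i), Vector128.Load(oldPtr + i));
                Vector128.Store(sum, rlePtr + i);
            }
        }

        // Scalar tail for the remaining bytes.
        for (; i < length; i++)
        {
            unchecked
            {
                rlePtr[i] += oldPtr[i];
            }
        }
    }
}
```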
         internal unsafe void TBytesSetRleVector(ref RLERefClipStruct rleLoader, Stream outCache, ref long copyLength, int decodeStep, byte* rlePtr, byte[] rleBuffer, int rleBufferIdx, byte* oldPtr)
         {
             int offset = 0;
             long offsetRemained = 0;
 
+#if IsARM64
+            if (Vector128.IsHardwareAccelerated && decodeStep >= Vector128<byte>.Count)
+            {
+                offsetRemained = decodeStep % Vector128<byte>.Count;
+
+            AddVectorArm64_128:
+                Vector128<byte>* rleVector = (Vector128<byte>*)(rlePtr + offset);
+                Vector128<byte>* oldVector = (Vector128<byte>*)(oldPtr + offset);
+                Vector128<byte> resultVector = AdvSimd.Add(*rleVector, *oldVector);
+
+                AdvSimd.Store(rlePtr + offset, resultVector);
+                offset += Vector128<byte>.Count;
+                if (offset < decodeStep - offsetRemained) goto AddVectorArm64_128;
+            }
+            else if (Vector64.IsHardwareAccelerated && decodeStep >= Vector64<byte>.Count)
+            {
+                offsetRemained = decodeStep % Vector64<byte>.Count;
+
+            AddVectorArm64_64:
+                Vector64<byte>* rleVector = (Vector64<byte>*)(rlePtr + offset);
+                Vector64<byte>* oldVector = (Vector64<byte>*)(oldPtr + offset);
+                Vector64<byte> resultVector = AdvSimd.Add(*rleVector, *oldVector);
+
+                AdvSimd.Store(rlePtr + offset, resultVector);
+                offset += Vector64<byte>.Count;
+                if (offset < decodeStep - offsetRemained) goto AddVectorArm64_64;
+            }
+#else
             if (Sse2.IsSupported && decodeStep >= Vector128<byte>.Count)
             {
                 offsetRemained = decodeStep % Vector128<byte>.Count;
+
             AddVectorSse2:
                 Vector128<byte>* rleVector = (Vector128<byte>*)(rlePtr + offset);
                 Vector128<byte>* oldVector = (Vector128<byte>*)(oldPtr + offset);
@@ -503,6 +599,7 @@ internal unsafe void TBytesSetRleVector(ref RLERefClipStruct rleLoader, Stream o
                 offset += Vector128<byte>.Count;
                 if (offset < decodeStep - offsetRemained) goto AddVectorSse2;
             }
+#endif
 
             if (offsetRemained != 0 && (offsetRemained % 4) == 0)
             {