module squiz_box.box.tar;

import squiz_box.box;
import squiz_box.priv;
import squiz_box.squiz;

import std.datetime.systime;
import std.exception;
import std.path;
import std.range;

/// `BoxAlgo` implementation for ".tar" files
class TarAlgo : BoxAlgo
{
    /// Boxes `entries` with `boxTar` and wraps the result in a range object.
    ByteRange box(BoxEntryRange entries, size_t chunkSize = defaultChunkSize)
    {
        return inputRangeObject(boxTar(entries, chunkSize));
    }

    /// Unboxes `bytes` with `unboxTar` and wraps the result in a range object.
    UnboxEntryRange unbox(ByteRange bytes)
    {
        return inputRangeObject(unboxTar(bytes));
    }
}

/// `BoxAlgo` implementation for ".tar.gz" files
class TarGzAlgo : BoxAlgo
{
    /// Boxes `entries` with `boxTarGz` and wraps the result in a range object.
    ByteRange box(BoxEntryRange entries, size_t chunkSize = defaultChunkSize)
    {
        return inputRangeObject(boxTarGz(entries, chunkSize));
    }

    /// Unboxes `bytes` with `unboxTarGz` and wraps the result in a range object.
    UnboxEntryRange unbox(ByteRange bytes)
    {
        return inputRangeObject(unboxTarGz(bytes));
    }
}

version (HaveSquizBzip2)
{
    /// `BoxAlgo` implementation for ".tar.bz2" files
    class TarBzip2Algo : BoxAlgo
    {
        /// Boxes `entries` with `boxTarBzip2` and wraps the result in a range object.
        ByteRange box(BoxEntryRange entries, size_t chunkSize = defaultChunkSize)
        {
            return inputRangeObject(boxTarBzip2(entries, chunkSize));
        }

        /// Unboxes `bytes` with `unboxTarBzip2` and wraps the result in a range object.
        UnboxEntryRange unbox(ByteRange bytes)
        {
            return inputRangeObject(unboxTarBzip2(bytes));
        }
    }
}

version (HaveSquizLzma)
{
    /// `BoxAlgo` implementation for ".tar.xz" files
    class TarXzAlgo : BoxAlgo
    {
        /// Boxes `entries` with `boxTarXz` and wraps the result in a range object.
        ByteRange box(BoxEntryRange entries, size_t chunkSize = defaultChunkSize)
        {
            return inputRangeObject(boxTarXz(entries, chunkSize));
        }

        /// Unboxes `bytes` with `unboxTarXz` and wraps the result in a range object.
        UnboxEntryRange unbox(ByteRange bytes)
        {
            return inputRangeObject(unboxTarXz(bytes));
        }
    }
}

/// Returns a `.tar`, `.tar.gz`, `.tar.bz2` or `.tar.xz` archive as a byte range
/// corresponding to the entries in input.
/// chunkSize must be a multiple of 512.
auto boxTar(I)(I entries, size_t chunkSize = defaultChunkSize)
        if (isBoxEntryRange!I)
in (chunkSize >= 512 && chunkSize % 512 == 0)
{
    return TarBox!I(entries, chunkSize);
}

/// ditto
auto boxTarGz(I)(I entries, size_t chunkSize = defaultChunkSize)
{
    return boxTar(entries, chunkSize).deflateGz(chunkSize);
}

version (HaveSquizBzip2)
{
    /// ditto
    auto boxTarBzip2(I)(I entries, size_t chunkSize = defaultChunkSize)
    {
        return boxTar(entries, chunkSize).compressBzip2(chunkSize);
    }
}

version (HaveSquizLzma)
{
    /// ditto
    auto boxTarXz(I)(I entries, size_t chunkSize = defaultChunkSize)
    {
        return boxTar(entries, chunkSize).compressXz(chunkSize);
    }
}

// Byte range producing the TAR stream for a range of entries.
// Each `front` is a slice of an internal buffer of `chunkSize` bytes
// that is overwritten by the following `popFront`.
private struct TarBox(I)
{
    // init data
    I entriesInput;
    ubyte[] buffer;

    // current chunk (front data)
    ubyte[] chunk; // data ready
    ubyte[] avail; // space available in buffer (after chunk)

    // current entry being processed
    BoxEntry entry;
    ByteRange entryChunks;

    // footer is two empty blocks
    size_t footer; // number of footer bytes emitted so far
    enum footerLen = 1024;

    // Allocates the buffer and computes the first chunk.
    // Throws if chunkSize is not a non-zero multiple of 512.
    this(I entries, size_t chunkSize)
    {
        // a zero chunkSize would leave `buffer` null, which `empty` treats as `.init`
        // and the archive would silently come out empty.
        enforce(
            chunkSize >= 512 && chunkSize % 512 == 0,
            "chunk size must be a non-zero multiple of 512"
        );
        entriesInput = entries;
        buffer = new ubyte[chunkSize];
        avail = buffer;
        popFront();
    }

    @property bool empty()
    {
        // handle .init
        if (!buffer)
            return true;

        // more files to be processed
        if (!entriesInput.empty)
            return false;

        // current entry not exhausted
        if (hasEntryChunks())
            return false;

        // some unconsumed flying data
        if (chunk.length)
            return false;

        return true;
    }

    @property ByteChunk front()
    {
        return chunk;
    }

    void popFront()
    {
        if (!moreToRead())
        {
            // all entries were written: emit the footer, then become empty
            if (footer >= footerLen)
            {
                chunk = null;
            }
            else
            {
                import std.algorithm : min;

                const len = min(buffer.length, footerLen - footer);
                buffer[0 .. len] = 0;
                chunk = buffer[0 .. len];
                footer += len;
            }
            return;
        }

        // fill the buffer block after block until it is full or input is exhausted
        while (avail.length && moreToRead)
        {
            nextBlock();
            chunk = buffer[0 .. $ - avail.length];
        }
        avail = buffer;
    }

    private bool hasEntryChunks()
    {
        return entryChunks && !entryChunks.empty;
    }

    private bool moreToRead()
    {
        return !entriesInput.empty || hasEntryChunks();
    }

    // Writes either the header of the next entry or the next 512 bytes (at most)
    // of the current entry content at the start of `avail`.
    private void nextBlock()
    in (avail.length >= 512)
    {
        if (!entry || !hasEntryChunks())
        {
            enforce(!entriesInput.empty);
            entry = entriesInput.front;
            entriesInput.popFront();
            avail = TarHeader.fillWith(entry, avail);
            entryChunks = entry.byChunk(512);
        }
        else
        {
            auto filled = entryChunks.front;
            avail[0 .. filled.length] = filled;
            avail = avail[filled.length .. $];
            entryChunks.popFront();
            if (entryChunks.empty)
            {
                // last block of the entry: zero-pad up to the next 512 boundary
                const pad = avail.length % 512;
                avail[0 .. pad] = 0;
                avail = avail[pad .. $];
            }
        }
    }
}

static assert(isByteRange!(TarBox!(BoxEntry[])));

/// Returns a range of entries from a `.tar`, `.tar.gz`, `.tar.bz2` or `.tar.xz` formatted byte range
auto unboxTar(I)(I input) if (isByteRange!I)
{
    auto dataInput = new ByteRangeCursor!I(input);
    return TarUnbox(dataInput);
}

/// ditto
auto unboxTarGz(I)(I input)
{
    return input.inflateGz().unboxTar();
}

version (HaveSquizBzip2)
{
    /// ditto
    auto unboxTarBzip2(I)(I input)
    {
        return input.decompressBzip2().unboxTar();
    }
}

version (HaveSquizLzma)
{
    /// ditto
    auto unboxTarXz(I)(I input)
    {
        return input.decompressXz().unboxTar();
    }
}

// Input range of `UnboxEntry` reading TAR headers from a cursor.
private struct TarUnbox
{
    private Cursor _input;

    // current header data
    private size_t _next; // cursor position of the block following the current entry
    private ubyte[] _block; // the 512 bytes of the last header read
    private UnboxEntry _entry;

    this(Cursor input)
    {
        _input = input;
        _block = new ubyte[512];

        // file with zero bytes is a valid tar file
        if (!_input.eoi)
            readHeaderBlock();
    }

    @property bool empty()
    {
        return _input.eoi;
    }

    @property UnboxEntry front()
    {
        return _entry;
    }

    void popFront()
    {
        assert(_input.pos <= _next);

        if (_input.pos < _next)
        {
            // the current entry was not fully read, we move the stream forward
            // up to the next header
            const dist = _next - _input.pos;
            _input.ffw(dist);
        }
        readHeaderBlock();
    }

    // Reads the next header block and builds `_entry` from it.
    // An all-zero block is the end-of-archive marker: the input is then
    // fast-forwarded to its end, which makes the range empty.
    // Throws on truncated input or checksum mismatch.
    private void readHeaderBlock()
    {
        import std.conv : to;

        enforce(_input.read(_block).length == 512, "Unexpected end of input");

        TarHeader* th = cast(TarHeader*) _block.ptr;

        const computed = th.unsignedChecksum();
        const checksum = parseOctalString(th.chksum);

        if (computed == 256 && checksum == 0)
        {
            // this is an empty header (only zeros)
            // indicates end of archive

            while (!_input.eoi)
            {
                _input.ffw(512);
            }
            return;
        }

        enforce(
            checksum == computed,
            "Invalid TAR checksum at 0x" ~ (
                _input.pos - 512 + th.chksum.offsetof)
                .to!string(16) ~
                "\nExpected " ~ computed.to!string ~ " but found " ~ checksum.to!string,
        );

        if (th.typeflag == Typeflag.posixExtended || th.typeflag == Typeflag.extended)
        {
            // skipping extended Tar headers
            const sz = next512(parseOctalString!size_t(th.size));
            _input.ffw(sz);
            readHeaderBlock();
            return;
        }

        TarEntryInfo info;

        // ustar: when prefix is not empty, the path is prefix ~ '/' ~ name.
        // Fields are not necessarily NUL terminated, parseString is bounded by the field.
        const prefix = parseString(th.prefix);
        const name = parseString(th.name);
        info.path = prefix.length ? (prefix ~ "/" ~ name).idup : name.idup;

        info.type = toEntryType(th.typeflag);
        info.linkname = parseString(th.linkname).idup;
        info.size = parseOctalString!size_t(th.size);
        info.entrySize = 512 + next512(info.size);
        info.timeLastModified = SysTime(unixTimeToStdTime(parseOctalString!ulong(th.mtime)));
        version (Posix)
        {
            // tar mode contains stat.st_mode & 07777.
            // we have to add the missing flags corresponding to file type
            // (and by no way tar mode is meaningful on Windows)
            const filetype = posixModeFileType(th.typeflag);
            info.attributes = parseOctalString(th.mode) | filetype;
            info.ownerId = parseOctalString(th.uid);
            info.groupId = parseOctalString(th.gid);
        }

        _entry = new TarUnboxEntry(_input, info);

        _next = next512(_input.pos + info.size);
    }
}

static assert(isUnboxEntryRange!TarUnbox);

/// Data read from the header of a TAR entry
struct TarEntryInfo
{
    string path;
    string linkname;
    EntryType type;
    ulong size; // content size as stored in the header
    ulong entrySize; // header block + content rounded up to 512
    SysTime timeLastModified;
    uint attributes;

    version (Posix)
    {
        int ownerId;
        int groupId;
    }
}

// Entry of a TAR archive being extracted.
// The content is read from the shared cursor, so `byChunk` is only
// usable while the cursor is still at the start of the entry content.
private class TarUnboxEntry : UnboxEntry
{
    private Cursor _input;
    private size_t _start;
    private size_t _end;
    private TarEntryInfo _info;

    this(Cursor input, TarEntryInfo info)
    {
        _input = input;
        _start = input.pos;
        _end = _start + info.size;
        _info = info;
    }

    @property EntryMode mode()
    {
        return EntryMode.extraction;
    }

    @property string path()
    {
        return _info.path;
    }

    @property EntryType type()
    {
        return _info.type;
    }

    @property string linkname()
    {
        return _info.linkname;
    }

    @property size_t size()
    {
        return _info.size;
    }

    @property size_t entrySize()
    {
        return _info.entrySize;
    }

    @property SysTime timeLastModified()
    {
        return _info.timeLastModified;
    }

    @property uint attributes()
    {
        return _info.attributes;
    }

    version (Posix)
    {
        @property int ownerId()
        {
            return _info.ownerId;
        }

        @property int groupId()
        {
            return _info.groupId;
        }
    }

    // Returns the entry content by chunks of at most `chunkSize` bytes.
    // Throws if the cursor is not at the start of the entry content anymore.
    ByteRange byChunk(size_t chunkSize)
    {
        import std.range.interfaces : inputRangeObject;

        enforce(
            _input.pos == _start,
            "Data cursor has moved, this entry is not valid anymore"
        );
        return inputRangeObject(cursorByteRange(_input, _end - _input.pos, chunkSize));
    }
}

// Memory layout of a 512 bytes ustar header block
private struct TarHeader
{
    // dfmt off
    char [100]  name;       //   0    0
    char [8]    mode;       // 100   64
    char [8]    uid;        // 108   6C
    char [8]    gid;        // 116   74
    char [12]   size;       // 124   7C
    char [12]   mtime;      // 136   88
    char [8]    chksum;     // 148   94
    Typeflag    typeflag;   // 156   9C
    char [100]  linkname;   // 157   9D
    char [6]    magic;      // 257  101
    char [2]    version_;   // 263  107
    char [32]   uname;      // 265  109
    char [32]   gname;      // 297  129
    char [8]    devmajor;   // 329  149
    char [8]    devminor;   // 337  151
    char [155]  prefix;     // 345  159
    char [12]   padding;    // 500  1F4
    //dfmt on

    // Writes the header of `file` in the first 512 bytes of `block`
    // and returns the remaining part of `block`.
    // Throws if the path can't be stored in the name/prefix fields,
    // if a numeric value does not fit in its field,
    // or (Posix) if the user or group lookup reports an error.
    private static ubyte[] fillWith(ArchiveEntry file, ubyte[] block)
    in (block.length >= 512)
    {
        import std.algorithm : min;

        version (Posix)
        {
            char[512] buf;
        }

        block[0 .. 512] = 0;

        TarHeader* th = cast(TarHeader*)(&block[0]);

        // prefix and name
        // ustar readers rebuild the path as prefix ~ '/' ~ name, so a long path
        // must be split on a '/' (which is not stored in either field).
        const path = file.path;
        const split = ustarSplitIndex(path);
        if (split < 0)
        {
            th.name[0 .. path.length] = path;
        }
        else
        {
            th.prefix[0 .. split] = path[0 .. split];
            th.name[0 .. path.length - split - 1] = path[split + 1 .. $];
        }

        th.typeflag = toTypeflag(file.type);

        if (th.typeflag == Typeflag.symLink)
        {
            // NOTE(review): a link name longer than 99 chars is silently truncated
            const lname = file.linkname;
            const len = min(lname.length, th.linkname.length - 1);
            th.linkname[0 .. len] = lname[0 .. len];
        }

        version (Posix)
        {
            import core.sys.posix.grp;
            import core.sys.posix.pwd;
            import core.stdc.string : strlen;
            import std.conv : octal;

            const uid = file.ownerId;
            const gid = file.groupId;

            toOctalString(file.attributes & octal!7777, th.mode[0 .. $ - 1]);
            toOctalString(uid, th.uid[0 .. $ - 1]);
            toOctalString(gid, th.gid[0 .. $ - 1]);

            if (uid != 0)
            {
                passwd pwdbuf;
                passwd* pwd;
                enforce(getpwuid_r(uid, &pwdbuf, buf.ptr, buf.length, &pwd) == 0, "Could not read user name");
                // success with a null result means that the uid has no passwd entry:
                // the uname field is then left empty
                if (pwd !is null && pwd.pw_name !is null)
                {
                    // keep one byte for the NUL terminator
                    const urlen = min(strlen(pwd.pw_name), th.uname.length - 1);
                    th.uname[0 .. urlen] = pwd.pw_name[0 .. urlen];
                }
            }

            if (gid != 0)
            {
                group grpbuf;
                group* grp;
                enforce(getgrgid_r(gid, &grpbuf, buf.ptr, buf.length, &grp) == 0, "Could not read group name");
                // success with a null result means that the gid has no group entry:
                // the gname field is then left empty
                if (grp !is null && grp.gr_name !is null)
                {
                    // keep one byte for the NUL terminator
                    const grlen = min(strlen(grp.gr_name), th.gname.length - 1);
                    th.gname[0 .. grlen] = grp.gr_name[0 .. grlen];
                }
            }
        }
        else version (Windows)
        {
            // default to mode 644 which is the most common on UNIX
            th.mode[0 .. 7] = "0000644";

            // TODO: https://docs.microsoft.com/fr-fr/windows/win32/secauthz/finding-the-owner-of-a-file-object-in-c--
        }

        toOctalString(file.size, th.size[0 .. $ - 1]);
        const mtime = file.timeLastModified().toUnixTime!long();
        toOctalString(mtime, th.mtime[0 .. $ - 1]);

        th.magic = "ustar\0";
        th.version_ = "00";

        // must be computed last, once all other fields are written
        const chksum = th.unsignedChecksum();

        toOctalString(chksum, th.chksum[0 .. $ - 1]);

        return block[512 .. $];
    }

    // Sum of all header bytes (as unsigned), the checksum field itself
    // being counted as 8 spaces. The trailing padding is not included.
    private uint unsignedChecksum()
    {
        uint sum = 0;
        sum += unsignedSum(name);
        sum += unsignedSum(mode);
        sum += unsignedSum(uid);
        sum += unsignedSum(gid);
        sum += unsignedSum(size);
        sum += unsignedSum(mtime);
        sum += 32 * 8;
        sum += cast(uint) typeflag;
        sum += unsignedSum(linkname);
        sum += unsignedSum(magic);
        sum += unsignedSum(version_);
        sum += unsignedSum(uname);
        sum += unsignedSum(gname);
        sum += unsignedSum(devmajor);
        sum += unsignedSum(devminor);
        sum += unsignedSum(prefix);
        return sum;
    }
}

static assert(TarHeader.sizeof == 512);

private enum Typeflag : ubyte
{
    normalNul = 0,
    normal = '0',
    hardLink = '1',
    symLink = '2',
    charSpecial = '3',
    blockSpecial = '4',
    directory = '5',
    fifo = '6',
    contiguousFile = '7',
    posixExtended = 'g',
    extended = 'x',
}

Typeflag toTypeflag(EntryType type)
{
    final switch (type)
    {
    case EntryType.regular:
        return Typeflag.normal;
    case EntryType.directory:
        return Typeflag.directory;
    case EntryType.symlink:
        return Typeflag.symLink;
    }
}

EntryType toEntryType(Typeflag flag)
{
    switch (flag)
    {
    case Typeflag.directory:
        return EntryType.directory;
    case Typeflag.symLink:
        return EntryType.symlink;
    default:
        return EntryType.regular;
    }
}

version (Posix)
{
    // stat.st_mode part corresponding to file type
    uint posixModeFileType(Typeflag flag)
    {
        import std.conv : octal;
        import std.format : format;

        switch (flag)
        {
        case Typeflag.normalNul:
        case Typeflag.normal:
            return octal!100_000;
        case Typeflag.hardLink:
            // is regular file right for hard links?
            return octal!100_000;
        case Typeflag.symLink:
            return octal!120_000;
        case Typeflag.charSpecial:
            return octal!20_000;
        case Typeflag.blockSpecial:
            return octal!60_000;
        case Typeflag.directory:
            return octal!40_000;
        case Typeflag.fifo:
            return octal!10_000;
        case Typeflag.contiguousFile:
            // is regular file right for contiguous files?
            return octal!100_000;
        default:
            throw new Exception(format!"Unexpected Tar entry type: '%s'"(cast(char) flag));
        }
    }
}

// Index of the '/' at which `path` must be split to fit the ustar
// prefix (155 chars max) and name (100 chars max) fields.
// Returns -1 if the whole path fits in the name field.
// Throws if the path is too long and has no suitable '/' to split at.
private ptrdiff_t ustarSplitIndex(const(char)[] path)
{
    enum nameMax = 100;
    enum prefixMax = 155;

    const ptrdiff_t len = path.length;
    if (len <= nameMax)
        return -1;

    // with the split at index i: prefix = path[0 .. i], name = path[i + 1 .. $]
    // neither can be empty and both must fit in their field
    ptrdiff_t lo = len - 1 - nameMax;
    if (lo < 1)
        lo = 1;
    ptrdiff_t hi = len - 2;
    if (hi > prefixMax)
        hi = prefixMax;

    for (ptrdiff_t i = hi; i >= lo; --i)
    {
        if (path[i] == '/')
            return i;
    }

    throw new Exception("Path can't be stored in a TAR (ustar) header: " ~ path.idup);
}

@("ustarSplitIndex")
unittest
{
    import std.array : replicate;

    assert(ustarSplitIndex("some/short/path") == -1);

    const path = replicate("d", 60) ~ "/" ~ replicate("f", 60);
    assert(ustarSplitIndex(path) == 60);

    // single component longer than the name field
    assertThrown(ustarSplitIndex(replicate("x", 120)));
}

private uint unsignedSum(const(char)[] buf)
{
    uint sum;
    foreach (ubyte b; cast(const(ubyte)[]) buf)
    {
        sum += cast(uint) b;
    }
    return sum;
}

// Writes `val` in octal in `buf`, left-padded with zeros up to buf.length.
// Throws if val is negative or needs more than buf.length digits
// (sformat would otherwise raise a RangeError).
private void toOctalString(T)(T val, char[] buf)
{
    import std.conv : to;
    import std.format : sformat;

    // each octal digit carries 3 bits
    const bits = buf.length * 3;
    enforce(
        val >= 0 && (bits >= 64 || (cast(ulong) val >> bits) == 0),
        "Value " ~ val.to!string ~ " does not fit in a TAR header field of " ~
            buf.length.to!string ~ " octal digits"
    );

    sformat(buf, "%0*o", buf.length, val);
}

@("toOctalString")
unittest
{
    char[8] field = '\0';
    toOctalString(420u, field[0 .. $ - 1]);
    assert(field[] == "0000644\0");

    assertThrown(toOctalString(1 << 21, field[0 .. $ - 1]));
    assertThrown(toOctalString(-1, field[0 .. $ - 1]));
}

// Parses a numeric header field: octal digits, possibly padded with
// leading spaces and terminated by NUL and/or spaces.
// Returns 0 for a field without any content.
private T parseOctalString(T = uint)(const(char)[] octal)
{
    import std.conv : parse;

    // the value ends at the first NUL (or at the end of the field)
    size_t end = 0;
    while (end < octal.length && octal[end] != '\0')
        end++;

    size_t start = 0;
    while (start < end && octal[start] == ' ')
        start++;
    while (end > start && octal[end - 1] == ' ')
        end--;

    if (start == end)
        return 0;

    auto src = octal[start .. end];

    return parse!(T)(src, 8);
}

@("parseOctalString")
unittest
{
    assert(parseOctalString("0000644\0") == 420);
    assert(parseOctalString("012345\0 ") == 5349);
    assert(parseOctalString("    644 ") == 420);
    assert(parseOctalString("\0\0\0\0\0\0\0\0") == 0);
    assert(parseOctalString!ulong("77777777777\0") == 0x1_FFFF_FFFF);
}

// Returns the part of `chars` that precedes the first NUL,
// or the whole of `chars` if it does not contain any NUL
// (header fields are not necessarily NUL terminated).
private char[] parseString(char[] chars)
{
    foreach (i, c; chars)
    {
        if (c == '\0')
            return chars[0 .. i];
    }
    return chars;
}

@("parseString")
unittest
{
    char[8] buf = "abcdefgh";
    // must not read past the end of the field
    assert(parseString(buf[0 .. 4]) == "abcd");
    assert(parseString(buf[]) == "abcdefgh");
    buf[2] = '\0';
    assert(parseString(buf[]) == "ab");
}

// Rounds `off` up to the next multiple of 512
private size_t next512(size_t off)
{
    const rem = off % 512;
    if (rem == 0)
        return off;
    return off + 512 - rem;
}

@("next512")
unittest
{
    assert(next512(0) == 0);
    assert(next512(1) == 512);
    assert(next512(300) == 512);
    assert(next512(511) == 512);
    assert(next512(512) == 512);
    assert(next512(1024) == 1024);
    assert(next512(1025) == 1536);
    assert(next512(1225) == 1536);
    assert(next512(1535) == 1536);
    assert(next512(1536) == 1536);
}