1 module squiz_box.box.tar;
2 
3 import squiz_box.box;
4 import squiz_box.priv;
5 import squiz_box.squiz;
6 
7 import std.datetime.systime;
8 import std.exception;
9 import std.path;
10 import std.range;
11 
/// `BoxAlgo` implementation for plain, uncompressed ".tar" files
class TarAlgo : BoxAlgo
{
    /// Archives `entries` through `boxTar` (working with `chunkSize` bytes
    /// at a time) and wraps the result as a `ByteRange`.
    ByteRange box(BoxEntryRange entries, size_t chunkSize = defaultChunkSize)
    {
        return inputRangeObject(entries.boxTar(chunkSize));
    }

    /// Decodes `bytes` through `unboxTar` and wraps the resulting entries
    /// as an `UnboxEntryRange`.
    UnboxEntryRange unbox(ByteRange bytes)
    {
        return inputRangeObject(bytes.unboxTar());
    }
}
27 
/// `BoxAlgo` implementation for ".tar.gz" files
class TarGzAlgo : BoxAlgo
{
    /// Archives `entries` through `boxTarGz` (working with `chunkSize` bytes
    /// at a time) and wraps the result as a `ByteRange`.
    ByteRange box(BoxEntryRange entries, size_t chunkSize = defaultChunkSize)
    {
        return inputRangeObject(entries.boxTarGz(chunkSize));
    }

    /// Decodes `bytes` through `unboxTarGz` and wraps the resulting entries
    /// as an `UnboxEntryRange`.
    UnboxEntryRange unbox(ByteRange bytes)
    {
        return inputRangeObject(bytes.unboxTarGz());
    }
}
43 
version (HaveSquizBzip2)
{
    /// `BoxAlgo` implementation for ".tar.bz2" files
    /// (only compiled when `HaveSquizBzip2` is set)
    class TarBzip2Algo : BoxAlgo
    {
        /// Archives `entries` through `boxTarBzip2` (working with `chunkSize`
        /// bytes at a time) and wraps the result as a `ByteRange`.
        ByteRange box(BoxEntryRange entries, size_t chunkSize = defaultChunkSize)
        {
            return inputRangeObject(entries.boxTarBzip2(chunkSize));
        }

        /// Decodes `bytes` through `unboxTarBzip2` and wraps the resulting
        /// entries as an `UnboxEntryRange`.
        UnboxEntryRange unbox(ByteRange bytes)
        {
            return inputRangeObject(bytes.unboxTarBzip2());
        }
    }
}
62 
version (HaveSquizLzma)
{
    /// `BoxAlgo` implementation for ".tar.xz" files
    /// (only compiled when `HaveSquizLzma` is set)
    class TarXzAlgo : BoxAlgo
    {
        /// Archives `entries` through `boxTarXz` (working with `chunkSize`
        /// bytes at a time) and wraps the result as a `ByteRange`.
        ByteRange box(BoxEntryRange entries, size_t chunkSize = defaultChunkSize)
        {
            return inputRangeObject(entries.boxTarXz(chunkSize));
        }

        /// Decodes `bytes` through `unboxTarXz` and wraps the resulting
        /// entries as an `UnboxEntryRange`.
        UnboxEntryRange unbox(ByteRange bytes)
        {
            return inputRangeObject(bytes.unboxTarXz());
        }
    }
}
81 
/// Returns a `.tar`, `.tar.gz`, `.tar.bz2` or `.tar.xz` archive as a byte range
/// built lazily from the entries of the input range.
/// Params:
///   entries = range of entries to archive
///   chunkSize = size of the working buffer; must be a multiple of 512
///               and at least 512
auto boxTar(I)(I entries, size_t chunkSize = defaultChunkSize)
        if (isBoxEntryRange!I)
in (chunkSize >= 512 && chunkSize % 512 == 0)
{
    auto archive = TarBox!I(entries, chunkSize);
    return archive;
}
91 
/// Returns a `.tar.gz` archive as a byte range: the output of `boxTar`
/// piped through `deflateGz`, both working with `chunkSize`.
/// chunkSize must be a multiple of 512.
auto boxTarGz(I)(I entries, size_t chunkSize = defaultChunkSize)
{
    return entries.boxTar(chunkSize).deflateGz(chunkSize);
}
97 
version (HaveSquizBzip2)
{
    /// Returns a `.tar.bz2` archive as a byte range: the output of `boxTar`
    /// piped through `compressBzip2`, both working with `chunkSize`.
    /// chunkSize must be a multiple of 512.
    auto boxTarBzip2(I)(I entries, size_t chunkSize = defaultChunkSize)
    {
        return entries.boxTar(chunkSize).compressBzip2(chunkSize);
    }
}
106 
version (HaveSquizLzma)
{
    /// Returns a `.tar.xz` archive as a byte range: the output of `boxTar`
    /// piped through `compressXz`, both working with `chunkSize`.
    /// chunkSize must be a multiple of 512.
    auto boxTarXz(I)(I entries, size_t chunkSize = defaultChunkSize)
    {
        return entries.boxTar(chunkSize).compressXz(chunkSize);
    }
}
115 
/// Input range of byte chunks serializing a range of entries to the TAR format.
///
/// Each entry is written as one 512-byte header block followed by its data,
/// zero-padded to the next 512-byte boundary. Once every entry has been
/// written, 1024 zero bytes are emitted as the archive footer.
/// The slice returned by `front` aliases an internal buffer that is
/// overwritten by the next call to `popFront`.
private struct TarBox(I)
{
    // init data
    I entriesInput;
    ubyte[] buffer;

    // current chunk (front data)
    ubyte[] chunk; // data ready
    ubyte[] avail; // space available in buffer (after chunk)

    // current entry being processed
    BoxEntry entry;
    ByteRange entryChunks;

    // footer is two empty blocks
    size_t footer; // number of footer bytes emitted so far
    enum footerLen = 1024;

    /// Params:
    ///   entries = the entries to archive
    ///   chunkSize = size of the internal buffer, a non-zero multiple of 512
    /// Throws: Exception if `chunkSize` is zero or not a multiple of 512
    this(I entries, size_t chunkSize)
    {
        // The `in` contract of `boxTar` is compiled out in release builds, and
        // a zero-sized buffer can never hold a header block: reject it here too.
        enforce(
            chunkSize >= 512 && chunkSize % 512 == 0,
            "chunk size must be a non-zero multiple of 512"
        );
        entriesInput = entries;
        buffer = new ubyte[chunkSize];
        avail = buffer;
        // prime the range so that `front` is valid right away
        popFront();
    }

    /// Whether all entries and the footer have been consumed.
    @property bool empty()
    {
        // handle .init
        if (!buffer)
            return true;

        // more files to be processed
        if (!entriesInput.empty)
            return false;

        // current entry not exhausted
        if (hasEntryChunks())
            return false;

        // some unconsumed flying data
        if (chunk.length)
            return false;

        return true;
    }

    /// The current chunk of archive bytes (a slice of the internal buffer).
    @property ByteChunk front()
    {
        return chunk;
    }

    /// Produces the next chunk: entry blocks while there are any,
    /// then the footer, then nothing.
    void popFront()
    {
        if (!moreToRead())
        {
            if (footer >= footerLen)
            {
                // footer fully emitted: the range is now exhausted
                chunk = null;
            }
            else
            {
                import std.algorithm : min;

                // emit as much of the zero footer as fits in the buffer
                const len = min(buffer.length, footerLen - footer);
                buffer[0 .. len] = 0;
                chunk = buffer[0 .. len];
                footer += len;
            }
            return;
        }

        // pack as many blocks as possible in the buffer
        while (avail.length && moreToRead)
        {
            nextBlock();
            chunk = buffer[0 .. $ - avail.length];
        }
        // the next call starts again at the beginning of the buffer
        avail = buffer;
    }

    // whether the current entry still has data to be written
    private bool hasEntryChunks()
    {
        return entryChunks && !entryChunks.empty;
    }

    // whether entry headers or entry data remain to be written
    private bool moreToRead()
    {
        return !entriesInput.empty || hasEntryChunks();
    }

    // Writes one block at the start of `avail` and shrinks `avail` accordingly:
    // either the header of the next entry, or the next data chunk of the
    // current entry (padded with zeros after its last chunk).
    private void nextBlock()
    in (avail.length >= 512)
    {
        if (!entry || !hasEntryChunks())
        {
            // current entry is done (or there was none): start the next one
            enforce(!entriesInput.empty);
            entry = entriesInput.front;
            entriesInput.popFront();
            avail = TarHeader.fillWith(entry, avail);
            entryChunks = entry.byChunk(512);
        }
        else
        {
            auto filled = entryChunks.front;
            avail[0 .. filled.length] = filled;
            avail = avail[filled.length .. $];
            entryChunks.popFront();
            if (entryChunks.empty)
            {
                // The buffer length is a multiple of 512, so the remainder of
                // the available space is the distance to the next block boundary.
                const pad = avail.length % 512;
                avail[0 .. pad] = 0;
                avail = avail[pad .. $];
            }
        }
    }
}

static assert(isByteRange!(TarBox!(BoxEntry[])));
235 
/// Returns a range of entries from a `.tar`, `.tar.gz`, `.tar.bz2` or `.tar.xz` formatted byte range.
/// The input is wrapped in a `ByteRangeCursor` shared by the returned range
/// and the entries it yields.
auto unboxTar(I)(I input) if (isByteRange!I)
{
    return TarUnbox(new ByteRangeCursor!I(input));
}
242 
/// Returns a range of entries from a `.tar.gz` formatted byte range:
/// the input is passed through `inflateGz`, then `unboxTar`.
auto unboxTarGz(I)(I input)
{
    return unboxTar(inflateGz(input));
}
248 
version (HaveSquizBzip2)
{
    /// Returns a range of entries from a `.tar.bz2` formatted byte range:
    /// the input is passed through `decompressBzip2`, then `unboxTar`.
    auto unboxTarBzip2(I)(I input)
    {
        return unboxTar(decompressBzip2(input));
    }
}
257 
version (HaveSquizLzma)
{
    /// Returns a range of entries from a `.tar.xz` formatted byte range:
    /// the input is passed through `decompressXz`, then `unboxTar`.
    auto unboxTarXz(I)(I input)
    {
        return unboxTar(decompressXz(input));
    }
}
266 
/// Input range of `UnboxEntry` decoded from a TAR byte stream.
///
/// Entries share the stream cursor with this range: the data of an entry
/// can only be read while that entry is the front of the range.
private struct TarUnbox
{
    private Cursor _input;

    // state of the entry currently at the front
    private size_t _next; // stream position of the header following the current entry
    private ubyte[] _block; // scratch space receiving one 512-byte header block
    private UnboxEntry _entry;

    this(Cursor input)
    {
        _input = input;
        _block = new ubyte[512];

        // a zero-byte input is a valid (empty) tar file: nothing to decode
        if (_input.eoi)
            return;

        readHeaderBlock();
    }

    /// The range is exhausted once the cursor reached the end of input.
    @property bool empty()
    {
        return _input.eoi;
    }

    /// The entry described by the last decoded header.
    @property UnboxEntry front()
    {
        return _entry;
    }

    /// Moves to the next entry, skipping any unread data of the current one.
    void popFront()
    {
        assert(_input.pos <= _next);

        const here = _input.pos;
        if (here < _next)
        {
            // the consumer did not read the whole entry:
            // jump over the remainder, up to the next header
            _input.ffw(_next - here);
        }

        readHeaderBlock();
    }

    // Reads one header block and sets up `_entry` and `_next` from it.
    // Extended headers are skipped; an empty block drains the input.
    private void readHeaderBlock()
    {
        import std.conv : to;

        const got = _input.read(_block);
        enforce(got.length == 512, "Unexpected end of input");

        TarHeader* th = cast(TarHeader*) _block.ptr;

        const computed = th.unsignedChecksum();
        const checksum = parseOctalString(th.chksum);

        // A block of zeros indicates the end of the archive: its checksum field
        // reads as 0 and the computed sum only holds the 256 that
        // `unsignedChecksum` counts for the checksum field itself.
        const endOfArchive = computed == 256 && checksum == 0;
        if (endOfArchive)
        {
            // consume whatever follows so that `empty` becomes true
            while (!_input.eoi)
                _input.ffw(512);
            return;
        }

        if (checksum != computed)
        {
            // report the stream offset of the checksum field itself
            const where = _input.pos - 512 + th.chksum.offsetof;
            throw new Exception(
                "Invalid TAR checksum at 0x" ~ where.to!string(16) ~
                    "\nExpected " ~ computed.to!string ~ " but found " ~ checksum.to!string
            );
        }

        const isExtended = th.typeflag == Typeflag.posixExtended
            || th.typeflag == Typeflag.extended;
        if (isExtended)
        {
            // extended Tar headers are not interpreted: skip their payload
            // (rounded up to whole blocks) and decode the header that follows
            const payload = next512(parseOctalString!size_t(th.size));
            _input.ffw(payload);
            readHeaderBlock();
            return;
        }

        TarEntryInfo info;

        // NOTE(review): prefix and name are joined without a separator, which
        // matches how `TarHeader.fillWith` splits long paths; archives from
        // other tools may expect a '/' between them - TODO confirm
        const prefixPart = parseString(th.prefix);
        const namePart = parseString(th.name);
        info.path = (prefixPart ~ namePart).idup;

        info.type = toEntryType(th.typeflag);
        info.linkname = parseString(th.linkname).idup;
        info.size = parseOctalString!size_t(th.size);
        // header block + data rounded up to whole blocks
        info.entrySize = 512 + next512(info.size);

        const mtime = parseOctalString!ulong(th.mtime);
        info.timeLastModified = SysTime(unixTimeToStdTime(mtime));

        version (Posix)
        {
            // tar mode contains stat.st_mode & 07777.
            // the flags corresponding to the file type have to be added
            // (and by no way tar mode is meaningful on Windows)
            const typeBits = posixModeFileType(th.typeflag);
            info.attributes = parseOctalString(th.mode) | typeBits;
            info.ownerId = parseOctalString(th.uid);
            info.groupId = parseOctalString(th.gid);
        }

        // the entry records the current cursor position as the start of its data
        _entry = new TarUnboxEntry(_input, info);

        _next = next512(_input.pos + info.size);
    }
}

static assert(isUnboxEntryRange!TarUnbox);
375 
/// Metadata of one archive entry, as decoded from its TAR header.
struct TarEntryInfo
{
    // entry path and, for links, the name stored in the header's link field
    string path, linkname;
    EntryType type;
    // `size` is the byte count of the entry data;
    // `entrySize` adds the header block and the padding to whole 512-byte blocks
    ulong size, entrySize;
    SysTime timeLastModified;
    // on Posix: permission bits of the header combined with the file type bits
    uint attributes;

    version (Posix)
    {
        // numeric user and group ids stored in the header
        int ownerId, groupId;
    }
}
392 
/// `UnboxEntry` backed by a TAR stream cursor.
///
/// The entry remembers where its data starts in the stream; the data can
/// only be read while the cursor is still at that position.
private class TarUnboxEntry : UnboxEntry
{
    import std.stdio : File;

    private Cursor _input;
    private size_t _start; // cursor position at construction (start of data)
    private size_t _end; // position right after the last data byte
    private TarEntryInfo _info;

    /// Params:
    ///   input = cursor positioned at the first data byte of the entry
    ///   info = metadata decoded from the entry header
    this(Cursor input, TarEntryInfo info)
    {
        _info = info;
        _input = input;
        _start = input.pos;
        _end = _start + info.size;
    }

    /// Always `EntryMode.extraction`.
    @property EntryMode mode()
    {
        return EntryMode.extraction;
    }

    /// Path recorded for this entry.
    @property string path()
    {
        return _info.path;
    }

    /// Kind of entry.
    @property EntryType type()
    {
        return _info.type;
    }

    /// Link name recorded in the header.
    @property string linkname()
    {
        return _info.linkname;
    }

    /// Number of data bytes of the entry.
    @property size_t size()
    {
        return _info.size;
    }

    /// Number of bytes the entry occupies in the archive (header and padding included).
    @property size_t entrySize()
    {
        return _info.entrySize;
    }

    /// Modification time recorded in the header.
    @property SysTime timeLastModified()
    {
        return _info.timeLastModified;
    }

    /// Attributes recorded for this entry.
    @property uint attributes()
    {
        return _info.attributes;
    }

    version (Posix)
    {
        /// Numeric user id recorded in the header.
        @property int ownerId()
        {
            return _info.ownerId;
        }

        /// Numeric group id recorded in the header.
        @property int groupId()
        {
            return _info.groupId;
        }
    }

    /// Returns the entry data as a range of chunks of at most `chunkSize` bytes.
    /// Throws: Exception if the cursor is no longer at the start of the data.
    ByteRange byChunk(size_t chunkSize)
    {
        import std.range.interfaces : inputRangeObject;

        enforce(
            _input.pos == _start,
            "Data cursor has moved, this entry is not valid anymore"
        );

        const remaining = _end - _input.pos;
        auto chunks = cursorByteRange(_input, remaining, chunkSize);
        return inputRangeObject(chunks);
    }
}
474 
/// Layout of one 512-byte TAR header block (ustar magic and version).
/// Numeric fields hold zero-padded octal text.
private struct TarHeader
{
    // dfmt off
    char [100]  name;       //   0    0
    char [8]    mode;       // 100   64
    char [8]    uid;        // 108   6C
    char [8]    gid;        // 116   74
    char [12]   size;       // 124   7C
    char [12]   mtime;      // 136   88
    char [8]    chksum;     // 148   94
    Typeflag    typeflag;   // 156   9C
    char [100]  linkname;   // 157   9D
    char [6]    magic;      // 257  101
    char [2]    version_;   // 263  107
    char [32]   uname;      // 265  109
    char [32]   gname;      // 297  129
    char [8]    devmajor;   // 329  149
    char [8]    devminor;   // 337  151
    char [155]  prefix;     // 345  159
    char [12]   padding;    // 500  1F4
    //dfmt on

    /// Writes the header describing `file` in the first 512 bytes of `block`.
    /// Params:
    ///   file = the entry to describe
    ///   block = destination buffer, at least 512 bytes long
    /// Returns: the part of `block` located after the header
    /// Throws: Exception if the path does not fit in the header, if a numeric
    ///         field cannot be formatted, or (Posix) if the user or group
    ///         database lookup fails
    private static ubyte[] fillWith(ArchiveEntry file, ubyte[] block)
    in (block.length >= 512)
    {
        import std.algorithm : min;
        import std.string : toStringz;

        version (Posix)
        {
            // scratch buffer for the reentrant passwd/group lookups
            char[512] buf;
        }

        block[0 .. 512] = 0;

        TarHeader* th = cast(TarHeader*)(&block[0]);

        // prefix and name
        const name = file.path;
        // Without this check a longer path would overflow the prefix field
        // (a RangeError, or silent corruption when bounds checks are off).
        enforce(
            name.length <= th.name.length + th.prefix.length,
            "Path is too long to fit in a TAR header (255 bytes max): " ~ name
        );
        // NOTE(review): the path is split at a fixed offset and not at a '/',
        // which mirrors how the reader joins prefix and name; other tools may
        // expect a '/' separated split - TODO confirm
        const prefLen = name.length > 100 ? cast(ptrdiff_t) name.length - 100 : 0;
        if (prefLen)
            th.prefix[0 .. prefLen] = name[0 .. prefLen];
        th.name[0 .. name.length - prefLen] = name[prefLen .. $];

        th.typeflag = toTypeflag(file.type);

        if (th.typeflag == Typeflag.symLink)
        {
            // keep the last byte of the field as NUL terminator
            const lname = file.linkname;
            const len = min(lname.length, cast(ptrdiff_t) th.linkname.length - 1);
            th.linkname[0 .. len] = lname[0 .. len];
        }

        version (Posix)
        {
            import core.sys.posix.grp;
            import core.sys.posix.pwd;
            import core.stdc.string : strlen;
            import std.conv : octal;

            const uid = file.ownerId;
            const gid = file.groupId;

            // last byte of each numeric field is left as NUL terminator
            toOctalString(file.attributes & octal!7777, th.mode[0 .. $ - 1]);
            toOctalString(uid, th.uid[0 .. $ - 1]);
            toOctalString(gid, th.gid[0 .. $ - 1]);

            if (uid != 0)
            {
                passwd pwdbuf;
                passwd* pwd;
                enforce(getpwuid_r(uid, &pwdbuf, buf.ptr, buf.length, &pwd) == 0, "Could not read user name");
                // getpwuid_r succeeds with a null result when the uid has no
                // entry in the user database: the name is then left empty
                if (pwd !is null && pwd.pw_name !is null)
                {
                    const urlen = min(strlen(pwd.pw_name), th.uname.length);
                    th.uname[0 .. urlen] = pwd.pw_name[0 .. urlen];
                }
            }

            if (gid != 0)
            {
                group grpbuf;
                group* grp;
                enforce(getgrgid_r(gid, &grpbuf, buf.ptr, buf.length, &grp) == 0, "Could not read group name");
                // same as above: no entry for this gid yields a null result
                if (grp !is null && grp.gr_name !is null)
                {
                    const grlen = min(strlen(grp.gr_name), th.gname.length);
                    th.gname[0 .. grlen] = grp.gr_name[0 .. grlen];
                }
            }
        }
        else version (Windows)
        {
            // default to mode 644 which is the most common on UNIX
            th.mode[0 .. 7] = "0000644";

            // TODO: https://docs.microsoft.com/fr-fr/windows/win32/secauthz/finding-the-owner-of-a-file-object-in-c--
        }

        toOctalString(file.size, th.size[0 .. $ - 1]);
        const mtime = file.timeLastModified().toUnixTime!long();
        toOctalString(mtime, th.mtime[0 .. $ - 1]);

        th.magic = "ustar\0";
        th.version_ = "00";

        // the checksum must be computed last, once all other fields are set
        const chksum = th.unsignedChecksum();

        toOctalString(chksum, th.chksum[0 .. $ - 1]);

        return block[512 .. $];
    }

    /// Sum of the header bytes, each taken as unsigned. The 8 bytes of the
    /// checksum field count as spaces (32 each) whatever they contain, and
    /// the `padding` field is not included.
    private uint unsignedChecksum()
    {
        uint sum = 0;
        sum += unsignedSum(name);
        sum += unsignedSum(mode);
        sum += unsignedSum(uid);
        sum += unsignedSum(gid);
        sum += unsignedSum(size);
        sum += unsignedSum(mtime);
        sum += 32 * 8; // chksum field counted as 8 spaces
        sum += cast(uint) typeflag;
        sum += unsignedSum(linkname);
        sum += unsignedSum(magic);
        sum += unsignedSum(version_);
        sum += unsignedSum(uname);
        sum += unsignedSum(gname);
        sum += unsignedSum(devmajor);
        sum += unsignedSum(devminor);
        sum += unsignedSum(prefix);
        return sum;
    }
}

static assert(TarHeader.sizeof == 512);
606 
/// Values of the `typeflag` byte of a TAR header.
private enum Typeflag : ubyte
{
    normalNul = '\0', // handled like `normal` when decoding
    normal = '0',
    hardLink = '1',
    symLink = '2',
    charSpecial = '3',
    blockSpecial = '4',
    directory = '5',
    fifo = '6',
    contiguousFile = '7',
    posixExtended = 'g', // extended header, skipped when decoding
    extended = 'x', // extended header, skipped when decoding
}
621 
/// Maps an entry type to the TAR type flag written in its header.
Typeflag toTypeflag(EntryType type)
{
    Typeflag flag;
    final switch (type)
    {
    case EntryType.regular:
        flag = Typeflag.normal;
        break;
    case EntryType.directory:
        flag = Typeflag.directory;
        break;
    case EntryType.symlink:
        flag = Typeflag.symLink;
        break;
    }
    return flag;
}
634 
/// Maps a TAR type flag to an entry type. Every flag other than
/// directory and symbolic link is reported as a regular file.
EntryType toEntryType(Typeflag flag)
{
    if (flag == Typeflag.directory)
        return EntryType.directory;

    if (flag == Typeflag.symLink)
        return EntryType.symlink;

    return EntryType.regular;
}
647 
version (Posix)
{
    /// Returns the file type bits of `stat.st_mode` matching a TAR type flag.
    /// Throws: Exception for flags that do not map to a file type
    uint posixModeFileType(Typeflag flag)
    {
        import std.conv : octal;
        import std.format : format;

        switch (flag)
        {
        // is regular file right for hard links and contiguous files?
        case Typeflag.normalNul, Typeflag.normal,
            Typeflag.hardLink, Typeflag.contiguousFile:
            return octal!100_000;

        case Typeflag.symLink:
            return octal!120_000;

        case Typeflag.charSpecial:
            return octal!20_000;

        case Typeflag.blockSpecial:
            return octal!60_000;

        case Typeflag.directory:
            return octal!40_000;

        case Typeflag.fifo:
            return octal!10_000;

        default:
            throw new Exception(format!"Unexpected Tar entry type: '%s'"(cast(char) flag));
        }
    }
}
682 
/// Adds up the bytes of `buf`, each one taken as an unsigned value (0 to 255).
private uint unsignedSum(const(char)[] buf)
{
    uint total = 0;
    for (size_t i = 0; i < buf.length; ++i)
    {
        const ubyte value = cast(ubyte) buf[i];
        total += value;
    }
    return total;
}
692 
/// Writes `val` in octal in `buf`, left-padded with '0' to fill `buf` entirely.
/// Params:
///   val = the value to format
///   buf = the destination field (terminator excluded)
/// Throws: Exception if the octal form of `val` has more digits than `buf`
///         can hold (formatting straight into `buf` would raise a RangeError
///         in that case, e.g. for user ids above 0o7777777 in a 7-digit field)
private void toOctalString(T)(T val, char[] buf)
{
    import std.exception : enforce;
    import std.format : format, sformat;

    // a 64-bit value needs at most 22 octal digits
    char[32] tmp;
    const digits = sformat(tmp[], "%o", val);

    enforce(
        digits.length <= buf.length,
        format!"Value %s needs %s octal digits but the TAR field only holds %s"(
            val, digits.length, buf.length)
    );

    const pad = buf.length - digits.length;
    buf[0 .. pad] = '0';
    buf[pad .. $] = digits;
}
699 
/// Parses the octal text of a TAR numeric field.
///
/// Leading spaces and trailing spaces or NULs are padding and are ignored,
/// so space-padded fields written by some tools are accepted as well as
/// zero-padded ones.
/// Params:
///   octal = raw content of the field
/// Returns: the parsed value, or 0 if the field contains only padding
/// Throws: ConvException if the field does not start with an octal digit
///         once the padding is removed
private T parseOctalString(T = uint)(const(char)[] octal)
{
    import std.conv : parse;

    auto src = octal;

    // drop leading padding
    while (src.length && src[0] == ' ')
        src = src[1 .. $];

    // drop trailing terminators and padding
    while (src.length && (src[$ - 1] == '\0' || src[$ - 1] == ' '))
        src = src[0 .. $ - 1];

    if (!src.length)
        return 0;

    // parse stops at the first character that is not an octal digit
    return parse!(T)(src, 8);
}
715 
/// Returns the part of `chars` located before its first NUL character,
/// or the whole of `chars` if it contains none.
///
/// The search is bounded by the length of `chars`: TAR text fields that are
/// completely filled have no terminator, and an unbounded `strlen` would
/// then read into the fields that follow.
private char[] parseString(char[] chars)
{
    foreach (i, c; chars)
    {
        if (c == '\0')
            return chars[0 .. i];
    }
    return chars;
}
723 
/// Rounds `off` up to the next multiple of 512
/// (`off` itself if it is already a multiple).
private size_t next512(size_t off)
{
    const beyond = off & 511; // bytes past the last block boundary
    return beyond ? off + (512 - beyond) : off;
}
731 
@("next512")
unittest
{
    // offsets and the block boundary each one is expected to round up to
    immutable size_t[] offsets = [0, 1, 300, 511, 512, 1024, 1025, 1225, 1535, 1536];
    immutable size_t[] rounded = [0, 512, 512, 512, 512, 1024, 1536, 1536, 1536, 1536];

    foreach (i, off; offsets)
    {
        assert(next512(off) == rounded[i]);
    }
}