1 module squiz_box.box.tar;
2 
3 import squiz_box.box;
4 import squiz_box.priv;
5 import squiz_box.squiz;
6 
7 import std.datetime.systime;
8 import std.exception;
9 import std.path;
10 import std.range.primitives;
11 
/// Build a Tar archive from a range of entries.
///
/// The result is a lazy byte range: the archive bytes are produced chunk
/// by chunk as the returned range is iterated.
///
/// Params:
///   entries   = input range of entries to store in the archive
///   chunkSize = size of the chunks yielded by the returned range;
///               must be a non-zero multiple of 512
auto createTarArchive(I)(I entries, size_t chunkSize = defaultChunkSize)
        if (isCreateEntryRange!I)
in (chunkSize >= 512 && chunkSize % 512 == 0)
{
    auto archive = TarArchiveCreate!I(entries, chunkSize);
    return archive;
}
21 
/// Byte range producing the content of a Tar archive from a range of entries.
///
/// Each call to `popFront` refills an internal buffer of `chunkSize` bytes with
/// as many 512-byte blocks (entry headers and entry data) as fit, and exposes
/// the filled part through `front`. Once all entries are written, 1024 zero
/// bytes (the footer) are emitted.
private struct TarArchiveCreate(I)
{
    // init data
    I entriesInput;
    ubyte[] buffer;

    // current chunk (front data)
    ubyte[] chunk; // data ready
    ubyte[] avail; // space available in buffer (after chunk)

    // current entry being processed
    ArchiveCreateEntry entry;
    ByteRange entryChunks;

    // footer is two empty blocks
    size_t footer; // number of footer bytes emitted so far
    enum footerLen = 1024;

    /// Allocates the working buffer and prepares the first chunk.
    /// Throws if `chunkSize` is not a multiple of 512.
    this(I entries, size_t chunkSize)
    {
        enforce(chunkSize % 512 == 0, "chunk size must be a multiple of 512");
        entriesInput = entries;
        buffer = new ubyte[chunkSize];
        avail = buffer;
        // prime the range so that `front` is valid right after construction
        popFront();
    }

    /// True once entries, entry data and the pending chunk (footer included)
    /// have all been consumed.
    @property bool empty()
    {
        // handle .init
        if (!buffer)
            return true;

        // more files to be processed
        if (!entriesInput.empty)
            return false;

        // current entry not exhausted
        if (hasEntryChunks())
            return false;

        // some unconsumed flying data
        if (chunk.length)
            return false;

        return true;
    }

    /// The bytes produced by the last call to `popFront`.
    /// This is a slice of the internal buffer, which is overwritten by the
    /// next `popFront`.
    @property ByteChunk front()
    {
        return chunk;
    }

    /// Produces the next chunk: entry blocks while there is something left to
    /// read, then the zero-filled footer, then nothing.
    void popFront()
    {
        if (!moreToRead())
        {
            if (footer >= footerLen)
            {
                // footer fully emitted: clearing chunk makes `empty` return true
                chunk = null;
            }
            else
            {
                import std.algorithm : min;

                // emit as much of the remaining footer as fits in the buffer
                const len = min(buffer.length, footerLen - footer);
                buffer[0 .. len] = 0;
                chunk = buffer[0 .. len];
                footer += len;
            }
            return;
        }

        // fill the buffer block by block until it is full or input is exhausted
        while (avail.length && moreToRead)
        {
            nextBlock();
            // chunk is the part of buffer that precedes the available space
            chunk = buffer[0 .. $ - avail.length];
        }
        // next call starts writing from the beginning of the buffer again
        avail = buffer;
    }

    /// Whether the current entry still has data chunks to write.
    private bool hasEntryChunks()
    {
        return entryChunks && !entryChunks.empty;
    }

    /// Whether an entry header or entry data remains to be written.
    private bool moreToRead()
    {
        return !entriesInput.empty || hasEntryChunks();
    }

    /// Writes one block at the start of `avail` and shrinks `avail` accordingly:
    /// either the header of the next entry, or the next data chunk of the
    /// current entry.
    private void nextBlock()
    in (avail.length >= 512)
    {
        if (!entry || !hasEntryChunks())
        {
            // start a new entry: write its header and open its data range
            enforce(!entriesInput.empty);
            entry = entriesInput.front;
            entriesInput.popFront();
            avail = TarHeader.fillWith(entry, avail);
            entryChunks = entry.byChunk(512);
        }
        else
        {
            auto filled = entryChunks.front;
            avail[0 .. filled.length] = filled;
            avail = avail[filled.length .. $];
            entryChunks.popFront();
            if (entryChunks.empty)
            {
                // last chunk of the entry: zero-pad up to the next 512 boundary.
                // buffer.length is a multiple of 512 and avail is its tail, so
                // avail.length % 512 is the distance to that boundary.
                const pad = avail.length % 512;
                avail[0 .. pad] = 0;
                avail = avail[pad .. $];
            }
        }
    }
}
139 
140 static assert(isByteRange!(TarArchiveCreate!(ArchiveCreateEntry[])));
141 
/// Read a Tar formatted byte range and return the entries it contains
/// as a range of extraction entries.
///
/// Params:
///   tarInput = byte range containing the Tar archive data
auto readTarArchive(I)(I tarInput) if (isByteRange!I)
{
    return ArchiveTarRead(new ByteRangeCursor!I(tarInput));
}
148 
/// Input range of the entries found in a Tar archive.
///
/// Entries are decoded lazily from the cursor: each `popFront` skips whatever
/// remains of the current entry data and decodes the next 512-byte header.
private struct ArchiveTarRead
{
    private Cursor _input;

    // current header data
    private size_t _next; // cursor position of the next header block
    private ubyte[] _block; // storage for the header block being decoded
    private ArchiveExtractEntry _entry;

    /// Starts reading from `input` and decodes the first header, if any.
    this(Cursor input)
    {
        _input = input;
        _block = new ubyte[512];

        // file with zero bytes is a valid tar file
        if (!_input.eoi)
            readHeaderBlock();
    }

    /// True when the input is exhausted.
    /// Reading the end-of-archive marker consumes the input up to its end.
    @property bool empty()
    {
        return _input.eoi;
    }

    /// The entry described by the last decoded header.
    @property ArchiveExtractEntry front()
    {
        return _entry;
    }

    /// Moves to the next entry of the archive.
    void popFront()
    {
        assert(_input.pos <= _next);

        if (_input.pos < _next)
        {
            // the current entry was not fully read, we move the stream forward
            // up to the next header
            const dist = _next - _input.pos;
            _input.ffw(dist);
        }
        readHeaderBlock();
    }

    /// Reads and decodes one header block, and sets `_entry` and `_next`.
    /// Throws if the input ends before a full block or if the checksum
    /// stored in the header does not match the computed one.
    private void readHeaderBlock()
    {
        import std.conv : to;

        enforce(_input.read(_block).length == 512, "Unexpected end of input");

        TarHeader* th = cast(TarHeader*) _block.ptr;

        const computed = th.unsignedChecksum();
        const checksum = parseOctalString(th.chksum);

        if (computed == 256 && checksum == 0)
        {
            // this is an empty header (only zeros)
            // indicates end of archive

            while (!_input.eoi)
            {
                _input.ffw(512);
            }
            return;
        }

        enforce(
            checksum == computed,
            "Invalid TAR checksum at 0x" ~ (
                _input.pos - 512 + th.chksum.offsetof)
                .to!string(16) ~
                "\nExpected " ~ computed.to!string ~ " but found " ~ checksum.to!string,
        );

        TarEntryInfo info;

        // In the ustar format, a non-empty prefix is joined to name with a '/'
        // (the separator itself is not stored in either field).
        const prefix = parseString(th.prefix);
        const name = parseString(th.name);
        if (prefix.length == 0)
            info.path = name.idup;
        else if (prefix[$ - 1] == '/')
            // tolerate a prefix that already carries its separator
            info.path = (prefix ~ name).idup;
        else
            info.path = (prefix ~ '/' ~ name).idup;

        info.type = toEntryType(th.typeflag);
        info.linkname = parseString(th.linkname).idup;
        info.size = parseOctalString!size_t(th.size);
        // one header block + data rounded up to a whole number of blocks
        info.entrySize = 512 + next512(info.size);
        info.timeLastModified = SysTime(unixTimeToStdTime(parseOctalString!ulong(th.mtime)));
        version (Posix)
        {
            // tar mode contains stat.st_mode & 07777.
            // we have to add the missing flags corresponding to file type
            // (and by no way tar mode is meaningful on Windows)
            const filetype = posixModeFileType(th.typeflag);
            info.attributes = parseOctalString(th.mode) | filetype;
            info.ownerId = parseOctalString(th.uid);
            info.groupId = parseOctalString(th.gid);
        }

        _entry = new ArchiveTarExtractEntry(_input, info);

        // the next header starts after the data, padded to a 512-byte boundary
        _next = next512(_input.pos + info.size);
    }
}
246 
247 static assert(isExtractEntryRange!ArchiveTarRead);
248 
/// Metadata of one archive entry, as decoded from a Tar header block.
struct TarEntryInfo
{
    // path of the entry, built from the header prefix and name fields
    string path;
    // content of the header linkname field
    string linkname;
    // entry type derived from the header typeflag
    EntryType type;
    // size of the entry data in bytes (header size field)
    ulong size;
    // size occupied in the archive: 512-byte header + data rounded up to 512
    ulong entrySize;
    // modification time decoded from the header mtime field (Unix time)
    SysTime timeLastModified;
    // header mode bits combined with the file-type bits of the typeflag;
    // the reader in this module only assigns it on Posix
    uint attributes;

    version (Posix)
    {
        // numeric owner and group ids (header uid and gid fields)
        int ownerId;
        int groupId;
    }
}
265 
/// Extraction entry backed by a Tar archive cursor.
///
/// The entry data is read straight from the shared cursor, so the data is
/// only reachable while the cursor still sits at the start of the entry.
private class ArchiveTarExtractEntry : ArchiveExtractEntry
{
    import std.stdio : File;

    private Cursor _input;
    private size_t _start;
    private size_t _end;
    private TarEntryInfo _info;

    /// Binds the entry to `input`, whose current position must be the first
    /// byte of the entry data.
    this(Cursor input, TarEntryInfo info)
    {
        const size_t begin = input.pos;

        _info = info;
        _input = input;
        _start = begin;
        _end = begin + info.size;
    }

    /// Always `EntryMode.extraction`.
    @property EntryMode mode()
    {
        return EntryMode.extraction;
    }

    /// Path of the entry within the archive.
    @property string path()
    {
        return _info.path;
    }

    /// Type of the entry.
    @property EntryType type()
    {
        return _info.type;
    }

    /// Link name stored in the entry header.
    @property string linkname()
    {
        return _info.linkname;
    }

    /// Size in bytes of the entry data.
    @property size_t size()
    {
        return _info.size;
    }

    /// Size in bytes occupied by the entry in the archive.
    @property size_t entrySize()
    {
        return _info.entrySize;
    }

    /// Last modification time of the entry.
    @property SysTime timeLastModified()
    {
        return _info.timeLastModified;
    }

    /// File attributes of the entry.
    @property uint attributes()
    {
        return _info.attributes;
    }

    version (Posix)
    {
        /// Numeric id of the owner.
        @property int ownerId()
        {
            return _info.ownerId;
        }

        /// Numeric id of the group.
        @property int groupId()
        {
            return _info.groupId;
        }
    }

    /// Returns the entry data as a range of chunks of `chunkSize` bytes.
    /// Throws if the cursor is no longer at the start of the entry data.
    ByteRange byChunk(size_t chunkSize)
    {
        import std.range.interfaces : inputRangeObject;

        const untouched = _input.pos == _start;
        enforce(untouched, "Data cursor has moved, this entry is not valid anymore");

        const remaining = _end - _input.pos;
        auto chunks = cursorByteRange(_input, remaining, chunkSize);
        return inputRangeObject(chunks);
    }
}
347 
/// Memory layout of a 512-byte ustar header block.
/// The trailing comments give the offset of each field (decimal, then hex).
private struct TarHeader
{
    // dfmt off
    char [100]  name;       //   0    0
    char [8]    mode;       // 100   64
    char [8]    uid;        // 108   6C
    char [8]    gid;        // 116   74
    char [12]   size;       // 124   7C
    char [12]   mtime;      // 136   88
    char [8]    chksum;     // 148   94
    Typeflag    typeflag;   // 156   9C
    char [100]  linkname;   // 157   9D
    char [6]    magic;      // 257  101
    char [2]    version_;   // 263  107
    char [32]   uname;      // 265  109
    char [32]   gname;      // 297  129
    char [8]    devmajor;   // 329  149
    char [8]    devminor;   // 337  151
    char [155]  prefix;     // 345  159
    char [12]   padding;    // 500  1F4
    //dfmt on

    /// Writes the header describing `file` in the first 512 bytes of `block`.
    ///
    /// Params:
    ///   file  = the entry to describe
    ///   block = destination buffer, at least 512 bytes long
    /// Returns: the part of `block` that follows the header.
    /// Throws: if the path of `file` cannot be stored in the name and prefix
    ///   fields, or (Posix) if the user or group database lookup fails.
    private static ubyte[] fillWith(ArchiveEntry file, ubyte[] block)
    in (block.length >= 512)
    {
        import std.algorithm : max, min;

        version (Posix)
        {
            // scratch space for getpwuid_r / getgrgid_r
            char[512] buf;
        }

        block[0 .. 512] = 0;

        TarHeader* th = cast(TarHeader*)(&block[0]);

        // prefix and name
        const name = file.path;
        if (name.length <= th.name.length)
        {
            // the name field does not need a terminator when it is full
            th.name[0 .. name.length] = name;
        }
        else
        {
            // ustar readers rebuild the path as prefix ~ '/' ~ name, so the
            // path can only be split at a '/', which is stored in neither
            // field. Look for a separator at index i such that:
            //  - the prefix (name[0 .. i]) is not empty and fits in 155 bytes
            //  - the rest (name[i + 1 .. $]) is not empty and fits in 100 bytes
            const size_t lo = max(name.length - th.name.length - 1, size_t(1));
            const size_t hi = min(th.prefix.length, name.length - 2);

            size_t split = 0; // 0 means "no usable separator found"
            foreach (i; lo .. hi + 1)
            {
                if (name[i] == '/')
                {
                    split = i;
                    break;
                }
            }
            enforce(split != 0, "Path is too long to be stored in a ustar header: " ~ name);

            th.prefix[0 .. split] = name[0 .. split];
            th.name[0 .. name.length - split - 1] = name[split + 1 .. $];
        }

        th.typeflag = toTypeflag(file.type);

        if (th.typeflag == Typeflag.symLink)
        {
            const lname = file.linkname;
            // keep the last byte of the field as NUL terminator
            const len = min(lname.length, cast(ptrdiff_t) th.linkname.length - 1);
            th.linkname[0 .. len] = lname[0 .. len];
        }

        version (Posix)
        {
            import core.sys.posix.grp;
            import core.sys.posix.pwd;
            import core.stdc.string : strlen;
            import std.conv : octal;

            const uid = file.ownerId;
            const gid = file.groupId;

            // numeric fields are zero-padded octal followed by a NUL
            toOctalString(file.attributes & octal!7777, th.mode[0 .. $ - 1]);
            toOctalString(uid, th.uid[0 .. $ - 1]);
            toOctalString(gid, th.gid[0 .. $ - 1]);

            if (uid != 0)
            {
                passwd pwdbuf;
                passwd* pwd;
                enforce(getpwuid_r(uid, &pwdbuf, buf.ptr, buf.length, &pwd) == 0, "Could not read user name");
                // getpwuid_r returns 0 with a null result when the uid has no
                // entry in the user database: leave uname empty in that case
                if (pwd !is null && pwd.pw_name !is null)
                {
                    // keep the last byte of the field as NUL terminator
                    const urlen = min(strlen(pwd.pw_name), th.uname.length - 1);
                    th.uname[0 .. urlen] = pwd.pw_name[0 .. urlen];
                }
            }

            if (gid != 0)
            {
                group grpbuf;
                group* grp;
                enforce(getgrgid_r(gid, &grpbuf, buf.ptr, buf.length, &grp) == 0, "Could not read group name");
                // same as above: an unknown gid yields 0 and a null result
                if (grp !is null && grp.gr_name !is null)
                {
                    const grlen = min(strlen(grp.gr_name), th.gname.length - 1);
                    th.gname[0 .. grlen] = grp.gr_name[0 .. grlen];
                }
            }
        }
        else version (Windows)
        {
            // default to mode 644 which is the most common on UNIX
            th.mode[0 .. 7] = "0000644";

            // TODO: https://docs.microsoft.com/fr-fr/windows/win32/secauthz/finding-the-owner-of-a-file-object-in-c--
        }

        toOctalString(file.size, th.size[0 .. $ - 1]);
        const mtime = file.timeLastModified().toUnixTime!long();
        toOctalString(mtime, th.mtime[0 .. $ - 1]);

        th.magic = "ustar\0";
        th.version_ = "00";

        // the checksum must be computed last, once all other fields are set
        const chksum = th.unsignedChecksum();

        toOctalString(chksum, th.chksum[0 .. $ - 1]);

        return block[512 .. $];
    }

    /// Sum of all header bytes taken as unsigned values, the 8 bytes of the
    /// chksum field being counted as spaces (value 32).
    /// The padding field is not included in the sum.
    private uint unsignedChecksum()
    {
        uint sum = 0;
        sum += unsignedSum(name);
        sum += unsignedSum(mode);
        sum += unsignedSum(uid);
        sum += unsignedSum(gid);
        sum += unsignedSum(size);
        sum += unsignedSum(mtime);
        sum += 32 * 8; // chksum field counted as 8 spaces
        sum += cast(uint) typeflag;
        sum += unsignedSum(linkname);
        sum += unsignedSum(magic);
        sum += unsignedSum(version_);
        sum += unsignedSum(uname);
        sum += unsignedSum(gname);
        sum += unsignedSum(devmajor);
        sum += unsignedSum(devminor);
        sum += unsignedSum(prefix);
        return sum;
    }
}
477 
478 static assert(TarHeader.sizeof == 512);
479 
/// Value of the typeflag byte of a Tar header.
/// Apart from normalNul, members are the ASCII digits '0' to '7'.
private enum Typeflag : ubyte
{
    // regular file flagged with a NUL byte; handled like `normal` in this module
    normalNul = 0,
    normal = '0',
    hardLink = '1',
    symLink = '2',
    charSpecial = '3',
    blockSpecial = '4',
    directory = '5',
    fifo = '6',
    contiguousFile = '7',
}
492 
/// Maps an entry type to the typeflag to write in a Tar header.
Typeflag toTypeflag(EntryType type)
{
    Typeflag flag;

    final switch (type)
    {
    case EntryType.regular:
        flag = Typeflag.normal;
        break;
    case EntryType.directory:
        flag = Typeflag.directory;
        break;
    case EntryType.symlink:
        flag = Typeflag.symLink;
        break;
    }

    return flag;
}
505 
/// Maps the typeflag of a Tar header to an entry type.
/// Every flag other than directory and symbolic link maps to a regular entry.
EntryType toEntryType(Typeflag flag)
{
    if (flag == Typeflag.directory)
        return EntryType.directory;

    if (flag == Typeflag.symLink)
        return EntryType.symlink;

    return EntryType.regular;
}
518 
version (Posix)
{
    /// Returns the file-type bits of stat.st_mode that correspond to a
    /// Tar typeflag.
    ///
    /// The flag comes straight from the archive and may hold a value that is
    /// not a member of Typeflag (e.g. extension headers written by other
    /// tools). Such values get the regular file bits, in line with
    /// toEntryType, which reports them as regular entries.
    uint posixModeFileType(Typeflag flag)
    {
        import std.conv : octal;

        // a plain switch with a default is required here: a final switch on a
        // value outside of the enum is a runtime error
        switch (flag)
        {
        case Typeflag.normalNul:
        case Typeflag.normal:
            return octal!100_000;
        case Typeflag.hardLink:
            // is regular file right for hard links?
            return octal!100_000;
        case Typeflag.symLink:
            return octal!120_000;
        case Typeflag.charSpecial:
            return octal!20_000;
        case Typeflag.blockSpecial:
            return octal!60_000;
        case Typeflag.directory:
            return octal!40_000;
        case Typeflag.fifo:
            return octal!10_000;
        case Typeflag.contiguousFile:
            // is regular file right for contiguous files?
            return octal!100_000;
        default:
            // typeflag unknown to this module
            return octal!100_000;
        }
    }
}
550 
/// Adds up the bytes of `buf`, each one taken as an unsigned value.
private uint unsignedSum(const(char)[] buf)
{
    uint total = 0;
    // iterating by char visits code units, without UTF decoding
    foreach (char c; buf)
        total += cast(ubyte) c;
    return total;
}
560 
/// Writes `val` in octal into `buf`, padded with leading zeros so that the
/// whole buffer is filled.
private void toOctalString(T)(T val, char[] buf)
{
    import std.format : sformat;

    // the '*' width is taken from the argument list: the length of the buffer
    buf.sformat("%0*o", buf.length, val);
}
567 
/// Parses the octal number stored in a Tar header numeric field.
///
/// Leading spaces and trailing NUL or space characters are ignored, because
/// tar writers use either as padding or terminator.
///
/// Params:
///   octal = the raw content of the field
/// Returns: the parsed value, or 0 if the field holds nothing but padding.
/// Throws: std.conv.ConvException if the field does not start with an octal
///   digit after the padding is removed, or if the value overflows T.
private T parseOctalString(T = uint)(const(char)[] octal)
{
    import std.conv : parse;

    // work on code units directly: the field is raw bytes, not UTF-8 text
    size_t end = octal.length;
    while (end > 0 && (octal[end - 1] == '\0' || octal[end - 1] == ' '))
        end--;

    size_t start = 0;
    while (start < end && octal[start] == ' ')
        start++;

    if (start == end)
        return 0;

    auto src = octal[start .. end];

    return parse!(T)(src, 8);
}
583 
/// Returns the text stored in a fixed-size Tar header field.
///
/// The text ends at the first NUL character, or at the end of the field if
/// it holds no NUL: a field that is completely filled has no terminator, so
/// the search must never go past `chars`.
///
/// Returns: a slice of `chars` (no copy is made).
private char[] parseString(char[] chars)
{
    foreach (i, c; chars)
    {
        if (c == '\0')
            return chars[0 .. i];
    }
    return chars;
}
591 
/// Rounds `off` up to the nearest multiple of 512.
/// A value that is already a multiple of 512 is returned unchanged.
private size_t next512(size_t off)
{
    enum size_t mask = 511;
    // adding 511 then clearing the 9 low bits rounds up to a 512 boundary
    return (off + mask) & ~mask;
}
599 
@("next512")
unittest
{
    // each row is [offset, offset rounded up to a multiple of 512]
    static immutable size_t[2][] cases = [
        [0, 0],
        [1, 512],
        [300, 512],
        [511, 512],
        [512, 512],
        [1024, 1024],
        [1025, 1536],
        [1225, 1536],
        [1535, 1536],
        [1536, 1536],
    ];

    foreach (row; cases)
    {
        assert(next512(row[0]) == row[1]);
    }
}