001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.digest;
019
020import java.io.ByteArrayOutputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.nio.charset.StandardCharsets;
024import java.nio.file.DirectoryStream;
025import java.nio.file.Files;
026import java.nio.file.Path;
027import java.security.MessageDigest;
028import java.util.HashMap;
029import java.util.Map;
030import java.util.Objects;
031import java.util.Set;
032import java.util.TreeSet;
033import java.util.function.Supplier;
034
035/**
036 * Computes <a href="https://git-scm.com/">Git</a> object identifiers and their generalizations described by the
037 * <a href="https://www.swhid.org/swhid-specification/">SWHID specification</a>.
038 *
039 * <p>When the hash algorithm is SHA-1, the identifiers produced by this class are identical to those used by Git.
040 * Other hash algorithms produce generalized identifiers as described by the SWHID specification.</p>
041 *
042 * <p>This class is immutable and thread-safe. However, the {@link MessageDigest} instances passed to it generally won't be.</p>
043 *
044 * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
045 * @see <a href="https://www.swhid.org/swhid-specification/">SWHID Specification</a>
046 * @since 1.22.0
047 */
048public class GitIdentifiers {
049
050    /**
051     * Represents a single entry in a Git tree object.
052     *
053     * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p>
054     * <ul>
055     *   <li>a {@link FileMode} that determines the Unix file mode (e.g. {@code 100644} for a regular file),</li>
056     *   <li>the entry name (file or directory name, without a path separator),</li>
057     *   <li>the raw object id of the referenced blob or sub-tree.</li>
058     * </ul>
059     *
060     * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/}
061     * sorts after {@code foobar}.</p>
062     *
063     * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
064     * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a>
065     */
066    static class DirectoryEntry implements Comparable<DirectoryEntry> {
067
068        /**
069         * The entry name (file or directory name, no path separator).
070         */
071        private final String name;
072
073        /**
074         * The raw object id of the referenced blob or sub-tree.
075         */
076        private final byte[] rawObjectId;
077
078        /**
079         * The key used for ordering entries within a tree object.
080         *
081         * <p>>Git appends {@code '/'} to directory names before comparing.</p>
082         */
083        private final String sortKey;
084
085        /**
086         * The Git object type, which determines the Unix file-mode prefix.
087         */
088        private final FileMode type;
089
090        /**
091         * Constructs a new entry.
092         *
093         * @param name The name of the entry, not containing {@code '/'}.
094         * @param type The type of the entry, not null.
095         * @param rawObjectId The id of the entry, not null.
096         */
097        DirectoryEntry(final String name, final FileMode type, final byte[] rawObjectId) {
098            if (Objects.requireNonNull(name).indexOf('/') >= 0) {
099                throw new IllegalArgumentException("Entry name must not contain '/': " + name);
100            }
101            this.name = name;
102            this.type = Objects.requireNonNull(type);
103            this.sortKey = type == FileMode.DIRECTORY ? name + "/" : name;
104            this.rawObjectId = Objects.requireNonNull(rawObjectId);
105        }
106
107        @Override
108        public int compareTo(final DirectoryEntry o) {
109            return sortKey.compareTo(o.sortKey);
110        }
111
112        @Override
113        public boolean equals(final Object obj) {
114            if (obj == this) {
115                return true;
116            }
117            if (!(obj instanceof DirectoryEntry)) {
118                return false;
119            }
120            final DirectoryEntry other = (DirectoryEntry) obj;
121            return name.equals(other.name);
122        }
123
124        @Override
125        public int hashCode() {
126            return name.hashCode();
127        }
128
129    }
130
131    /**
132     * The type of a Git tree entry, which maps to a Unix file-mode string.
133     *
134     * <p>Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here
135     * cover the four entry types that Git itself produces.</p>
136     *
137     * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
138     */
139    public enum FileMode {
140
141        /**
142         * A subdirectory. Subdirectories can only be specified by SHA or through a tree mark set with {@code --import-marks}.
143         *
144         * @see <a href="https://git-scm.com/docs/git-fast-import">git-fast-import - Backend for fast Git data importers</a>
145         */
146        DIRECTORY(new byte[] { '4', '0', '0', '0', '0' }),
147
148        /**
149         * A regular, but executable, file.
150         */
151        EXECUTABLE(new byte[] { '1', '0', '0', '7', '5', '5' }),
152
153        /**
154         * A gitlink, SHA-1 of the object refers to a commit in another repository. Git links can only be specified either by SHA or through a commit mark. They
155         * are used to implement submodules.
156         *
157         * @see <a href="https://git-scm.com/docs/gitdatamodel">gitdatamodel - Git&#39;s core data model</a>
158         * @see <a href="https://git-scm.com/docs/git-fast-import">git-fast-import - Backend for fast Git data importers</a>
159         */
160        GIT_LINK(new byte[] { '1', '6', '0', '0', '0', '0' }),
161
162        /**
163         * A regular (non-executable) file.
164         * <p>
165         * The majority of files in most projects use this mode. If in doubt, this is what you want.
166         * </p>
167         */
168        REGULAR(new byte[] { '1', '0', '0', '6', '4', '4' }),
169
170        /**
171         * A symbolic link. The content of the file will be the link target.
172         */
173        SYMBOLIC_LINK(new byte[] { '1', '2', '0', '0', '0', '0' });
174
175        private static FileMode get(final Path path) {
176            // Symbolic links first
177            if (Files.isSymbolicLink(path)) {
178                return SYMBOLIC_LINK;
179            }
180            if (Files.isDirectory(path)) {
181                return DIRECTORY;
182            }
183            if (Files.isExecutable(path)) {
184                return EXECUTABLE;
185            }
186            return REGULAR;
187        }
188
189        /**
190         * Serialized {@code mode}: since this is mutable, it must remain private.
191         */
192        private final byte[] modeBytes;
193
194        FileMode(final byte[] modeBytes) {
195            // No need for a defensive copy since the array is private and never exposed,
196            this.modeBytes = modeBytes;
197        }
198    }
199
200    /**
201     * Builds a Git tree identifier for a virtual directory structure, such as the contents of
202     * an archive.
203     */
204    public static final class TreeIdBuilder implements Supplier<byte[]> {
205
206        /**
207         * Supplies a blob identifier that may throw {@link IOException}.
208         */
209        @FunctionalInterface
210        private interface BlobIdSupplier {
211            byte[] get() throws IOException;
212        }
213
214        private static String requireNoParentTraversal(final String name) {
215            if ("..".equals(name)) {
216                throw new IllegalArgumentException("Path component not allowed: " + name);
217            }
218            return name;
219        }
220
221        private final Map<String, TreeIdBuilder> dirEntries = new HashMap<>();
222        private final Map<String, DirectoryEntry> fileEntries = new HashMap<>();
223        private final MessageDigest messageDigest;
224
225        private TreeIdBuilder(final MessageDigest messageDigest) {
226            this.messageDigest = Objects.requireNonNull(messageDigest);
227        }
228
229        /**
230         * Adds and returns the {@link TreeIdBuilder} for the named subdirectory, creating it if absent.
231         *
232         * @param name The relative path of the subdirectory in normalized form (may contain {@code '/'}).
233         * @return The {@link TreeIdBuilder} for the subdirectory.
234         * @throws IllegalArgumentException If any path component is {@code ".."}.
235         */
236        public TreeIdBuilder addDirectory(final String name) {
237            TreeIdBuilder current = this;
238            for (final String component : name.split("/", -1)) {
239                // Noop segments
240                if (component.isEmpty() || ".".equals(component)) {
241                    continue;
242                }
243                current = current.dirEntries.computeIfAbsent(requireNoParentTraversal(component), k -> new TreeIdBuilder(messageDigest));
244            }
245            return current;
246        }
247
248        private void addFile(final FileMode mode, final String name, final BlobIdSupplier blobId) throws IOException {
249            final int slash = name.lastIndexOf('/');
250            if (slash < 0) {
251                fileEntries.put(name, new DirectoryEntry(requireNoParentTraversal(name), mode, blobId.get()));
252            } else {
253                addDirectory(name.substring(0, slash)).addFile(mode, name.substring(slash + 1), blobId);
254            }
255        }
256
257        /**
258         * Adds a file entry at the given path within this tree.
259         *
260         * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
261         *
262         * @param mode The file mode (e.g. {@link FileMode#REGULAR}).
263         * @param name The relative path of the entry in normalized form(may contain {@code '/'}).
264         * @param data The file content.
265         * @throws IOException If an I/O error occurs.
266         * @throws IllegalArgumentException If any path component is {@code ".."}.
267         */
268        public void addFile(final FileMode mode, final String name, final byte[] data) throws IOException {
269            addFile(mode, name, () -> blobId(messageDigest, data));
270        }
271
272        /**
273         * Adds a file entry at the given path within this tree, streaming content without buffering.
274         *
275         * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
276         *
277         * <p>The stream is eagerly drained.</p>
278         *
279         * @param mode     The file mode (e.g. {@link FileMode#REGULAR}).
280         * @param name The relative path of the entry in normalized form(may contain {@code '/'}).
281         * @param dataSize The exact number of bytes in {@code data}.
282         * @param data     The file content.
283         * @throws IOException If the stream cannot be read.
284         * @throws IllegalArgumentException If any path component is {@code ".."}.
285         */
286        public void addFile(final FileMode mode, final String name, final long dataSize, final InputStream data) throws IOException {
287            addFile(mode, name, () -> blobId(messageDigest, dataSize, data));
288        }
289
290        /**
291         * Adds a symbolic link entry at the given path within this tree.
292         *
293         * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
294         *
295         * @param name The relative path of the entry in normalized form(may contain {@code '/'}).
296         * @param target The target of the symbolic link.
297         * @throws IOException If an I/O error occurs.
298         * @throws IllegalArgumentException If any path component is {@code ".."}.
299         */
300        public void addSymbolicLink(final String name, final String target) throws IOException {
301            addFile(FileMode.SYMBOLIC_LINK, name, target.getBytes(StandardCharsets.UTF_8));
302        }
303
304        /**
305         * Computes the Git tree identifier for this directory and all its descendants.
306         *
307         * @return The raw tree identifier bytes.
308         */
309        @Override
310        public byte[] get() {
311            final Set<DirectoryEntry> entries = new TreeSet<>(fileEntries.values());
312            dirEntries.forEach((k, v) -> entries.add(new DirectoryEntry(k, FileMode.DIRECTORY, v.get())));
313            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
314            for (final DirectoryEntry entry : entries) {
315                baos.write(entry.type.modeBytes, 0, entry.type.modeBytes.length);
316                baos.write(' ');
317                final byte[] bytes = entry.name.getBytes(StandardCharsets.UTF_8);
318                baos.write(bytes, 0, bytes.length);
319                baos.write('\0');
320                baos.write(entry.rawObjectId, 0, entry.rawObjectId.length);
321            }
322            messageDigest.reset();
323            DigestUtils.updateDigest(messageDigest, getGitTreePrefix(baos.size()));
324            return DigestUtils.updateDigest(messageDigest, baos.toByteArray()).digest();
325        }
326
327        private TreeIdBuilder populate(final Path directory) throws IOException {
328            try (DirectoryStream<Path> files = Files.newDirectoryStream(directory)) {
329                for (final Path path : files) {
330                    final String name = Objects.toString(path.getFileName());
331                    final FileMode mode = FileMode.get(path);
332                    if (mode == FileMode.DIRECTORY) {
333                        addDirectory(name).populate(path);
334                    } else {
335                        addFile(mode, name, () -> blobId(messageDigest, path));
336                    }
337                }
338            }
339            return this;
340        }
341    }
342
343    /**
344     * Reads through a byte array and returns a generalized Git blob identifier.
345     *
346     * <p>The identifier is computed in the way described by the
347     * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
348     * algorithm.</p>
349     *
350     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
351     *
352     * @param messageDigest The MessageDigest to use (for example SHA-1).
353     * @param data          Data to digest.
354     * @return A generalized Git blob identifier.
355     */
356    public static byte[] blobId(final MessageDigest messageDigest, final byte[] data) {
357        messageDigest.reset();
358        DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(data.length));
359        return DigestUtils.digest(messageDigest, data);
360    }
361
362    /**
363     * Reads through a stream of known size and returns a generalized Git blob identifier, without buffering.
364     *
365     * <p>When the size of the content is known in advance, this overload streams {@code data} directly through
366     * the digest without buffering the full content in memory.</p>
367     *
368     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
369     *
370     * @param messageDigest The MessageDigest to use (for example SHA-1).
371     * @param dataSize      The exact number of bytes in {@code data}.
372     * @param data          Stream to digest.
373     * @return A generalized Git blob identifier.
374     * @throws IOException On error reading the stream.
375     */
376    public static byte[] blobId(final MessageDigest messageDigest, final long dataSize, final InputStream data) throws IOException {
377        messageDigest.reset();
378        DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(dataSize));
379        return DigestUtils.updateDigest(messageDigest, data).digest();
380    }
381
382    /**
383     * Reads through a file and returns a generalized Git blob identifier.
384     *
385     * <p>The identifier is computed in the way described by the
386     * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
387     * algorithm.</p>
388     *
389     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
390     *
391     * @param messageDigest The MessageDigest to use (for example SHA-1).
392     * @param data          Path to the file to digest.
393     * @return A generalized Git blob identifier.
394     * @throws IOException On error accessing the file.
395     */
396    public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException {
397        if (Files.isSymbolicLink(data)) {
398            final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8);
399            return blobId(messageDigest, linkTarget);
400        }
401        messageDigest.reset();
402        DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(Files.size(data)));
403        return DigestUtils.updateDigest(messageDigest, data).digest();
404    }
405
406    private static byte[] getGitBlobPrefix(final long dataSize) {
407        return getGitPrefix("blob", dataSize);
408    }
409
410    private static byte[] getGitPrefix(final String type, final long dataSize) {
411        return (type + " " + dataSize + "\0").getBytes(StandardCharsets.UTF_8);
412    }
413
414    private static byte[] getGitTreePrefix(final long dataSize) {
415        return getGitPrefix("tree", dataSize);
416    }
417
418    /**
419     * Reads through a directory and returns a generalized Git tree identifier.
420     *
421     * <p>The identifier is computed in the way described by the
422     * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
423     * algorithm.</p>
424     *
425     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
426     *
427     * @param messageDigest The MessageDigest to use (for example SHA-1).
428     * @param data          Path to the directory to digest.
429     * @return A generalized Git tree identifier.
430     * @throws IOException On error accessing the directory or its contents.
431     */
432    public static byte[] treeId(final MessageDigest messageDigest, final Path data) throws IOException {
433        return treeIdBuilder(messageDigest).populate(data).get();
434    }
435
436    /**
437     * Returns a new {@link TreeIdBuilder} for constructing a generalized Git tree identifier from a virtual directory
438     * structure, such as the contents of an archive.
439     *
440     * <p>The identifier is computed in the way described by the
441     * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
442     * algorithm.</p>
443     *
444     * <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
445     *
446     * @param messageDigest The MessageDigest to use (for example SHA-1).
447     * @return A new {@link TreeIdBuilder}.
448     */
449    public static TreeIdBuilder treeIdBuilder(final MessageDigest messageDigest) {
450        return new TreeIdBuilder(messageDigest);
451    }
452
453    private GitIdentifiers() {
454        // utility class
455    }
456}