1// Copyright 2017 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package finder
16
17import (
18 "bufio"
19 "bytes"
20 "encoding/json"
21 "errors"
22 "fmt"
23 "io"
24 "os"
25 "path/filepath"
26 "runtime"
27 "sort"
28 "strings"
29 "sync"
30 "sync/atomic"
31 "time"
32
33 "android/soong/finder/fs"
34)
35
36// This file provides a Finder struct that can quickly search for files satisfying
37// certain criteria.
38// This Finder gets its speed partially from parallelism and partially from caching.
39// If a Stat call returns the same result as last time, then it means Finder
40// can skip the ReadDir call for that dir.
41
42// The primary data structure used by the finder is the field Finder.nodes ,
43// which is a tree of nodes of type *pathMap .
44// Each node represents a directory on disk, along with its stats, subdirectories,
45// and contained files.
46
47// The common use case for the Finder is that the caller creates a Finder and gives
48// it the same query that was given to it in the previous execution.
49// In this situation, the major events that take place are:
50// 1. The Finder begins to load its db
51// 2. The Finder begins to stat the directories mentioned in its db (using multiple threads)
52// Calling Stat on each of these directories is generally a large fraction of the total time
53// 3. The Finder begins to construct a separate tree of nodes in each of its threads
54// 4. The Finder merges the individual node trees into the main node tree
55// 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date
56// These ReadDir calls might prompt additional Stat calls, etc
57// 6. The Finder waits for all loading to complete
58// 7. The Finder searches the cache for files matching the user's query (using multiple threads)
59
60// These are the invariants regarding concurrency:
61// 1. The public methods of Finder are threadsafe.
62// The public methods are only performance-optimized for one caller at a time, however.
63// For the moment, multiple concurrent callers shouldn't expect any better performance than
64// multiple serial callers.
65// 2. While building the node tree, only one thread may ever access the <children> collection of a
66// *pathMap at once.
67// a) The thread that accesses the <children> collection is the thread that discovers the
68// children (by reading them from the cache or by having received a response to ReadDir).
69// 1) Consequently, the thread that discovers the children also spawns requests to stat
70// subdirs.
71// b) Consequently, while building the node tree, no thread may do a lookup of its
72// *pathMap via filepath because another thread may be adding children to the
73// <children> collection of an ancestor node. Additionally, in rare cases, another thread
74// may be removing children from an ancestor node if the children were only discovered to
75// be irrelevant after calling ReadDir (which happens if a prune-file was just added).
76// 3. No query will begin to be serviced until all loading (both reading the db
77// and scanning the filesystem) is complete.
78// Tests indicate that it only takes about 10% as long to search the in-memory cache as to
79// generate it, making this not a huge loss in performance.
80// 4. The parsing of the db and the initial setup of the pathMap tree must complete before
81// beginning to call listDirSync (because listDirSync can create new entries in the pathMap)
82
83// see cmd/finder.go or finder_test.go for usage examples
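//
// As a rough usage sketch (cmd/finder.go and finder_test.go remain the authoritative
// examples; fs.OsFs and all concrete values below are illustrative assumptions):
//
//     cacheParams := CacheParams{
//         WorkingDirectory: "/repo",
//         RootDirs:         []string{"."},
//         ExcludeDirs:      []string{".git"},
//         IncludeFiles:     []string{"Android.bp"},
//     }
//     logger := log.New(os.Stderr, "", log.LstdFlags)
//     f, err := New(cacheParams, fs.OsFs, logger, "/tmp/finder-db")
//     if err != nil {
//         // handle the error
//     }
//     matches := f.FindNamedAt(".", "Android.bp")
//     f.Shutdown() // wait for the updated cache db to be written
//     _ = matches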
84
85// Update versionString whenever making a backwards-incompatible change to the cache file format
86const versionString = "Android finder version 1"
87
88// a CacheParams specifies which files and directories the user wants scanned and
89// potentially added to the cache
90type CacheParams struct {
91 // WorkingDirectory is used as a base for any relative file paths given to the Finder
92 WorkingDirectory string
93
94 // RootDirs are the root directories used to initiate the search
95 RootDirs []string
96
97 // Whether symlinks to directories are followed. If set, symlinks that point back
98 // to their own parent directory are not supported.
99 FollowSymlinks bool
100
101 // ExcludeDirs are directory names that if encountered are removed from the search
102 ExcludeDirs []string
103
104 // PruneFiles are file names that if encountered prune their entire directory
105 // (including siblings)
106 PruneFiles []string
107
108 // IncludeFiles are file names to include as matches
109 IncludeFiles []string
110
111 // IncludeSuffixes are filename suffixes to include as matches.
112 IncludeSuffixes []string
113}
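// To illustrate the intent of the fields above (hypothetical values, not defaults):
// with IncludeFiles = ["Android.bp"], IncludeSuffixes = [".bzl"], ExcludeDirs = [".git"]
// and PruneFiles = [".find-ignore"], a directory containing
//
//     .git/  src/  Android.bp  build.bzl  notes.txt
//
// is cached as {DirNames: [src], FileNames: [Android.bp, build.bzl]}, while a
// directory that contains a .find-ignore file is dropped entirely, siblings included.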
114
115// a cacheConfig stores the inputs that determine what should be included in the cache
116type cacheConfig struct {
117 CacheParams
118
119 // FilesystemView is a unique identifier telling which parts of which file systems
120 // are readable by the Finder. In practice its value is essentially username@hostname.
121 // FilesystemView is set to ensure that a cache file copied to another host or
122 // found by another user doesn't inadvertently get reused.
123 FilesystemView string
124}
125
126func (p *cacheConfig) Dump() ([]byte, error) {
127 bytes, err := json.Marshal(p)
128 return bytes, err
129}
130
131// a cacheMetadata stores version information about the cache
132type cacheMetadata struct {
133 // The Version enables the Finder to determine whether it can even parse the file
134 // If the version changes, the entire cache file must be regenerated
135 Version string
136
137 // The CacheParams enables the Finder to determine whether the parameters match
138 // If the CacheParams change, the Finder can choose how much of the cache file to reuse
139 // (although in practice, the Finder will probably choose to ignore the entire file anyway)
140 Config cacheConfig
141}
142
143type Logger interface {
144 Output(calldepth int, s string) error
145}
146
147// the Finder is the main struct that callers will want to use
148type Finder struct {
149 // configuration
150 DbPath string
151 numDbLoadingThreads int
152 numSearchingThreads int
153 cacheMetadata cacheMetadata
154 logger Logger
155 filesystem fs.FileSystem
156
157 // temporary state
158 threadPool *threadPool
159 mutex sync.Mutex
160 fsErrs []fsErr
161 errlock sync.Mutex
162 shutdownWaitgroup sync.WaitGroup
163
164 // non-temporary state
165 modifiedFlag int32
166 nodes pathMap
167}
168
169var defaultNumThreads = runtime.NumCPU() * 2
170
171// New creates a new Finder for use
172func New(cacheParams CacheParams, filesystem fs.FileSystem,
173 logger Logger, dbPath string) (f *Finder, err error) {
174 return newImpl(cacheParams, filesystem, logger, dbPath, defaultNumThreads)
175}
176
177// newImpl is like New but accepts more params
178func newImpl(cacheParams CacheParams, filesystem fs.FileSystem,
179 logger Logger, dbPath string, numThreads int) (f *Finder, err error) {
180 numDbLoadingThreads := numThreads
181 numSearchingThreads := numThreads
182
183 metadata := cacheMetadata{
184 Version: versionString,
185 Config: cacheConfig{
186 CacheParams: cacheParams,
187 FilesystemView: filesystem.ViewId(),
188 },
189 }
190
191 f = &Finder{
192 numDbLoadingThreads: numDbLoadingThreads,
193 numSearchingThreads: numSearchingThreads,
194 cacheMetadata: metadata,
195 logger: logger,
196 filesystem: filesystem,
197
198 nodes: *newPathMap("/"),
199 DbPath: dbPath,
200
201 shutdownWaitgroup: sync.WaitGroup{},
202 }
203
204 f.loadFromFilesystem()
205
206 // check for any filesystem errors
207 err = f.getErr()
208 if err != nil {
209 return nil, err
210 }
211
212 // confirm that every path mentioned in the CacheConfig exists
213 for _, path := range cacheParams.RootDirs {
214 if !filepath.IsAbs(path) {
215 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
216 }
217 node := f.nodes.GetNode(filepath.Clean(path), false)
218 if node == nil || node.ModTime == 0 {
219 return nil, fmt.Errorf("path %v was specified to be included in the cache but does not exist\n", path)
220 }
221 }
222
223 return f, nil
224}
225
226// FindAll searches for every cached file
227func (f *Finder) FindAll() []string {
228 return f.FindAt("/")
229}
230
231// FindAt searches for every cached file under <rootDir>
232func (f *Finder) FindAt(rootDir string) []string {
233 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
234 return entries.DirNames, entries.FileNames
235 }
236 return f.FindMatching(rootDir, filter)
237}
238
239// FindNamed searches for every cached file named <fileName>
240func (f *Finder) FindNamed(fileName string) []string {
241 return f.FindNamedAt("/", fileName)
242}
243
244// FindNamedAt searches under <rootPath> for every file named <fileName>
245// The reason a caller might use FindNamedAt instead of FindNamed is if they want
246// to limit their search to a subset of the cache
247func (f *Finder) FindNamedAt(rootPath string, fileName string) []string {
248 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
249 matches := []string{}
250 for _, foundName := range entries.FileNames {
251 if foundName == fileName {
252 matches = append(matches, foundName)
253 }
254 }
255 return entries.DirNames, matches
256
257 }
258 return f.FindMatching(rootPath, filter)
259}
260
261// FindFirstNamed searches for every file named <fileName>
262// Whenever it finds a match, it stops searching subdirectories
263func (f *Finder) FindFirstNamed(fileName string) []string {
264 return f.FindFirstNamedAt("/", fileName)
265}
266
267// FindFirstNamedAt searches for every file named <fileName>
268// Whenever it finds a match, it stops searching subdirectories
269func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string {
270 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
271 matches := []string{}
272 for _, foundName := range entries.FileNames {
273 if foundName == fileName {
274 matches = append(matches, foundName)
275 }
276 }
277
278 if len(matches) > 0 {
279 return []string{}, matches
280 }
281 return entries.DirNames, matches
282 }
283 return f.FindMatching(rootPath, filter)
284}
285
286// FindMatching is the most general exported function for searching for files in the cache
287// The WalkFunc will be invoked repeatedly and is expected to return, from the provided
288// DirEntries, the directories to descend into and the file paths to report as matches.
289// WalkFunc will be invoked potentially many times in parallel, and must be threadsafe.
290func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string {
291 // set up some parameters
292 scanStart := time.Now()
293 var isRel bool
294 workingDir := f.cacheMetadata.Config.WorkingDirectory
295
296 isRel = !filepath.IsAbs(rootPath)
297 if isRel {
298 rootPath = filepath.Join(workingDir, rootPath)
299 }
300
301 rootPath = filepath.Clean(rootPath)
302
303 // ensure nothing else is using the Finder
304 f.verbosef("FindMatching waiting for finder to be idle\n")
305 f.lock()
306 defer f.unlock()
307
308 node := f.nodes.GetNode(rootPath, false)
309 if node == nil {
310 f.verbosef("No data for path %v ; apparently not included in cache params: %v\n",
311 rootPath, f.cacheMetadata.Config.CacheParams)
312 // path is not found; don't do a search
313 return []string{}
314 }
315
316 // search for matching files
317 f.verbosef("Finder finding %v using cache\n", rootPath)
318 results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads)
319
320 // format and return results
321 if isRel {
322 for i := 0; i < len(results); i++ {
323 results[i] = strings.Replace(results[i], workingDir+"/", "", 1)
324 }
325 }
326 sort.Strings(results)
327 f.verbosef("Found %v files under %v in %v using cache\n",
328 len(results), rootPath, time.Since(scanStart))
329 return results
330}
331
332// Shutdown declares that the finder is no longer needed and waits for its cleanup to complete
333// Currently, that only entails waiting for the database dump to complete.
334func (f *Finder) Shutdown() {
335 f.WaitForDbDump()
336}
337
338// WaitForDbDump returns once the database has been written to f.DbPath.
339func (f *Finder) WaitForDbDump() {
340 f.shutdownWaitgroup.Wait()
341}
342
343// End of public api
344
345func (f *Finder) goDumpDb() {
346 if f.wasModified() {
347 f.shutdownWaitgroup.Add(1)
348 go func() {
349 err := f.dumpDb()
350 if err != nil {
351 f.verbosef("%v\n", err)
352 }
353 f.shutdownWaitgroup.Done()
354 }()
355 } else {
356 f.verbosef("Skipping dumping unmodified db\n")
357 }
358}
359
360// joinCleanPaths is like filepath.Join but is faster because
361// joinCleanPaths doesn't have to support paths ending in "/" or containing ".."
362func joinCleanPaths(base string, leaf string) string {
363 if base == "" {
364 return leaf
365 }
366 if base == "/" {
367 return base + leaf
368 }
369 if leaf == "" {
370 return base
371 }
372 return base + "/" + leaf
373}
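// For example, given the restrictions above:
//
//     joinCleanPaths("", "a")   == "a"
//     joinCleanPaths("/", "a")  == "/a"
//     joinCleanPaths("/a", "b") == "/a/b"
//     joinCleanPaths("/a", "")  == "/a"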
374
375func (f *Finder) verbosef(format string, args ...interface{}) {
376 f.logger.Output(2, fmt.Sprintf(format, args...))
377}
378
379// loadFromFilesystem populates the in-memory cache based on the contents of the filesystem
380func (f *Finder) loadFromFilesystem() {
381 f.threadPool = newThreadPool(f.numDbLoadingThreads)
382
383 err := f.startFromExternalCache()
384 if err != nil {
385 f.startWithoutExternalCache()
386 }
387
388 f.goDumpDb()
389
390 f.threadPool = nil
391}
392
393func (f *Finder) startFind(path string) {
394 if !filepath.IsAbs(path) {
395 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
396 }
397 node := f.nodes.GetNode(path, true)
398 f.statDirAsync(node)
399}
400
401func (f *Finder) lock() {
402 f.mutex.Lock()
403}
404
405func (f *Finder) unlock() {
406 f.mutex.Unlock()
407}
408
409// a statResponse is the relevant portion of the response from the filesystem to a Stat call
410type statResponse struct {
411 ModTime int64
412 Inode uint64
413 Device uint64
414}
415
416// a pathAndStats stores a path and its stats
417type pathAndStats struct {
418 statResponse
419
420 Path string
421}
422
423// a dirFullInfo stores all of the relevant information we know about a directory
424type dirFullInfo struct {
425 pathAndStats
426
427 FileNames []string
428}
429
430// a PersistedDirInfo is the information about a dir that we save to our cache on disk
431type PersistedDirInfo struct {
432 // These field names are short because they are repeated many times in the output json file
433 P string // path
434 T int64 // modification time
435 I uint64 // inode number
436 F []string // relevant filenames contained
437}
438
439// a PersistedDirs is the information that we persist for a group of dirs
440type PersistedDirs struct {
441 // the device on which each directory is stored
442 Device uint64
443 // the common root path to which all contained dirs are relative
444 Root string
445 // the directories themselves
446 Dirs []PersistedDirInfo
447}
448
449// a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time
450type CacheEntry []PersistedDirs
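// As a sketch of the on-disk shape (field values invented for illustration), one
// serialized CacheEntry might look like:
//
//     [{"Device":2049,"Root":"/repo/frameworks",
//       "Dirs":[{"P":"base","T":1510000000000000000,"I":42,"F":["Android.bp"]}]}]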
451
452// a DirEntries lists the files and directories contained directly within a specific directory
453type DirEntries struct {
454 Path string
455
456 // elements of DirNames are just the dir names; they don't include any '/' character
457 DirNames []string
458 // elements of FileNames are just the file names; they don't include any '/' character
459 FileNames []string
460}
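// For example, a DirEntries for a directory containing subdirs "runtime" and "test"
// plus a file "Android.bp" (names chosen for illustration) would be:
//
//     DirEntries{Path: "/repo/art", DirNames: []string{"runtime", "test"},
//         FileNames: []string{"Android.bp"}}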
461
462// a WalkFunc is the type that is passed into various Find functions for determining which
463// directories the caller wants walked. The WalkFunc is expected to decide which
464// directories to walk and which files to consider as matches to the original query.
465type WalkFunc func(DirEntries) (dirs []string, files []string)
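//
// A minimal sketch of a WalkFunc (the "out" and ".bp" names are illustrative) that
// skips directories named "out" and matches files ending in ".bp":
//
//     filter := func(entries DirEntries) (dirs []string, files []string) {
//         for _, d := range entries.DirNames {
//             if d != "out" {
//                 dirs = append(dirs, d)
//             }
//         }
//         for _, name := range entries.FileNames {
//             if strings.HasSuffix(name, ".bp") {
//                 files = append(files, name)
//             }
//         }
//         return dirs, files
//     }
//     // paths := f.FindMatching("/repo", filter), where f is a *Finder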
466
467// a mapNode stores the relevant stats about a directory to be stored in a pathMap
468type mapNode struct {
469 statResponse
470 FileNames []string
471}
472
473// a pathMap implements the directory tree structure of nodes
474type pathMap struct {
475 mapNode
476
477 path string
478
479 children map[string]*pathMap
480
481 // number of descendent nodes, including self
482 approximateNumDescendents int
483}
484
485func newPathMap(path string) *pathMap {
486 result := &pathMap{path: path, children: make(map[string]*pathMap, 4),
487 approximateNumDescendents: 1}
488 return result
489}
490
491// GetNode returns the node at <path>
492func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap {
493 if len(path) > 0 && path[0] == '/' {
494 path = path[1:]
495 }
496
497 node := m
498 for {
499 if path == "" {
500 return node
501 }
502
503 index := strings.Index(path, "/")
504 var firstComponent string
505 if index >= 0 {
506 firstComponent = path[:index]
507 path = path[index+1:]
508 } else {
509 firstComponent = path
510 path = ""
511 }
512
513 child, found := node.children[firstComponent]
514
515 if !found {
516 if createIfNotFound {
517 child = node.newChild(firstComponent)
518 } else {
519 return nil
520 }
521 }
522
523 node = child
524 }
525}
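// For example (assuming m is the root node for "/"):
//
//     node := m.GetNode("/a/b", true)  // creates the intermediate node "a" and then "b" if absent
//     miss := m.GetNode("/a/c", false) // returns nil if "c" has never been added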
526
527func (m *pathMap) newChild(name string) (child *pathMap) {
528 path := joinCleanPaths(m.path, name)
529 newChild := newPathMap(path)
530 m.children[name] = newChild
531
532 return m.children[name]
533}
534
535func (m *pathMap) UpdateNumDescendents() int {
536 count := 1
537 for _, child := range m.children {
538 count += child.approximateNumDescendents
539 }
540 m.approximateNumDescendents = count
541 return count
542}
543
544func (m *pathMap) UpdateNumDescendentsRecursive() {
545 for _, child := range m.children {
546 child.UpdateNumDescendentsRecursive()
547 }
548 m.UpdateNumDescendents()
549}
550
551func (m *pathMap) MergeIn(other *pathMap) {
552 for key, theirs := range other.children {
553 ours, found := m.children[key]
554 if found {
555 ours.MergeIn(theirs)
556 } else {
557 m.children[key] = theirs
558 }
559 }
560 if other.ModTime != 0 {
561 m.mapNode = other.mapNode
562 }
563 m.UpdateNumDescendents()
564}
565
566func (m *pathMap) DumpAll() []dirFullInfo {
567 results := []dirFullInfo{}
568 m.dumpInto("", &results)
569 return results
570}
571
572func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) {
573 *results = append(*results,
574 dirFullInfo{
575 pathAndStats{statResponse: m.statResponse, Path: path},
576 m.FileNames},
577 )
578 for key, child := range m.children {
579 childPath := joinCleanPaths(path, key)
580 if len(childPath) == 0 || childPath[0] != '/' {
581 childPath = "/" + childPath
582 }
583 child.dumpInto(childPath, results)
584 }
585}
586
587// a semaphore can be locked by up to <capacity> callers at once
588type semaphore struct {
589 pool chan bool
590}
591
592func newSemaphore(capacity int) *semaphore {
593 return &semaphore{pool: make(chan bool, capacity)}
594}
595
596func (l *semaphore) Lock() {
597 l.pool <- true
598}
599
600func (l *semaphore) Unlock() {
601 <-l.pool
602}
603
604// A threadPool runs goroutines and supports throttling and waiting.
605// Without throttling, Go may exhaust the maximum number of various resources, such as
606// threads or file descriptors, and crash the program.
607type threadPool struct {
608 receivedRequests sync.WaitGroup
609 activeRequests semaphore
610}
611
612func newThreadPool(maxNumConcurrentThreads int) *threadPool {
613 return &threadPool{
614 receivedRequests: sync.WaitGroup{},
615 activeRequests: *newSemaphore(maxNumConcurrentThreads),
616 }
617}
618
619// Run requests to run the given function in its own goroutine
620func (p *threadPool) Run(function func()) {
621 p.receivedRequests.Add(1)
622 // If Run() was called from within a goroutine spawned by this threadPool,
623 // then we may need to return from Run() before having capacity to actually
624 // run <function>.
625 //
626 // It's possible that the body of <function> contains a statement (such as a syscall)
627 // that will cause Go to pin it to a thread, or will contain a statement that uses
628 // another resource that is in short supply (such as a file descriptor), so we can't
629 // actually run <function> until we have capacity.
630 //
631 // However, the semaphore used for synchronization is implemented via a channel and
632 // shouldn't require a new thread for each access.
633 go func() {
634 p.activeRequests.Lock()
635 function()
636 p.activeRequests.Unlock()
637 p.receivedRequests.Done()
638 }()
639}
640
641// Wait waits until all goroutines are done, just like sync.WaitGroup's Wait
642func (p *threadPool) Wait() {
643 p.receivedRequests.Wait()
644}
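// A minimal usage sketch (dirs and process are assumed to exist elsewhere):
//
//     pool := newThreadPool(8)
//     for _, dir := range dirs {
//         dir := dir // capture the loop variable for the closure
//         pool.Run(func() { process(dir) })
//     }
//     pool.Wait()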
645
646type fsErr struct {
647 path string
648 err error
649}
650
651func (e fsErr) String() string {
652 return e.path + ": " + e.err.Error()
653}
654
655func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) {
656 // group each dirFullInfo by its Device, to avoid having to repeat it in the output
657 dirsByDevice := map[uint64][]PersistedDirInfo{}
658 for _, entry := range dirInfos {
659 _, found := dirsByDevice[entry.Device]
660 if !found {
661 dirsByDevice[entry.Device] = []PersistedDirInfo{}
662 }
663 dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device],
664 PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames})
665 }
666
667 cacheEntry := CacheEntry{}
668
669 for device, infos := range dirsByDevice {
670 // find common prefix
671 prefix := ""
672 if len(infos) > 0 {
673 prefix = infos[0].P
674 }
675 for _, info := range infos {
676 for !strings.HasPrefix(info.P+"/", prefix+"/") {
677 prefix = filepath.Dir(prefix)
678 if prefix == "/" {
679 break
680 }
681 }
682 }
683 // remove common prefix
684 for i := range infos {
685 suffix := strings.Replace(infos[i].P, prefix, "", 1)
686 if len(suffix) > 0 && suffix[0] == '/' {
687 suffix = suffix[1:]
688 }
689 infos[i].P = suffix
690 }
691
692 // turn the map (keyed by device) into a list of structs with labeled fields
693 // this is to improve readability of the output
694 cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos})
695 }
696
697 // convert to json.
698 // it would save some space to use a different format than json for the db file,
699 // but the space and time savings are small, and json is easy for humans to read
700 bytes, err := json.Marshal(cacheEntry)
701 return bytes, err
702}
703
704func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) {
705 var cacheEntry CacheEntry
706 err := json.Unmarshal(bytes, &cacheEntry)
707 if err != nil {
708 return nil, err
709 }
710
711 // convert from a CacheEntry to a []dirFullInfo (by copying a few fields)
712 capacity := 0
713 for _, element := range cacheEntry {
714 capacity += len(element.Dirs)
715 }
716 nodes := make([]dirFullInfo, capacity)
717 count := 0
718 for _, element := range cacheEntry {
719 for _, dir := range element.Dirs {
720 path := joinCleanPaths(element.Root, dir.P)
721
722 nodes[count] = dirFullInfo{
723 pathAndStats: pathAndStats{
724 statResponse: statResponse{
725 ModTime: dir.T, Inode: dir.I, Device: element.Device,
726 },
727 Path: path},
728 FileNames: dir.F}
729 count++
730 }
731 }
732 return nodes, nil
733}
734
735// We use the following separator byte to distinguish individually parseable blocks of json
736// because we know this separator won't appear in the json that we're parsing.
737//
738// The newline byte can only appear in a UTF-8 stream if the newline character appears, because:
739// - The newline character is encoded as "0000 1010" in binary ("0a" in hex)
740// - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte
741// character.
742//
743// We know that the newline character will never appear in our json string, because:
744// - If a newline character appears as part of a data string, then json encoding will
745// emit two characters instead: '\' and 'n'.
746// - The json encoder that we use doesn't emit the optional newlines between any of its
747// other outputs.
748const lineSeparator = byte('\n')
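// As a sketch of the resulting file layout (json contents elided), the db written
// by serializeDb below looks like:
//
//     Android finder version 1
//     {"WorkingDirectory":"...","RootDirs":["..."], ...}
//     [{"Device":...,"Root":"...","Dirs":[...]}]
//     [{"Device":...,"Root":"...","Dirs":[...]}]
//
// with one json block per line, separated by lineSeparator.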
749
750func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) {
751 return reader.ReadBytes(lineSeparator)
752}
753
754// validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder
755func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool {
756 cacheVersionBytes, err := f.readLine(cacheReader)
757 if err != nil {
758 f.verbosef("Failed to read database header; database is invalid\n")
759 return false
760 }
761 if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator {
762 cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1]
763 }
764 cacheVersionString := string(cacheVersionBytes)
765 currentVersion := f.cacheMetadata.Version
766 if cacheVersionString != currentVersion {
767 f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion)
768 return false
769 }
770
771 cacheParamBytes, err := f.readLine(cacheReader)
772 if err != nil {
773 f.verbosef("Failed to read database search params; database is invalid\n")
774 return false
775 }
776
777 if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator {
778 cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1]
779 }
780
781 currentParamBytes, err := f.cacheMetadata.Config.Dump()
782 if err != nil {
783 panic("Finder failed to serialize its parameters")
784 }
785 cacheParamString := string(cacheParamBytes)
786 currentParamString := string(currentParamBytes)
787 if cacheParamString != currentParamString {
788 f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString)
789 return false
790 }
791 return true
792}
793
794// loadBytes compares the cache info in <data> to the state of the filesystem
795// loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked
796func (f *Finder) loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) {
797
798 helperStartTime := time.Now()
799
800 cachedNodes, err := f.parseCacheEntry(data)
801 if err != nil {
802 return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error())
803 }
804
805 unmarshalDate := time.Now()
806 f.verbosef("Unmarshaled %v objects for %v in %v\n",
807 len(cachedNodes), id, unmarshalDate.Sub(helperStartTime))
808
809 tempMap := newPathMap("/")
810 stats := make([]statResponse, len(cachedNodes))
811
812 for i, node := range cachedNodes {
813 // check the file system for an updated timestamp
814 stats[i] = f.statDirSync(node.Path)
815 }
816
817 dirsToWalk = []string{}
818 for i, cachedNode := range cachedNodes {
819 updated := stats[i]
820 // save the cached value
821 container := tempMap.GetNode(cachedNode.Path, true)
822 container.mapNode = mapNode{statResponse: updated}
823
824 // if the metadata changed and the directory still exists, then
825 // make a note to walk it later
826 if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 {
827 f.setModified()
828 // make a note that the directory needs to be walked
829 dirsToWalk = append(dirsToWalk, cachedNode.Path)
830 } else {
831 container.mapNode.FileNames = cachedNode.FileNames
832 }
833 }
834 // count the number of nodes to improve our understanding of the shape of the tree,
835 // thereby improving parallelism of subsequent searches
836 tempMap.UpdateNumDescendentsRecursive()
837
838 f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate))
839 return tempMap, dirsToWalk, nil
840}
841
842// startFromExternalCache loads the cache database from disk
843// startFromExternalCache waits to return until the load of the cache db is complete, but
844// startFromExternalCache does not wait for every listDir() or statDir() request to complete
845func (f *Finder) startFromExternalCache() (err error) {
846 startTime := time.Now()
847 dbPath := f.DbPath
848
849 // open cache file and validate its header
850 reader, err := f.filesystem.Open(dbPath)
851 if err != nil {
852 return errors.New("No data to load from database\n")
853 }
854 defer reader.Close()
855 bufferedReader := bufio.NewReader(reader)
856 if !f.validateCacheHeader(bufferedReader) {
857 return errors.New("Cache header does not match")
858 }
859 f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath)
860
861 // read the file and spawn threads to process it
862 nodesToWalk := [][]*pathMap{}
863 mainTree := newPathMap("/")
864
865 // read the blocks and stream them into <blockChannel>
866 type dataBlock struct {
867 id int
868 err error
869 data []byte
870 }
871 blockChannel := make(chan dataBlock, f.numDbLoadingThreads)
872 readBlocks := func() {
873 index := 0
874 for {
875 // It takes some time to unmarshal the input from json, so we want
876 // to unmarshal it in parallel. In order to find valid places to
877 // break the input, we scan for the line separators that we inserted
878 // (for this purpose) when we dumped the database.
879 data, err := f.readLine(bufferedReader)
880 var response dataBlock
881 done := false
882 if err != nil && err != io.EOF {
883 response = dataBlock{id: index, err: err, data: nil}
884 done = true
885 } else {
886 done = (err == io.EOF)
887 response = dataBlock{id: index, err: nil, data: data}
888 }
889 blockChannel <- response
890 index++
891 duration := time.Since(startTime)
892 f.verbosef("Read block %v after %v\n", index, duration)
893 if done {
894 f.verbosef("Read %v blocks in %v\n", index, duration)
895 close(blockChannel)
896 return
897 }
898 }
899 }
900 go readBlocks()
901
902 // Read from <blockChannel> and stream the responses into <resultChannel>.
903 type workResponse struct {
904 id int
905 err error
906 tree *pathMap
907 updatedDirs []string
908 }
909 resultChannel := make(chan workResponse)
910 processBlocks := func() {
911 numProcessed := 0
912 threadPool := newThreadPool(f.numDbLoadingThreads)
913 for {
914 // get a block to process
915 block, received := <-blockChannel
916 if !received {
917 break
918 }
919
920 if block.err != nil {
921 resultChannel <- workResponse{err: block.err}
922 break
923 }
924 numProcessed++
925 // wait until there is CPU available to process it
926 threadPool.Run(
927 func() {
928 processStartTime := time.Now()
929 f.verbosef("Starting to process block %v after %v\n",
930 block.id, processStartTime.Sub(startTime))
931 tempMap, updatedDirs, err := f.loadBytes(block.id, block.data)
932 var response workResponse
933 if err != nil {
934 f.verbosef(
935 "Block %v failed to parse with error %v\n",
936 block.id, err)
937 response = workResponse{err: err}
938 } else {
939 response = workResponse{
940 id: block.id,
941 err: nil,
942 tree: tempMap,
943 updatedDirs: updatedDirs,
944 }
945 }
946 f.verbosef("Processed block %v in %v\n",
947 block.id, time.Since(processStartTime),
948 )
949 resultChannel <- response
950 },
951 )
952 }
953 threadPool.Wait()
954 f.verbosef("Finished processing %v blocks in %v\n",
955 numProcessed, time.Since(startTime))
956 close(resultChannel)
957 }
958 go processBlocks()
959
960 // Read from <resultChannel> and use the results
961 combineResults := func() (err error) {
962 for {
963 result, received := <-resultChannel
964 if !received {
965 break
966 }
967 if err != nil {
968 // In case of an error, wait for work to complete before
969 // returning the error. This ensures that any subsequent
970 // work doesn't need to compete for resources (and possibly
971 // fail due to, for example, a filesystem limit on the number of
972 // concurrently open files) with past work.
973 continue
974 }
975 if result.err != nil {
976 err = result.err
977 continue
978 }
979 // update main tree
980 mainTree.MergeIn(result.tree)
981 // record any new directories that we will need to Stat()
982 updatedNodes := make([]*pathMap, len(result.updatedDirs))
983 for j, dir := range result.updatedDirs {
984 node := mainTree.GetNode(dir, false)
985 updatedNodes[j] = node
986 }
987 nodesToWalk = append(nodesToWalk, updatedNodes)
988 }
989 return err
990 }
991 err = combineResults()
992 if err != nil {
993 return err
994 }
995
996 f.nodes = *mainTree
997
998 // after having loaded the entire db and therefore created entries for
999 // the directories we know of, now it's safe to start calling ReadDir on
1000 // any updated directories
1001 for i := range nodesToWalk {
1002 f.listDirsAsync(nodesToWalk[i])
1003 }
1004 f.verbosef("Loaded db and statted known dirs in %v\n", time.Since(startTime))
1005 f.threadPool.Wait()
1006 f.verbosef("Loaded db and statted all dirs in %v\n", time.Now().Sub(startTime))
1007
1008 return err
1009}
1010
1011// startWithoutExternalCache starts scanning the filesystem according to the cache config
1012// startWithoutExternalCache should be called if startFromExternalCache is not applicable
1013func (f *Finder) startWithoutExternalCache() {
1014 startTime := time.Now()
1015 configDirs := f.cacheMetadata.Config.RootDirs
1016
1017 // clean paths
1018 candidates := make([]string, len(configDirs))
1019 for i, dir := range configDirs {
1020 candidates[i] = filepath.Clean(dir)
1021 }
1022 // remove duplicates
1023 dirsToScan := make([]string, 0, len(configDirs))
1024 for _, candidate := range candidates {
1025 include := true
1026 for _, included := range dirsToScan {
1027 if included == "/" || strings.HasPrefix(candidate+"/", included+"/") {
1028 include = false
1029 break
1030 }
1031 }
1032 if include {
1033 dirsToScan = append(dirsToScan, candidate)
1034 }
1035 }
1036
1037 // start searching finally
1038 for _, path := range dirsToScan {
1039 f.verbosef("Starting find of %v\n", path)
1040 f.startFind(path)
1041 }
1042
1043 f.threadPool.Wait()
1044
1045 f.verbosef("Scanned filesystem (not using cache) in %v\n", time.Now().Sub(startTime))
1046}
1047
1048// isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid
1049func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) {
1050 if old.Inode != new.Inode {
1051 return false
1052 }
1053 if old.ModTime != new.ModTime {
1054 return false
1055 }
1056 if old.Device != new.Device {
1057 return false
1058 }
1059 return true
1060}
1061
1062func (f *Finder) wasModified() bool {
1063 return atomic.LoadInt32(&f.modifiedFlag) > 0
1064}
1065
1066func (f *Finder) setModified() {
1067 var newVal int32
1068 newVal = 1
1069 atomic.StoreInt32(&f.modifiedFlag, newVal)
1070}
1071
1072// sortedDirEntries exports directory entries to facilitate dumping them to the external cache
1073func (f *Finder) sortedDirEntries() []dirFullInfo {
1074 startTime := time.Now()
1075 nodes := make([]dirFullInfo, 0)
1076 for _, node := range f.nodes.DumpAll() {
1077 if node.ModTime != 0 {
1078 nodes = append(nodes, node)
1079 }
1080 }
1081 discoveryDate := time.Now()
1082 f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime))
1083 less := func(i int, j int) bool {
1084 return nodes[i].Path < nodes[j].Path
1085 }
1086 sort.Slice(nodes, less)
1087 sortDate := time.Now()
1088 f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate))
1089
1090 return nodes
1091}
1092
1093// serializeDb converts the cache database into a form to save to disk
1094func (f *Finder) serializeDb() ([]byte, error) {
1095 // sort dir entries
1096 var entryList = f.sortedDirEntries()
1097
1098 // Generate an output file that can be conveniently loaded using the same number of threads
1099 // as were used in this execution (because presumably that will be the number of threads
1100 // used in the next execution too)
1101
1102 // generate header
1103 header := []byte{}
1104 header = append(header, []byte(f.cacheMetadata.Version)...)
1105 header = append(header, lineSeparator)
1106 configDump, err := f.cacheMetadata.Config.Dump()
1107 if err != nil {
1108 return nil, err
1109 }
1110 header = append(header, configDump...)
1111
1112 // serialize individual blocks in parallel
1113 numBlocks := f.numDbLoadingThreads
1114 if numBlocks > len(entryList) {
1115 numBlocks = len(entryList)
1116 }
1117 blocks := make([][]byte, 1+numBlocks)
1118 blocks[0] = header
1119 blockMin := 0
1120 wg := sync.WaitGroup{}
1121 var errLock sync.Mutex
1122
1123 for i := 1; i <= numBlocks; i++ {
1124 // identify next block
1125 blockMax := len(entryList) * i / numBlocks
1126 block := entryList[blockMin:blockMax]
1127
1128 // process block
1129 wg.Add(1)
1130 go func(index int, block []dirFullInfo) {
1131 byteBlock, subErr := f.serializeCacheEntry(block)
1132 f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock))
1133 if subErr != nil {
1134 f.verbosef("%v\n", subErr.Error())
1135 errLock.Lock()
1136 err = subErr
1137 errLock.Unlock()
1138 } else {
1139 blocks[index] = byteBlock
1140 }
1141 wg.Done()
1142 }(i, block)
1143
1144 blockMin = blockMax
1145 }
1146
1147 wg.Wait()
1148
1149 if err != nil {
1150 return nil, err
1151 }
1152
1153 content := bytes.Join(blocks, []byte{lineSeparator})
1154
1155 return content, nil
1156}
1157
1158// dumpDb saves the cache database to disk
1159func (f *Finder) dumpDb() error {
1160 startTime := time.Now()
1161 f.verbosef("Dumping db\n")
1162
1163 tempPath := f.DbPath + ".tmp"
1164
1165 bytes, err := f.serializeDb()
1166 if err != nil {
1167 return err
1168 }
1169 serializeDate := time.Now()
1170 f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime))
1171 // dump file and atomically move
1172 err = f.filesystem.WriteFile(tempPath, bytes, 0777)
1173 if err != nil {
1174 return err
1175 }
1176 err = f.filesystem.Rename(tempPath, f.DbPath)
1177 if err != nil {
1178 return err
1179 }
1180
1181 f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate))
1182 return nil
1183
1184}
1185
1186// canIgnoreFsErr checks for certain classes of filesystem errors that are safe to ignore
1187func (f *Finder) canIgnoreFsErr(err error) bool {
1188 pathErr, isPathErr := err.(*os.PathError)
1189 if !isPathErr {
1190 // Don't recognize this error
1191 return false
1192 }
1193 if os.IsPermission(pathErr) {
1194 // Permission errors are ignored:
1195 // https://issuetracker.google.com/37553659
1196 // https://github.com/google/kati/pull/116
1197 return true
1198 }
1199 if pathErr.Err == os.ErrNotExist {
1200 // If a directory doesn't exist, that generally means the cache is out-of-date
1201 return true
1202 }
1203 // Don't recognize this error
1204 return false
1205}
1206
1207// onFsError should be called whenever a potentially fatal error is returned from a filesystem call
1208func (f *Finder) onFsError(path string, err error) {
1209 if !f.canIgnoreFsErr(err) {
1210 // We could send the errors through a channel instead, although that would cause this call
1211 // to block unless we preallocated a sufficient buffer or spawned a reader thread.
1212 // Although it wouldn't be too complicated to spawn a reader thread, it's still slightly
1213 // more convenient to use a lock. Only in an unusual situation should this code be
1214 // invoked anyway.
1215 f.errlock.Lock()
1216 f.fsErrs = append(f.fsErrs, fsErr{path: path, err: err})
1217 f.errlock.Unlock()
1218 }
1219}
1220
1221// discardErrsForPrunedPaths removes any errors for paths that are no longer included in the cache
1222func (f *Finder) discardErrsForPrunedPaths() {
1223 // This function could be somewhat inefficient due to being single-threaded,
1224 // but the length of f.fsErrs should be approximately 0, so it shouldn't take long anyway.
1225 relevantErrs := make([]fsErr, 0, len(f.fsErrs))
1226 for _, fsErr := range f.fsErrs {
1227 path := fsErr.path
1228 node := f.nodes.GetNode(path, false)
1229 if node != nil {
1230 // The path in question wasn't pruned due to a failure to process a parent directory.
1231 // So, the failure to process this path is important
1232 relevantErrs = append(relevantErrs, fsErr)
1233 }
1234 }
1235 f.fsErrs = relevantErrs
1236}
1237
1238// getErr returns an error based on previous calls to onFsErr, if any
1239func (f *Finder) getErr() error {
1240 f.discardErrsForPrunedPaths()
1241
1242 numErrs := len(f.fsErrs)
1243 if numErrs < 1 {
1244 return nil
1245 }
1246
1247 maxNumErrsToInclude := 10
1248 message := ""
1249 if numErrs > maxNumErrsToInclude {
1250 message = fmt.Sprintf("finder encountered %v errors: %v...", numErrs, f.fsErrs[:maxNumErrsToInclude])
1251 } else {
1252 message = fmt.Sprintf("finder encountered %v errors: %v", numErrs, f.fsErrs)
1253 }
1254
1255 return errors.New(message)
1256}
1257
1258func (f *Finder) statDirAsync(dir *pathMap) {
1259 node := dir
1260 path := dir.path
1261 f.threadPool.Run(
1262 func() {
1263 updatedStats := f.statDirSync(path)
1264
1265 if !f.isInfoUpToDate(node.statResponse, updatedStats) {
1266 node.mapNode = mapNode{
1267 statResponse: updatedStats,
1268 FileNames: []string{},
1269 }
1270 f.setModified()
1271 if node.statResponse.ModTime != 0 {
1272 // modification time was updated, so re-scan for
1273 // child directories
1274 f.listDirAsync(dir)
1275 }
1276 }
1277 },
1278 )
1279}
1280
1281func (f *Finder) statDirSync(path string) statResponse {
1282
1283 fileInfo, err := f.filesystem.Lstat(path)
1284
1285 var stats statResponse
1286 if err != nil {
1287 // possibly record this error
1288 f.onFsError(path, err)
1289 // in case of a failure to stat the directory, treat the directory as missing (modTime = 0)
1290 return stats
1291 }
1292 modTime := fileInfo.ModTime()
1293 stats = statResponse{}
1294 inode, err := f.filesystem.InodeNumber(fileInfo)
1295 if err != nil {
1296 panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error()))
1297 }
1298 stats.Inode = inode
1299 device, err := f.filesystem.DeviceNumber(fileInfo)
1300 if err != nil {
1301 panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error()))
1302 }
1303 stats.Device = device
1304 permissionsChangeTime, err := f.filesystem.PermTime(fileInfo)
1305
1306 if err != nil {
1307 panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error()))
1308 }
1309 // We're only interested in knowing whether anything about the directory
1310 // has changed since last check, so we use the latest of the two
1311 // modification times (content modification (mtime) and
1312 // permission modification (ctime))
1313 if permissionsChangeTime.After(modTime) {
1314 modTime = permissionsChangeTime
1315 }
1316 stats.ModTime = modTime.UnixNano()
1317
1318 return stats
1319}
1320
1321func (f *Finder) shouldIncludeFile(fileName string) bool {
1322 for _, includedName := range f.cacheMetadata.Config.IncludeFiles {
1323 if fileName == includedName {
1324 return true
1325 }
1326 }
1327 for _, includeSuffix := range f.cacheMetadata.Config.IncludeSuffixes {
1328 if strings.HasSuffix(fileName, includeSuffix) {
1329 return true
1330 }
1331 }
1332 return false
1333}
1334
1335// pruneCacheCandidates removes the items that we don't want to include in our persistent cache
1336func (f *Finder) pruneCacheCandidates(items *DirEntries) {
1337
1338 for _, fileName := range items.FileNames {
1339 for _, abortedName := range f.cacheMetadata.Config.PruneFiles {
1340 if fileName == abortedName {
1341 items.FileNames = []string{}
1342 items.DirNames = []string{}
1343 return
1344 }
1345 }
1346 }
1347
1348 // remove any files that aren't the ones we want to include
1349 writeIndex := 0
1350 for _, fileName := range items.FileNames {
1351 if f.shouldIncludeFile(fileName) {
1352 items.FileNames[writeIndex] = fileName
1353 writeIndex++
1354 }
1355 }
1356 // resize
1357 items.FileNames = items.FileNames[:writeIndex]
1358
1359 writeIndex = 0
1360 for _, dirName := range items.DirNames {
1361 items.DirNames[writeIndex] = dirName
1362 // ignore other dirs that are known to not be inputs to the build process
1363 include := true
1364 for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs {
1365 if dirName == excludedName {
1366 // don't include
1367 include = false
1368 break
1369 }
1370 }
1371 if include {
1372 writeIndex++
1373 }
1374 }
1375 // resize
1376 items.DirNames = items.DirNames[:writeIndex]
1377}
1378
1379func (f *Finder) listDirsAsync(nodes []*pathMap) {
1380 f.threadPool.Run(
1381 func() {
1382 for i := range nodes {
1383 f.listDirSync(nodes[i])
1384 }
1385 },
1386 )
1387}
1388
1389func (f *Finder) listDirAsync(node *pathMap) {
1390 f.threadPool.Run(
1391 func() {
1392 f.listDirSync(node)
1393 },
1394 )
1395}
1396
1397func (f *Finder) listDirSync(dir *pathMap) {
1398 path := dir.path
1399 children, err := f.filesystem.ReadDir(path)
1400
1401 if err != nil {
1402 // possibly record this error
1403 f.onFsError(path, err)
1404 // if listing the contents of the directory fails (presumably due to
1405 // permission denied), then treat the directory as empty
1406 children = nil
1407 }
1408
1409 var subdirs []string
1410 var subfiles []string
1411
1412 for _, child := range children {
1413 linkBits := child.Mode() & os.ModeSymlink
1414 isLink := linkBits != 0
1415 if isLink {
1416 childPath := filepath.Join(path, child.Name())
1417 childStat, err := f.filesystem.Stat(childPath)
1418 if err != nil {
1419 // If stat fails this is probably a broken or dangling symlink, treat it as a file.
1420 subfiles = append(subfiles, child.Name())
1421 } else if childStat.IsDir() {
1422 // Skip symlink dirs if not requested otherwise. Android has a number
1423 // of symlinks creating infinite source trees which would otherwise get
1424 // us in an infinite loop.
1425 // TODO(b/197349722): Revisit this once symlink loops are banned in the
1426 // source tree.
1427 if f.cacheMetadata.Config.FollowSymlinks {
1428 subdirs = append(subdirs, child.Name())
1429 }
1430 } else {
1431 // We do have to support symlink files because the link name might be
1432 // different than the target name
1433 // (for example, Android.bp -> build/soong/root.bp)
1434 subfiles = append(subfiles, child.Name())
1435 }
1436 } else if child.IsDir() {
1437 subdirs = append(subdirs, child.Name())
1438 } else {
1439 subfiles = append(subfiles, child.Name())
1440 }
1441
1442 }
1443 parentNode := dir
1444
1445 entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles}
1446 f.pruneCacheCandidates(entry)
1447
1448 // create a pathMap node for each relevant subdirectory
1449 relevantChildren := map[string]*pathMap{}
1450 for _, subdirName := range entry.DirNames {
1451 childNode, found := parentNode.children[subdirName]
1452 // if we already knew of this directory, then we already have a request pending to Stat it
1453 // if we didn't already know of this directory, then we must Stat it now
1454 if !found {
1455 childNode = parentNode.newChild(subdirName)
1456 f.statDirAsync(childNode)
1457 }
1458 relevantChildren[subdirName] = childNode
1459 }
1460 // Note that in rare cases, it's possible that we're reducing the set of
1461 // children via this statement, if these are all true:
1462 // 1. we previously had a cache that knew about subdirectories of parentNode
1463 // 2. the user created a prune-file (described in pruneCacheCandidates)
1464 // inside <parentNode>, which specifies that the contents of parentNode
1465 // are to be ignored.
1466 // The fact that it's possible to remove children here means that *pathMap structs
1467 // must not be looked up from f.nodes by filepath (and instead must be accessed by
1468 // direct pointer) until after every listDirSync completes
1469 parentNode.FileNames = entry.FileNames
1470 parentNode.children = relevantChildren
1471
1472}
1473
1474// listMatches takes a node and a function that specifies which subdirectories and
1475// files to include, and listMatches returns the matches
1476func (f *Finder) listMatches(node *pathMap,
1477 filter WalkFunc) (subDirs []*pathMap, filePaths []string) {
1478 entries := DirEntries{
1479 FileNames: node.FileNames,
1480 }
1481 entries.DirNames = make([]string, 0, len(node.children))
1482 for childName := range node.children {
1483 entries.DirNames = append(entries.DirNames, childName)
1484 }
1485
1486 dirNames, fileNames := filter(entries)
1487
1488 subDirs = []*pathMap{}
1489 filePaths = make([]string, 0, len(fileNames))
1490 for _, fileName := range fileNames {
1491 filePaths = append(filePaths, joinCleanPaths(node.path, fileName))
1492 }
1493 subDirs = make([]*pathMap, 0, len(dirNames))
1494 for _, childName := range dirNames {
1495 child, ok := node.children[childName]
1496 if ok {
1497 subDirs = append(subDirs, child)
1498 }
1499 }
1500
1501 return subDirs, filePaths
1502}
1503
1504// findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache.
1505func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc,
1506 approxNumThreads int) []string {
1507
1508 if approxNumThreads < 2 {
1509 // Done spawning threads; process remaining directories
1510 return f.findInCacheSinglethreaded(node, filter)
1511 }
1512
1513 totalWork := 0
1514 for _, child := range node.children {
1515 totalWork += child.approximateNumDescendents
1516 }
1517 childrenResults := make(chan []string, len(node.children))
1518
1519 subDirs, filePaths := f.listMatches(node, filter)
1520
1521 // process child directories
1522 for _, child := range subDirs {
1523 numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork
1524 childProcessor := func(child *pathMap) {
1525 childResults := f.findInCacheMultithreaded(child, filter, numChildThreads)
1526 childrenResults <- childResults
1527 }
1528 // If we're allowed to use more than 1 thread to process this directory,
1529 // then instead we use 1 thread for each subdirectory.
1530 // It would be strange to spawn threads for only some subdirectories.
1531 go childProcessor(child)
1532 }
1533
1534 // collect results
1535 for i := 0; i < len(subDirs); i++ {
1536 childResults := <-childrenResults
1537 filePaths = append(filePaths, childResults...)
1538 }
1539 close(childrenResults)
1540
1541 return filePaths
1542}
1543
1544// findInCacheSinglethreaded synchronously searches the cache for all matching file paths
1545// note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive
1546func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string {
1547 if node == nil {
1548 return []string{}
1549 }
1550
1551 nodes := []*pathMap{node}
1552 matches := []string{}
1553
1554 for len(nodes) > 0 {
1555 currentNode := nodes[0]
1556 nodes = nodes[1:]
1557
1558 subDirs, filePaths := f.listMatches(currentNode, filter)
1559
1560 nodes = append(nodes, subDirs...)
1561
1562 matches = append(matches, filePaths...)
1563 }
1564 return matches
1565}