blob: b4834b16be7e63c2be6e3aeb62cc25f037a5a3ee [file] [log] [blame]
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001// Copyright 2017 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package finder
16
17import (
18 "bufio"
19 "bytes"
20 "encoding/json"
Jeff Gastond3119522017-08-22 14:11:15 -070021 "errors"
Jeff Gastonf1fd45e2017-08-09 18:25:28 -070022 "fmt"
23 "io"
24 "os"
25 "path/filepath"
26 "runtime"
27 "sort"
28 "strings"
29 "sync"
30 "sync/atomic"
31 "time"
32
Colin Cross8d6395c2017-12-21 15:46:01 -080033 "android/soong/finder/fs"
Jeff Gastonf1fd45e2017-08-09 18:25:28 -070034)
35
36// This file provides a Finder struct that can quickly search for files satisfying
37// certain criteria.
38// This Finder gets its speed partially from parallelism and partially from caching.
39// If a Stat call returns the same result as last time, then it means Finder
40// can skip the ReadDir call for that dir.
41
42// The primary data structure used by the finder is the field Finder.nodes ,
43// which is a tree of nodes of type *pathMap .
44// Each node represents a directory on disk, along with its stats, subdirectories,
45// and contained files.
46
47// The common use case for the Finder is that the caller creates a Finder and gives
48// it the same query that was given to it in the previous execution.
49// In this situation, the major events that take place are:
50// 1. The Finder begins to load its db
51// 2. The Finder begins to stat the directories mentioned in its db (using multiple threads)
52// Calling Stat on each of these directories is generally a large fraction of the total time
53// 3. The Finder begins to construct a separate tree of nodes in each of its threads
54// 4. The Finder merges the individual node trees into the main node tree
55// 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date
56// These ReadDir calls might prompt additional Stat calls, etc
57// 6. The Finder waits for all loading to complete
58// 7. The Finder searches the cache for files matching the user's query (using multiple threads)
59
60// These are the invariants regarding concurrency:
61// 1. The public methods of Finder are threadsafe.
62// The public methods are only performance-optimized for one caller at a time, however.
63// For the moment, multiple concurrent callers shouldn't expect any better performance than
64// multiple serial callers.
65// 2. While building the node tree, only one thread may ever access the <children> collection of a
66// *pathMap at once.
67// a) The thread that accesses the <children> collection is the thread that discovers the
68// children (by reading them from the cache or by having received a response to ReadDir).
69// 1) Consequently, the thread that discovers the children also spawns requests to stat
70// subdirs.
71// b) Consequently, while building the node tree, no thread may do a lookup of its
72// *pathMap via filepath because another thread may be adding children to the
73// <children> collection of an ancestor node. Additionally, in rare cases, another thread
74// may be removing children from an ancestor node if the children were only discovered to
75// be irrelevant after calling ReadDir (which happens if a prune-file was just added).
76// 3. No query will begin to be serviced until all loading (both reading the db
77// and scanning the filesystem) is complete.
78// Tests indicate that it only takes about 10% as long to search the in-memory cache as to
79// generate it, making this not a huge loss in performance.
80// 4. The parsing of the db and the initial setup of the pathMap tree must complete before
81// beginning to call listDirSync (because listDirSync can create new entries in the pathMap)
82
83// see cmd/finder.go or finder_test.go for usage examples
84
85// Update versionString whenever making a backwards-incompatible change to the cache file format
86const versionString = "Android finder version 1"
87
88// a CacheParams specifies which files and directories the user wishes be scanned and
89// potentially added to the cache
90type CacheParams struct {
91 // WorkingDirectory is used as a base for any relative file paths given to the Finder
92 WorkingDirectory string
93
94 // RootDirs are the root directories used to initiate the search
95 RootDirs []string
96
97 // ExcludeDirs are directory names that if encountered are removed from the search
98 ExcludeDirs []string
99
100 // PruneFiles are file names that if encountered prune their entire directory
101 // (including siblings)
102 PruneFiles []string
103
104 // IncludeFiles are file names to include as matches
105 IncludeFiles []string
Chris Parsonsa798d962020-10-12 23:44:08 -0400106
107 // IncludeSuffixes are filename suffixes to include as matches.
108 IncludeSuffixes []string
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700109}
110
111// a cacheConfig stores the inputs that determine what should be included in the cache
112type cacheConfig struct {
113 CacheParams
114
115 // FilesystemView is a unique identifier telling which parts of which file systems
116 // are readable by the Finder. In practice its value is essentially username@hostname.
117 // FilesystemView is set to ensure that a cache file copied to another host or
118 // found by another user doesn't inadvertently get reused.
119 FilesystemView string
120}
121
122func (p *cacheConfig) Dump() ([]byte, error) {
123 bytes, err := json.Marshal(p)
124 return bytes, err
125}
126
127// a cacheMetadata stores version information about the cache
128type cacheMetadata struct {
129 // The Version enables the Finder to determine whether it can even parse the file
130 // If the version changes, the entire cache file must be regenerated
131 Version string
132
133 // The CacheParams enables the Finder to determine whether the parameters match
134 // If the CacheParams change, the Finder can choose how much of the cache file to reuse
135 // (although in practice, the Finder will probably choose to ignore the entire file anyway)
136 Config cacheConfig
137}
138
139type Logger interface {
140 Output(calldepth int, s string) error
141}
142
143// the Finder is the main struct that callers will want to use
144type Finder struct {
145 // configuration
146 DbPath string
147 numDbLoadingThreads int
148 numSearchingThreads int
149 cacheMetadata cacheMetadata
150 logger Logger
151 filesystem fs.FileSystem
152
153 // temporary state
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700154 threadPool *threadPool
155 mutex sync.Mutex
156 fsErrs []fsErr
157 errlock sync.Mutex
158 shutdownWaitgroup sync.WaitGroup
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700159
160 // non-temporary state
161 modifiedFlag int32
162 nodes pathMap
163}
164
Jeff Gastond3119522017-08-22 14:11:15 -0700165var defaultNumThreads = runtime.NumCPU() * 2
166
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700167// New creates a new Finder for use
168func New(cacheParams CacheParams, filesystem fs.FileSystem,
Jeff Gastonb629e182017-08-14 16:49:18 -0700169 logger Logger, dbPath string) (f *Finder, err error) {
Jeff Gastond3119522017-08-22 14:11:15 -0700170 return newImpl(cacheParams, filesystem, logger, dbPath, defaultNumThreads)
171}
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700172
Jeff Gastond3119522017-08-22 14:11:15 -0700173// newImpl is like New but accepts more params
174func newImpl(cacheParams CacheParams, filesystem fs.FileSystem,
175 logger Logger, dbPath string, numThreads int) (f *Finder, err error) {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700176 numDbLoadingThreads := numThreads
177 numSearchingThreads := numThreads
178
179 metadata := cacheMetadata{
180 Version: versionString,
181 Config: cacheConfig{
182 CacheParams: cacheParams,
183 FilesystemView: filesystem.ViewId(),
184 },
185 }
186
Jeff Gastonb629e182017-08-14 16:49:18 -0700187 f = &Finder{
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700188 numDbLoadingThreads: numDbLoadingThreads,
189 numSearchingThreads: numSearchingThreads,
190 cacheMetadata: metadata,
191 logger: logger,
192 filesystem: filesystem,
193
194 nodes: *newPathMap("/"),
195 DbPath: dbPath,
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700196
197 shutdownWaitgroup: sync.WaitGroup{},
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700198 }
199
Jeff Gastonb629e182017-08-14 16:49:18 -0700200 f.loadFromFilesystem()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700201
Jeff Gastonb629e182017-08-14 16:49:18 -0700202 // check for any filesystem errors
203 err = f.getErr()
204 if err != nil {
205 return nil, err
206 }
207
208 // confirm that every path mentioned in the CacheConfig exists
209 for _, path := range cacheParams.RootDirs {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700210 if !filepath.IsAbs(path) {
211 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
212 }
Jeff Gastonb629e182017-08-14 16:49:18 -0700213 node := f.nodes.GetNode(filepath.Clean(path), false)
214 if node == nil || node.ModTime == 0 {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700215 return nil, fmt.Errorf("path %v was specified to be included in the cache but does not exist\n", path)
Jeff Gastonb629e182017-08-14 16:49:18 -0700216 }
217 }
218
219 return f, nil
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700220}
221
222// FindNamed searches for every cached file
223func (f *Finder) FindAll() []string {
224 return f.FindAt("/")
225}
226
227// FindNamed searches for every cached file under <rootDir>
228func (f *Finder) FindAt(rootDir string) []string {
229 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
230 return entries.DirNames, entries.FileNames
231 }
232 return f.FindMatching(rootDir, filter)
233}
234
235// FindNamed searches for every cached file named <fileName>
236func (f *Finder) FindNamed(fileName string) []string {
237 return f.FindNamedAt("/", fileName)
238}
239
240// FindNamedAt searches under <rootPath> for every file named <fileName>
241// The reason a caller might use FindNamedAt instead of FindNamed is if they want
242// to limit their search to a subset of the cache
243func (f *Finder) FindNamedAt(rootPath string, fileName string) []string {
244 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
245 matches := []string{}
246 for _, foundName := range entries.FileNames {
247 if foundName == fileName {
248 matches = append(matches, foundName)
249 }
250 }
251 return entries.DirNames, matches
252
253 }
254 return f.FindMatching(rootPath, filter)
255}
256
257// FindFirstNamed searches for every file named <fileName>
258// Whenever it finds a match, it stops search subdirectories
259func (f *Finder) FindFirstNamed(fileName string) []string {
260 return f.FindFirstNamedAt("/", fileName)
261}
262
263// FindFirstNamedAt searches for every file named <fileName>
264// Whenever it finds a match, it stops search subdirectories
265func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string {
266 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
267 matches := []string{}
268 for _, foundName := range entries.FileNames {
269 if foundName == fileName {
270 matches = append(matches, foundName)
271 }
272 }
273
274 if len(matches) > 0 {
275 return []string{}, matches
276 }
277 return entries.DirNames, matches
278 }
279 return f.FindMatching(rootPath, filter)
280}
281
282// FindMatching is the most general exported function for searching for files in the cache
283// The WalkFunc will be invoked repeatedly and is expected to modify the provided DirEntries
284// in place, removing file paths and directories as desired.
285// WalkFunc will be invoked potentially many times in parallel, and must be threadsafe.
286func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string {
287 // set up some parameters
288 scanStart := time.Now()
289 var isRel bool
290 workingDir := f.cacheMetadata.Config.WorkingDirectory
291
292 isRel = !filepath.IsAbs(rootPath)
293 if isRel {
294 rootPath = filepath.Join(workingDir, rootPath)
295 }
296
297 rootPath = filepath.Clean(rootPath)
298
299 // ensure nothing else is using the Finder
300 f.verbosef("FindMatching waiting for finder to be idle\n")
301 f.lock()
302 defer f.unlock()
303
304 node := f.nodes.GetNode(rootPath, false)
305 if node == nil {
306 f.verbosef("No data for path %v ; apparently not included in cache params: %v\n",
307 rootPath, f.cacheMetadata.Config.CacheParams)
308 // path is not found; don't do a search
309 return []string{}
310 }
311
312 // search for matching files
313 f.verbosef("Finder finding %v using cache\n", rootPath)
314 results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads)
315
316 // format and return results
317 if isRel {
318 for i := 0; i < len(results); i++ {
319 results[i] = strings.Replace(results[i], workingDir+"/", "", 1)
320 }
321 }
322 sort.Strings(results)
323 f.verbosef("Found %v files under %v in %v using cache\n",
324 len(results), rootPath, time.Since(scanStart))
325 return results
326}
327
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700328// Shutdown declares that the finder is no longer needed and waits for its cleanup to complete
329// Currently, that only entails waiting for the database dump to complete.
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700330func (f *Finder) Shutdown() {
Colin Cross96e5e412020-07-06 17:13:43 -0700331 f.WaitForDbDump()
332}
333
334// WaitForDbDump returns once the database has been written to f.DbPath.
335func (f *Finder) WaitForDbDump() {
336 f.shutdownWaitgroup.Wait()
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700337}
338
339// End of public api
340
341func (f *Finder) goDumpDb() {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700342 if f.wasModified() {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700343 f.shutdownWaitgroup.Add(1)
344 go func() {
345 err := f.dumpDb()
346 if err != nil {
347 f.verbosef("%v\n", err)
348 }
349 f.shutdownWaitgroup.Done()
350 }()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700351 } else {
352 f.verbosef("Skipping dumping unmodified db\n")
353 }
354}
355
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700356// joinCleanPaths is like filepath.Join but is faster because
357// joinCleanPaths doesn't have to support paths ending in "/" or containing ".."
358func joinCleanPaths(base string, leaf string) string {
359 if base == "" {
360 return leaf
361 }
362 if base == "/" {
363 return base + leaf
364 }
365 if leaf == "" {
366 return base
367 }
368 return base + "/" + leaf
369}
370
371func (f *Finder) verbosef(format string, args ...interface{}) {
372 f.logger.Output(2, fmt.Sprintf(format, args...))
373}
374
375// loadFromFilesystem populates the in-memory cache based on the contents of the filesystem
376func (f *Finder) loadFromFilesystem() {
377 f.threadPool = newThreadPool(f.numDbLoadingThreads)
378
379 err := f.startFromExternalCache()
380 if err != nil {
381 f.startWithoutExternalCache()
382 }
383
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700384 f.goDumpDb()
385
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700386 f.threadPool = nil
387}
388
389func (f *Finder) startFind(path string) {
390 if !filepath.IsAbs(path) {
391 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
392 }
393 node := f.nodes.GetNode(path, true)
394 f.statDirAsync(node)
395}
396
397func (f *Finder) lock() {
398 f.mutex.Lock()
399}
400
401func (f *Finder) unlock() {
402 f.mutex.Unlock()
403}
404
405// a statResponse is the relevant portion of the response from the filesystem to a Stat call
406type statResponse struct {
407 ModTime int64
408 Inode uint64
409 Device uint64
410}
411
412// a pathAndStats stores a path and its stats
413type pathAndStats struct {
414 statResponse
415
416 Path string
417}
418
419// a dirFullInfo stores all of the relevant information we know about a directory
420type dirFullInfo struct {
421 pathAndStats
422
423 FileNames []string
424}
425
426// a PersistedDirInfo is the information about a dir that we save to our cache on disk
427type PersistedDirInfo struct {
428 // These field names are short because they are repeated many times in the output json file
429 P string // path
430 T int64 // modification time
431 I uint64 // inode number
432 F []string // relevant filenames contained
433}
434
435// a PersistedDirs is the information that we persist for a group of dirs
436type PersistedDirs struct {
437 // the device on which each directory is stored
438 Device uint64
439 // the common root path to which all contained dirs are relative
440 Root string
441 // the directories themselves
442 Dirs []PersistedDirInfo
443}
444
445// a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time
446type CacheEntry []PersistedDirs
447
448// a DirEntries lists the files and directories contained directly within a specific directory
449type DirEntries struct {
450 Path string
451
452 // elements of DirNames are just the dir names; they don't include any '/' character
453 DirNames []string
454 // elements of FileNames are just the file names; they don't include '/' character
455 FileNames []string
456}
457
458// a WalkFunc is the type that is passed into various Find functions for determining which
459// directories the caller wishes be walked. The WalkFunc is expected to decide which
460// directories to walk and which files to consider as matches to the original query.
461type WalkFunc func(DirEntries) (dirs []string, files []string)
462
463// a mapNode stores the relevant stats about a directory to be stored in a pathMap
464type mapNode struct {
465 statResponse
466 FileNames []string
467}
468
469// a pathMap implements the directory tree structure of nodes
470type pathMap struct {
471 mapNode
472
473 path string
474
475 children map[string]*pathMap
476
477 // number of descendent nodes, including self
478 approximateNumDescendents int
479}
480
481func newPathMap(path string) *pathMap {
482 result := &pathMap{path: path, children: make(map[string]*pathMap, 4),
483 approximateNumDescendents: 1}
484 return result
485}
486
487// GetNode returns the node at <path>
488func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap {
489 if len(path) > 0 && path[0] == '/' {
490 path = path[1:]
491 }
492
493 node := m
494 for {
495 if path == "" {
496 return node
497 }
498
499 index := strings.Index(path, "/")
500 var firstComponent string
501 if index >= 0 {
502 firstComponent = path[:index]
503 path = path[index+1:]
504 } else {
505 firstComponent = path
506 path = ""
507 }
508
509 child, found := node.children[firstComponent]
510
511 if !found {
512 if createIfNotFound {
513 child = node.newChild(firstComponent)
514 } else {
515 return nil
516 }
517 }
518
519 node = child
520 }
521}
522
523func (m *pathMap) newChild(name string) (child *pathMap) {
524 path := joinCleanPaths(m.path, name)
525 newChild := newPathMap(path)
526 m.children[name] = newChild
527
528 return m.children[name]
529}
530
531func (m *pathMap) UpdateNumDescendents() int {
532 count := 1
533 for _, child := range m.children {
534 count += child.approximateNumDescendents
535 }
536 m.approximateNumDescendents = count
537 return count
538}
539
540func (m *pathMap) UpdateNumDescendentsRecursive() {
541 for _, child := range m.children {
542 child.UpdateNumDescendentsRecursive()
543 }
544 m.UpdateNumDescendents()
545}
546
547func (m *pathMap) MergeIn(other *pathMap) {
548 for key, theirs := range other.children {
549 ours, found := m.children[key]
550 if found {
551 ours.MergeIn(theirs)
552 } else {
553 m.children[key] = theirs
554 }
555 }
556 if other.ModTime != 0 {
557 m.mapNode = other.mapNode
558 }
559 m.UpdateNumDescendents()
560}
561
562func (m *pathMap) DumpAll() []dirFullInfo {
563 results := []dirFullInfo{}
564 m.dumpInto("", &results)
565 return results
566}
567
568func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) {
569 *results = append(*results,
570 dirFullInfo{
571 pathAndStats{statResponse: m.statResponse, Path: path},
572 m.FileNames},
573 )
574 for key, child := range m.children {
575 childPath := joinCleanPaths(path, key)
576 if len(childPath) == 0 || childPath[0] != '/' {
577 childPath = "/" + childPath
578 }
579 child.dumpInto(childPath, results)
580 }
581}
582
583// a semaphore can be locked by up to <capacity> callers at once
584type semaphore struct {
585 pool chan bool
586}
587
588func newSemaphore(capacity int) *semaphore {
589 return &semaphore{pool: make(chan bool, capacity)}
590}
591
592func (l *semaphore) Lock() {
593 l.pool <- true
594}
595
596func (l *semaphore) Unlock() {
597 <-l.pool
598}
599
600// A threadPool runs goroutines and supports throttling and waiting.
601// Without throttling, Go may exhaust the maximum number of various resources, such as
602// threads or file descriptors, and crash the program.
603type threadPool struct {
604 receivedRequests sync.WaitGroup
605 activeRequests semaphore
606}
607
608func newThreadPool(maxNumConcurrentThreads int) *threadPool {
609 return &threadPool{
610 receivedRequests: sync.WaitGroup{},
611 activeRequests: *newSemaphore(maxNumConcurrentThreads),
612 }
613}
614
615// Run requests to run the given function in its own goroutine
616func (p *threadPool) Run(function func()) {
617 p.receivedRequests.Add(1)
618 // If Run() was called from within a goroutine spawned by this threadPool,
619 // then we may need to return from Run() before having capacity to actually
620 // run <function>.
621 //
622 // It's possible that the body of <function> contains a statement (such as a syscall)
623 // that will cause Go to pin it to a thread, or will contain a statement that uses
624 // another resource that is in short supply (such as a file descriptor), so we can't
625 // actually run <function> until we have capacity.
626 //
627 // However, the semaphore used for synchronization is implemented via a channel and
628 // shouldn't require a new thread for each access.
629 go func() {
630 p.activeRequests.Lock()
631 function()
632 p.activeRequests.Unlock()
633 p.receivedRequests.Done()
634 }()
635}
636
637// Wait waits until all goroutines are done, just like sync.WaitGroup's Wait
638func (p *threadPool) Wait() {
639 p.receivedRequests.Wait()
640}
641
Jeff Gastonb629e182017-08-14 16:49:18 -0700642type fsErr struct {
643 path string
644 err error
645}
646
647func (e fsErr) String() string {
648 return e.path + ": " + e.err.Error()
649}
650
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700651func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) {
652 // group each dirFullInfo by its Device, to avoid having to repeat it in the output
653 dirsByDevice := map[uint64][]PersistedDirInfo{}
654 for _, entry := range dirInfos {
655 _, found := dirsByDevice[entry.Device]
656 if !found {
657 dirsByDevice[entry.Device] = []PersistedDirInfo{}
658 }
659 dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device],
660 PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames})
661 }
662
663 cacheEntry := CacheEntry{}
664
665 for device, infos := range dirsByDevice {
666 // find common prefix
667 prefix := ""
668 if len(infos) > 0 {
669 prefix = infos[0].P
670 }
671 for _, info := range infos {
672 for !strings.HasPrefix(info.P+"/", prefix+"/") {
673 prefix = filepath.Dir(prefix)
Jeff Gaston996716a2017-08-22 13:33:19 -0700674 if prefix == "/" {
675 break
676 }
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700677 }
678 }
679 // remove common prefix
680 for i := range infos {
681 suffix := strings.Replace(infos[i].P, prefix, "", 1)
682 if len(suffix) > 0 && suffix[0] == '/' {
683 suffix = suffix[1:]
684 }
685 infos[i].P = suffix
686 }
687
688 // turn the map (keyed by device) into a list of structs with labeled fields
689 // this is to improve readability of the output
690 cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos})
691 }
692
693 // convert to json.
694 // it would save some space to use a different format than json for the db file,
695 // but the space and time savings are small, and json is easy for humans to read
696 bytes, err := json.Marshal(cacheEntry)
697 return bytes, err
698}
699
700func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) {
701 var cacheEntry CacheEntry
702 err := json.Unmarshal(bytes, &cacheEntry)
703 if err != nil {
704 return nil, err
705 }
706
707 // convert from a CacheEntry to a []dirFullInfo (by copying a few fields)
708 capacity := 0
709 for _, element := range cacheEntry {
710 capacity += len(element.Dirs)
711 }
712 nodes := make([]dirFullInfo, capacity)
713 count := 0
714 for _, element := range cacheEntry {
715 for _, dir := range element.Dirs {
716 path := joinCleanPaths(element.Root, dir.P)
717
718 nodes[count] = dirFullInfo{
719 pathAndStats: pathAndStats{
720 statResponse: statResponse{
721 ModTime: dir.T, Inode: dir.I, Device: element.Device,
722 },
723 Path: path},
724 FileNames: dir.F}
725 count++
726 }
727 }
728 return nodes, nil
729}
730
731// We use the following separator byte to distinguish individually parseable blocks of json
732// because we know this separator won't appear in the json that we're parsing.
733//
734// The newline byte can only appear in a UTF-8 stream if the newline character appears, because:
735// - The newline character is encoded as "0000 1010" in binary ("0a" in hex)
736// - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte
737// character.
738//
739// We know that the newline character will never appear in our json string, because:
740// - If a newline character appears as part of a data string, then json encoding will
741// emit two characters instead: '\' and 'n'.
742// - The json encoder that we use doesn't emit the optional newlines between any of its
743// other outputs.
744const lineSeparator = byte('\n')
745
746func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) {
747 return reader.ReadBytes(lineSeparator)
748}
749
750// validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder
751func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool {
752 cacheVersionBytes, err := f.readLine(cacheReader)
753 if err != nil {
754 f.verbosef("Failed to read database header; database is invalid\n")
755 return false
756 }
757 if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator {
758 cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1]
759 }
760 cacheVersionString := string(cacheVersionBytes)
761 currentVersion := f.cacheMetadata.Version
762 if cacheVersionString != currentVersion {
763 f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion)
764 return false
765 }
766
767 cacheParamBytes, err := f.readLine(cacheReader)
768 if err != nil {
769 f.verbosef("Failed to read database search params; database is invalid\n")
770 return false
771 }
772
773 if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator {
774 cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1]
775 }
776
777 currentParamBytes, err := f.cacheMetadata.Config.Dump()
778 if err != nil {
779 panic("Finder failed to serialize its parameters")
780 }
781 cacheParamString := string(cacheParamBytes)
782 currentParamString := string(currentParamBytes)
783 if cacheParamString != currentParamString {
784 f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString)
785 return false
786 }
787 return true
788}
789
790// loadBytes compares the cache info in <data> to the state of the filesystem
791// loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked
792func (f *Finder) loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) {
793
794 helperStartTime := time.Now()
795
796 cachedNodes, err := f.parseCacheEntry(data)
797 if err != nil {
798 return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error())
799 }
800
801 unmarshalDate := time.Now()
802 f.verbosef("Unmarshaled %v objects for %v in %v\n",
803 len(cachedNodes), id, unmarshalDate.Sub(helperStartTime))
804
805 tempMap := newPathMap("/")
806 stats := make([]statResponse, len(cachedNodes))
807
808 for i, node := range cachedNodes {
809 // check the file system for an updated timestamp
810 stats[i] = f.statDirSync(node.Path)
811 }
812
813 dirsToWalk = []string{}
814 for i, cachedNode := range cachedNodes {
815 updated := stats[i]
816 // save the cached value
817 container := tempMap.GetNode(cachedNode.Path, true)
818 container.mapNode = mapNode{statResponse: updated}
819
820 // if the metadata changed and the directory still exists, then
821 // make a note to walk it later
822 if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 {
823 f.setModified()
824 // make a note that the directory needs to be walked
825 dirsToWalk = append(dirsToWalk, cachedNode.Path)
826 } else {
827 container.mapNode.FileNames = cachedNode.FileNames
828 }
829 }
830 // count the number of nodes to improve our understanding of the shape of the tree,
831 // thereby improving parallelism of subsequent searches
832 tempMap.UpdateNumDescendentsRecursive()
833
834 f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate))
835 return tempMap, dirsToWalk, nil
836}
837
838// startFromExternalCache loads the cache database from disk
839// startFromExternalCache waits to return until the load of the cache db is complete, but
840// startFromExternalCache does not wait for all every listDir() or statDir() request to complete
841func (f *Finder) startFromExternalCache() (err error) {
842 startTime := time.Now()
843 dbPath := f.DbPath
844
845 // open cache file and validate its header
846 reader, err := f.filesystem.Open(dbPath)
847 if err != nil {
848 return errors.New("No data to load from database\n")
849 }
Liz Kammer7fe24002022-02-07 13:18:15 -0500850 defer reader.Close()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700851 bufferedReader := bufio.NewReader(reader)
852 if !f.validateCacheHeader(bufferedReader) {
853 return errors.New("Cache header does not match")
854 }
855 f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath)
856
857 // read the file and spawn threads to process it
858 nodesToWalk := [][]*pathMap{}
859 mainTree := newPathMap("/")
860
861 // read the blocks and stream them into <blockChannel>
862 type dataBlock struct {
863 id int
864 err error
865 data []byte
866 }
867 blockChannel := make(chan dataBlock, f.numDbLoadingThreads)
868 readBlocks := func() {
869 index := 0
870 for {
871 // It takes some time to unmarshal the input from json, so we want
872 // to unmarshal it in parallel. In order to find valid places to
873 // break the input, we scan for the line separators that we inserted
874 // (for this purpose) when we dumped the database.
875 data, err := f.readLine(bufferedReader)
876 var response dataBlock
877 done := false
878 if err != nil && err != io.EOF {
879 response = dataBlock{id: index, err: err, data: nil}
880 done = true
881 } else {
882 done = (err == io.EOF)
883 response = dataBlock{id: index, err: nil, data: data}
884 }
885 blockChannel <- response
886 index++
887 duration := time.Since(startTime)
888 f.verbosef("Read block %v after %v\n", index, duration)
889 if done {
890 f.verbosef("Read %v blocks in %v\n", index, duration)
891 close(blockChannel)
892 return
893 }
894 }
895 }
896 go readBlocks()
897
898 // Read from <blockChannel> and stream the responses into <resultChannel>.
899 type workResponse struct {
900 id int
901 err error
902 tree *pathMap
903 updatedDirs []string
904 }
905 resultChannel := make(chan workResponse)
906 processBlocks := func() {
907 numProcessed := 0
908 threadPool := newThreadPool(f.numDbLoadingThreads)
909 for {
910 // get a block to process
911 block, received := <-blockChannel
912 if !received {
913 break
914 }
915
916 if block.err != nil {
917 resultChannel <- workResponse{err: block.err}
918 break
919 }
920 numProcessed++
921 // wait until there is CPU available to process it
922 threadPool.Run(
923 func() {
924 processStartTime := time.Now()
925 f.verbosef("Starting to process block %v after %v\n",
926 block.id, processStartTime.Sub(startTime))
927 tempMap, updatedDirs, err := f.loadBytes(block.id, block.data)
928 var response workResponse
929 if err != nil {
930 f.verbosef(
931 "Block %v failed to parse with error %v\n",
932 block.id, err)
933 response = workResponse{err: err}
934 } else {
935 response = workResponse{
936 id: block.id,
937 err: nil,
938 tree: tempMap,
939 updatedDirs: updatedDirs,
940 }
941 }
942 f.verbosef("Processed block %v in %v\n",
943 block.id, time.Since(processStartTime),
944 )
945 resultChannel <- response
946 },
947 )
948 }
949 threadPool.Wait()
950 f.verbosef("Finished processing %v blocks in %v\n",
951 numProcessed, time.Since(startTime))
952 close(resultChannel)
953 }
954 go processBlocks()
955
956 // Read from <resultChannel> and use the results
957 combineResults := func() (err error) {
958 for {
959 result, received := <-resultChannel
960 if !received {
961 break
962 }
963 if err != nil {
964 // In case of an error, wait for work to complete before
965 // returning the error. This ensures that any subsequent
966 // work doesn't need to compete for resources (and possibly
967 // fail due to, for example, a filesystem limit on the number of
968 // concurrently open files) with past work.
969 continue
970 }
971 if result.err != nil {
972 err = result.err
973 continue
974 }
975 // update main tree
976 mainTree.MergeIn(result.tree)
977 // record any new directories that we will need to Stat()
978 updatedNodes := make([]*pathMap, len(result.updatedDirs))
979 for j, dir := range result.updatedDirs {
980 node := mainTree.GetNode(dir, false)
981 updatedNodes[j] = node
982 }
983 nodesToWalk = append(nodesToWalk, updatedNodes)
984 }
985 return err
986 }
987 err = combineResults()
988 if err != nil {
989 return err
990 }
991
992 f.nodes = *mainTree
993
994 // after having loaded the entire db and therefore created entries for
995 // the directories we know of, now it's safe to start calling ReadDir on
996 // any updated directories
997 for i := range nodesToWalk {
998 f.listDirsAsync(nodesToWalk[i])
999 }
Jeff Gastonb629e182017-08-14 16:49:18 -07001000 f.verbosef("Loaded db and statted known dirs in %v\n", time.Since(startTime))
1001 f.threadPool.Wait()
1002 f.verbosef("Loaded db and statted all dirs in %v\n", time.Now().Sub(startTime))
1003
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001004 return err
1005}
1006
1007// startWithoutExternalCache starts scanning the filesystem according to the cache config
1008// startWithoutExternalCache should be called if startFromExternalCache is not applicable
1009func (f *Finder) startWithoutExternalCache() {
Jeff Gastonb629e182017-08-14 16:49:18 -07001010 startTime := time.Now()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001011 configDirs := f.cacheMetadata.Config.RootDirs
1012
1013 // clean paths
1014 candidates := make([]string, len(configDirs))
1015 for i, dir := range configDirs {
1016 candidates[i] = filepath.Clean(dir)
1017 }
1018 // remove duplicates
1019 dirsToScan := make([]string, 0, len(configDirs))
1020 for _, candidate := range candidates {
1021 include := true
1022 for _, included := range dirsToScan {
1023 if included == "/" || strings.HasPrefix(candidate+"/", included+"/") {
1024 include = false
1025 break
1026 }
1027 }
1028 if include {
1029 dirsToScan = append(dirsToScan, candidate)
1030 }
1031 }
1032
1033 // start searching finally
1034 for _, path := range dirsToScan {
1035 f.verbosef("Starting find of %v\n", path)
1036 f.startFind(path)
1037 }
Jeff Gastonb629e182017-08-14 16:49:18 -07001038
1039 f.threadPool.Wait()
1040
1041 f.verbosef("Scanned filesystem (not using cache) in %v\n", time.Now().Sub(startTime))
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001042}
1043
1044// isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid
1045func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) {
1046 if old.Inode != new.Inode {
1047 return false
1048 }
1049 if old.ModTime != new.ModTime {
1050 return false
1051 }
1052 if old.Device != new.Device {
1053 return false
1054 }
1055 return true
1056}
1057
1058func (f *Finder) wasModified() bool {
1059 return atomic.LoadInt32(&f.modifiedFlag) > 0
1060}
1061
1062func (f *Finder) setModified() {
1063 var newVal int32
1064 newVal = 1
1065 atomic.StoreInt32(&f.modifiedFlag, newVal)
1066}
1067
1068// sortedDirEntries exports directory entries to facilitate dumping them to the external cache
1069func (f *Finder) sortedDirEntries() []dirFullInfo {
1070 startTime := time.Now()
1071 nodes := make([]dirFullInfo, 0)
1072 for _, node := range f.nodes.DumpAll() {
1073 if node.ModTime != 0 {
1074 nodes = append(nodes, node)
1075 }
1076 }
1077 discoveryDate := time.Now()
1078 f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime))
1079 less := func(i int, j int) bool {
1080 return nodes[i].Path < nodes[j].Path
1081 }
1082 sort.Slice(nodes, less)
1083 sortDate := time.Now()
1084 f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate))
1085
1086 return nodes
1087}
1088
1089// serializeDb converts the cache database into a form to save to disk
1090func (f *Finder) serializeDb() ([]byte, error) {
1091 // sort dir entries
1092 var entryList = f.sortedDirEntries()
1093
1094 // Generate an output file that can be conveniently loaded using the same number of threads
1095 // as were used in this execution (because presumably that will be the number of threads
1096 // used in the next execution too)
1097
1098 // generate header
1099 header := []byte{}
1100 header = append(header, []byte(f.cacheMetadata.Version)...)
1101 header = append(header, lineSeparator)
1102 configDump, err := f.cacheMetadata.Config.Dump()
1103 if err != nil {
1104 return nil, err
1105 }
1106 header = append(header, configDump...)
1107
1108 // serialize individual blocks in parallel
1109 numBlocks := f.numDbLoadingThreads
1110 if numBlocks > len(entryList) {
1111 numBlocks = len(entryList)
1112 }
1113 blocks := make([][]byte, 1+numBlocks)
1114 blocks[0] = header
1115 blockMin := 0
1116 wg := sync.WaitGroup{}
1117 var errLock sync.Mutex
1118
1119 for i := 1; i <= numBlocks; i++ {
1120 // identify next block
1121 blockMax := len(entryList) * i / numBlocks
1122 block := entryList[blockMin:blockMax]
1123
1124 // process block
1125 wg.Add(1)
1126 go func(index int, block []dirFullInfo) {
1127 byteBlock, subErr := f.serializeCacheEntry(block)
1128 f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock))
1129 if subErr != nil {
1130 f.verbosef("%v\n", subErr.Error())
1131 errLock.Lock()
1132 err = subErr
1133 errLock.Unlock()
1134 } else {
1135 blocks[index] = byteBlock
1136 }
1137 wg.Done()
1138 }(i, block)
1139
1140 blockMin = blockMax
1141 }
1142
1143 wg.Wait()
1144
1145 if err != nil {
1146 return nil, err
1147 }
1148
1149 content := bytes.Join(blocks, []byte{lineSeparator})
1150
1151 return content, nil
1152}
1153
1154// dumpDb saves the cache database to disk
1155func (f *Finder) dumpDb() error {
1156 startTime := time.Now()
1157 f.verbosef("Dumping db\n")
1158
1159 tempPath := f.DbPath + ".tmp"
1160
1161 bytes, err := f.serializeDb()
1162 if err != nil {
1163 return err
1164 }
1165 serializeDate := time.Now()
1166 f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime))
1167 // dump file and atomically move
1168 err = f.filesystem.WriteFile(tempPath, bytes, 0777)
1169 if err != nil {
1170 return err
1171 }
1172 err = f.filesystem.Rename(tempPath, f.DbPath)
1173 if err != nil {
1174 return err
1175 }
1176
1177 f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate))
1178 return nil
Jeff Gastonb629e182017-08-14 16:49:18 -07001179
1180}
1181
1182// canIgnoreFsErr checks for certain classes of filesystem errors that are safe to ignore
1183func (f *Finder) canIgnoreFsErr(err error) bool {
1184 pathErr, isPathErr := err.(*os.PathError)
1185 if !isPathErr {
1186 // Don't recognize this error
1187 return false
1188 }
Jeff Gaston821271b2017-08-23 17:53:38 -07001189 if os.IsPermission(pathErr) {
Jeff Gastonb629e182017-08-14 16:49:18 -07001190 // Permission errors are ignored:
1191 // https://issuetracker.google.com/37553659
1192 // https://github.com/google/kati/pull/116
1193 return true
1194 }
1195 if pathErr.Err == os.ErrNotExist {
1196 // If a directory doesn't exist, that generally means the cache is out-of-date
1197 return true
1198 }
1199 // Don't recognize this error
1200 return false
1201}
1202
1203// onFsError should be called whenever a potentially fatal error is returned from a filesystem call
1204func (f *Finder) onFsError(path string, err error) {
1205 if !f.canIgnoreFsErr(err) {
1206 // We could send the errors through a channel instead, although that would cause this call
1207 // to block unless we preallocated a sufficient buffer or spawned a reader thread.
1208 // Although it wouldn't be too complicated to spawn a reader thread, it's still slightly
1209 // more convenient to use a lock. Only in an unusual situation should this code be
1210 // invoked anyway.
1211 f.errlock.Lock()
1212 f.fsErrs = append(f.fsErrs, fsErr{path: path, err: err})
1213 f.errlock.Unlock()
1214 }
1215}
1216
1217// discardErrsForPrunedPaths removes any errors for paths that are no longer included in the cache
1218func (f *Finder) discardErrsForPrunedPaths() {
1219 // This function could be somewhat inefficient due to being single-threaded,
1220 // but the length of f.fsErrs should be approximately 0, so it shouldn't take long anyway.
1221 relevantErrs := make([]fsErr, 0, len(f.fsErrs))
1222 for _, fsErr := range f.fsErrs {
1223 path := fsErr.path
1224 node := f.nodes.GetNode(path, false)
1225 if node != nil {
1226 // The path in question wasn't pruned due to a failure to process a parent directory.
1227 // So, the failure to process this path is important
1228 relevantErrs = append(relevantErrs, fsErr)
1229 }
1230 }
1231 f.fsErrs = relevantErrs
1232}
1233
1234// getErr returns an error based on previous calls to onFsErr, if any
1235func (f *Finder) getErr() error {
1236 f.discardErrsForPrunedPaths()
1237
1238 numErrs := len(f.fsErrs)
1239 if numErrs < 1 {
1240 return nil
1241 }
1242
1243 maxNumErrsToInclude := 10
1244 message := ""
1245 if numErrs > maxNumErrsToInclude {
1246 message = fmt.Sprintf("finder encountered %v errors: %v...", numErrs, f.fsErrs[:maxNumErrsToInclude])
1247 } else {
1248 message = fmt.Sprintf("finder encountered %v errors: %v", numErrs, f.fsErrs)
1249 }
1250
1251 return errors.New(message)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001252}
1253
1254func (f *Finder) statDirAsync(dir *pathMap) {
1255 node := dir
1256 path := dir.path
1257 f.threadPool.Run(
1258 func() {
1259 updatedStats := f.statDirSync(path)
1260
1261 if !f.isInfoUpToDate(node.statResponse, updatedStats) {
1262 node.mapNode = mapNode{
1263 statResponse: updatedStats,
1264 FileNames: []string{},
1265 }
1266 f.setModified()
1267 if node.statResponse.ModTime != 0 {
1268 // modification time was updated, so re-scan for
1269 // child directories
1270 f.listDirAsync(dir)
1271 }
1272 }
1273 },
1274 )
1275}
1276
1277func (f *Finder) statDirSync(path string) statResponse {
1278
1279 fileInfo, err := f.filesystem.Lstat(path)
1280
1281 var stats statResponse
1282 if err != nil {
Jeff Gastonb629e182017-08-14 16:49:18 -07001283 // possibly record this error
1284 f.onFsError(path, err)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001285 // in case of a failure to stat the directory, treat the directory as missing (modTime = 0)
1286 return stats
1287 }
1288 modTime := fileInfo.ModTime()
1289 stats = statResponse{}
1290 inode, err := f.filesystem.InodeNumber(fileInfo)
1291 if err != nil {
1292 panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error()))
1293 }
1294 stats.Inode = inode
1295 device, err := f.filesystem.DeviceNumber(fileInfo)
1296 if err != nil {
1297 panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error()))
1298 }
1299 stats.Device = device
1300 permissionsChangeTime, err := f.filesystem.PermTime(fileInfo)
1301
1302 if err != nil {
1303 panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error()))
1304 }
1305 // We're only interested in knowing whether anything about the directory
1306 // has changed since last check, so we use the latest of the two
1307 // modification times (content modification (mtime) and
1308 // permission modification (ctime))
1309 if permissionsChangeTime.After(modTime) {
1310 modTime = permissionsChangeTime
1311 }
1312 stats.ModTime = modTime.UnixNano()
1313
1314 return stats
1315}
1316
Chris Parsonsa798d962020-10-12 23:44:08 -04001317func (f *Finder) shouldIncludeFile(fileName string) bool {
1318 for _, includedName := range f.cacheMetadata.Config.IncludeFiles {
1319 if fileName == includedName {
1320 return true
1321 }
1322 }
1323 for _, includeSuffix := range f.cacheMetadata.Config.IncludeSuffixes {
1324 if strings.HasSuffix(fileName, includeSuffix) {
1325 return true
1326 }
1327 }
1328 return false
1329}
1330
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001331// pruneCacheCandidates removes the items that we don't want to include in our persistent cache
1332func (f *Finder) pruneCacheCandidates(items *DirEntries) {
1333
1334 for _, fileName := range items.FileNames {
1335 for _, abortedName := range f.cacheMetadata.Config.PruneFiles {
1336 if fileName == abortedName {
1337 items.FileNames = []string{}
1338 items.DirNames = []string{}
1339 return
1340 }
1341 }
1342 }
1343
1344 // remove any files that aren't the ones we want to include
1345 writeIndex := 0
1346 for _, fileName := range items.FileNames {
Chris Parsonsa798d962020-10-12 23:44:08 -04001347 if f.shouldIncludeFile(fileName) {
1348 items.FileNames[writeIndex] = fileName
1349 writeIndex++
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001350 }
1351 }
1352 // resize
1353 items.FileNames = items.FileNames[:writeIndex]
1354
1355 writeIndex = 0
1356 for _, dirName := range items.DirNames {
1357 items.DirNames[writeIndex] = dirName
1358 // ignore other dirs that are known to not be inputs to the build process
1359 include := true
1360 for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs {
1361 if dirName == excludedName {
1362 // don't include
1363 include = false
1364 break
1365 }
1366 }
1367 if include {
1368 writeIndex++
1369 }
1370 }
1371 // resize
1372 items.DirNames = items.DirNames[:writeIndex]
1373}
1374
1375func (f *Finder) listDirsAsync(nodes []*pathMap) {
1376 f.threadPool.Run(
1377 func() {
1378 for i := range nodes {
1379 f.listDirSync(nodes[i])
1380 }
1381 },
1382 )
1383}
1384
1385func (f *Finder) listDirAsync(node *pathMap) {
1386 f.threadPool.Run(
1387 func() {
1388 f.listDirSync(node)
1389 },
1390 )
1391}
1392
1393func (f *Finder) listDirSync(dir *pathMap) {
1394 path := dir.path
1395 children, err := f.filesystem.ReadDir(path)
1396
1397 if err != nil {
Jeff Gastonb629e182017-08-14 16:49:18 -07001398 // possibly record this error
1399 f.onFsError(path, err)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001400 // if listing the contents of the directory fails (presumably due to
1401 // permission denied), then treat the directory as empty
Colin Crossa88c8832017-12-21 15:39:26 -08001402 children = nil
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001403 }
1404
1405 var subdirs []string
1406 var subfiles []string
1407
1408 for _, child := range children {
1409 linkBits := child.Mode() & os.ModeSymlink
1410 isLink := linkBits != 0
Colin Cross25fd7732020-06-29 23:11:55 -07001411 if isLink {
1412 childPath := filepath.Join(path, child.Name())
1413 childStat, err := f.filesystem.Stat(childPath)
1414 if err != nil {
1415 // If stat fails this is probably a broken or dangling symlink, treat it as a file.
1416 subfiles = append(subfiles, child.Name())
1417 } else if childStat.IsDir() {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001418 // Skip symlink dirs.
1419 // We don't have to support symlink dirs because
1420 // that would cause duplicates.
Colin Cross25fd7732020-06-29 23:11:55 -07001421 } else {
1422 // We do have to support symlink files because the link name might be
1423 // different than the target name
1424 // (for example, Android.bp -> build/soong/root.bp)
1425 subfiles = append(subfiles, child.Name())
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001426 }
Colin Cross25fd7732020-06-29 23:11:55 -07001427 } else if child.IsDir() {
1428 subdirs = append(subdirs, child.Name())
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001429 } else {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001430 subfiles = append(subfiles, child.Name())
1431 }
1432
1433 }
1434 parentNode := dir
1435
1436 entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles}
1437 f.pruneCacheCandidates(entry)
1438
1439 // create a pathMap node for each relevant subdirectory
1440 relevantChildren := map[string]*pathMap{}
1441 for _, subdirName := range entry.DirNames {
1442 childNode, found := parentNode.children[subdirName]
1443 // if we already knew of this directory, then we already have a request pending to Stat it
1444 // if we didn't already know of this directory, then we must Stat it now
1445 if !found {
1446 childNode = parentNode.newChild(subdirName)
1447 f.statDirAsync(childNode)
1448 }
1449 relevantChildren[subdirName] = childNode
1450 }
1451 // Note that in rare cases, it's possible that we're reducing the set of
1452 // children via this statement, if these are all true:
1453 // 1. we previously had a cache that knew about subdirectories of parentNode
1454 // 2. the user created a prune-file (described in pruneCacheCandidates)
1455 // inside <parentNode>, which specifies that the contents of parentNode
1456 // are to be ignored.
1457 // The fact that it's possible to remove children here means that *pathMap structs
1458 // must not be looked up from f.nodes by filepath (and instead must be accessed by
1459 // direct pointer) until after every listDirSync completes
1460 parentNode.FileNames = entry.FileNames
1461 parentNode.children = relevantChildren
1462
1463}
1464
1465// listMatches takes a node and a function that specifies which subdirectories and
1466// files to include, and listMatches returns the matches
1467func (f *Finder) listMatches(node *pathMap,
1468 filter WalkFunc) (subDirs []*pathMap, filePaths []string) {
1469 entries := DirEntries{
1470 FileNames: node.FileNames,
1471 }
1472 entries.DirNames = make([]string, 0, len(node.children))
1473 for childName := range node.children {
1474 entries.DirNames = append(entries.DirNames, childName)
1475 }
1476
1477 dirNames, fileNames := filter(entries)
1478
1479 subDirs = []*pathMap{}
1480 filePaths = make([]string, 0, len(fileNames))
1481 for _, fileName := range fileNames {
1482 filePaths = append(filePaths, joinCleanPaths(node.path, fileName))
1483 }
1484 subDirs = make([]*pathMap, 0, len(dirNames))
1485 for _, childName := range dirNames {
1486 child, ok := node.children[childName]
1487 if ok {
1488 subDirs = append(subDirs, child)
1489 }
1490 }
1491
1492 return subDirs, filePaths
1493}
1494
1495// findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache.
1496func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc,
1497 approxNumThreads int) []string {
1498
1499 if approxNumThreads < 2 {
1500 // Done spawning threads; process remaining directories
1501 return f.findInCacheSinglethreaded(node, filter)
1502 }
1503
1504 totalWork := 0
1505 for _, child := range node.children {
1506 totalWork += child.approximateNumDescendents
1507 }
1508 childrenResults := make(chan []string, len(node.children))
1509
1510 subDirs, filePaths := f.listMatches(node, filter)
1511
1512 // process child directories
1513 for _, child := range subDirs {
1514 numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork
1515 childProcessor := func(child *pathMap) {
1516 childResults := f.findInCacheMultithreaded(child, filter, numChildThreads)
1517 childrenResults <- childResults
1518 }
1519 // If we're allowed to use more than 1 thread to process this directory,
1520 // then instead we use 1 thread for each subdirectory.
1521 // It would be strange to spawn threads for only some subdirectories.
1522 go childProcessor(child)
1523 }
1524
1525 // collect results
1526 for i := 0; i < len(subDirs); i++ {
1527 childResults := <-childrenResults
1528 filePaths = append(filePaths, childResults...)
1529 }
1530 close(childrenResults)
1531
1532 return filePaths
1533}
1534
1535// findInCacheSinglethreaded synchronously searches the cache for all matching file paths
1536// note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive
1537func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string {
1538 if node == nil {
1539 return []string{}
1540 }
1541
1542 nodes := []*pathMap{node}
1543 matches := []string{}
1544
1545 for len(nodes) > 0 {
1546 currentNode := nodes[0]
1547 nodes = nodes[1:]
1548
1549 subDirs, filePaths := f.listMatches(currentNode, filter)
1550
1551 nodes = append(nodes, subDirs...)
1552
1553 matches = append(matches, filePaths...)
1554 }
1555 return matches
1556}