blob: 6513fa301ec5e9d581ba00b5ad5116142b86ad34 [file] [log] [blame]
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001// Copyright 2017 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package finder
16
17import (
18 "bufio"
19 "bytes"
20 "encoding/json"
Jeff Gastond3119522017-08-22 14:11:15 -070021 "errors"
Jeff Gastonf1fd45e2017-08-09 18:25:28 -070022 "fmt"
23 "io"
24 "os"
25 "path/filepath"
26 "runtime"
27 "sort"
28 "strings"
29 "sync"
30 "sync/atomic"
31 "time"
32
Colin Cross8d6395c2017-12-21 15:46:01 -080033 "android/soong/finder/fs"
Jeff Gastonf1fd45e2017-08-09 18:25:28 -070034)
35
36// This file provides a Finder struct that can quickly search for files satisfying
37// certain criteria.
38// This Finder gets its speed partially from parallelism and partially from caching.
39// If a Stat call returns the same result as last time, then it means Finder
40// can skip the ReadDir call for that dir.
41
42// The primary data structure used by the finder is the field Finder.nodes ,
43// which is a tree of nodes of type *pathMap .
44// Each node represents a directory on disk, along with its stats, subdirectories,
45// and contained files.
46
47// The common use case for the Finder is that the caller creates a Finder and gives
48// it the same query that was given to it in the previous execution.
49// In this situation, the major events that take place are:
50// 1. The Finder begins to load its db
51// 2. The Finder begins to stat the directories mentioned in its db (using multiple threads)
52// Calling Stat on each of these directories is generally a large fraction of the total time
53// 3. The Finder begins to construct a separate tree of nodes in each of its threads
54// 4. The Finder merges the individual node trees into the main node tree
55// 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date
56// These ReadDir calls might prompt additional Stat calls, etc
57// 6. The Finder waits for all loading to complete
58// 7. The Finder searches the cache for files matching the user's query (using multiple threads)
59
60// These are the invariants regarding concurrency:
61// 1. The public methods of Finder are threadsafe.
62// The public methods are only performance-optimized for one caller at a time, however.
63// For the moment, multiple concurrent callers shouldn't expect any better performance than
64// multiple serial callers.
65// 2. While building the node tree, only one thread may ever access the <children> collection of a
66// *pathMap at once.
67// a) The thread that accesses the <children> collection is the thread that discovers the
68// children (by reading them from the cache or by having received a response to ReadDir).
69// 1) Consequently, the thread that discovers the children also spawns requests to stat
70// subdirs.
71// b) Consequently, while building the node tree, no thread may do a lookup of its
72// *pathMap via filepath because another thread may be adding children to the
73// <children> collection of an ancestor node. Additionally, in rare cases, another thread
74// may be removing children from an ancestor node if the children were only discovered to
75// be irrelevant after calling ReadDir (which happens if a prune-file was just added).
76// 3. No query will begin to be serviced until all loading (both reading the db
77// and scanning the filesystem) is complete.
78// Tests indicate that it only takes about 10% as long to search the in-memory cache as to
79// generate it, making this not a huge loss in performance.
80// 4. The parsing of the db and the initial setup of the pathMap tree must complete before
81// beginning to call listDirSync (because listDirSync can create new entries in the pathMap)
82
83// see cmd/finder.go or finder_test.go for usage examples
84
// versionString identifies the on-disk cache file format.
// Update versionString whenever making a backwards-incompatible change to the cache file format.
const versionString = "Android finder version 1"
87
// a CacheParams specifies which files and directories the user wishes be scanned and
// potentially added to the cache.
type CacheParams struct {
	// WorkingDirectory is used as a base for any relative file paths given to the Finder
	WorkingDirectory string

	// RootDirs are the root directories used to initiate the search
	RootDirs []string

	// ExcludeDirs are directory names that if encountered are removed from the search
	ExcludeDirs []string

	// PruneFiles are file names that if encountered prune their entire directory
	// (including siblings)
	PruneFiles []string

	// IncludeFiles are file names to include as matches
	IncludeFiles []string
}
107
// a cacheConfig stores the inputs that determine what should be included in the cache.
// If any of these inputs differ from the previous run, the cache is not reusable.
type cacheConfig struct {
	CacheParams

	// FilesystemView is a unique identifier telling which parts of which file systems
	// are readable by the Finder. In practice its value is essentially username@hostname.
	// FilesystemView is set to ensure that a cache file copied to another host or
	// found by another user doesn't inadvertently get reused.
	FilesystemView string
}
118
119func (p *cacheConfig) Dump() ([]byte, error) {
120 bytes, err := json.Marshal(p)
121 return bytes, err
122}
123
// a cacheMetadata stores version information about the cache.
// It is written as the header of the cache file and validated on load.
type cacheMetadata struct {
	// The Version enables the Finder to determine whether it can even parse the file
	// If the version changes, the entire cache file must be regenerated
	Version string

	// The CacheParams enables the Finder to determine whether the parameters match
	// If the CacheParams change, the Finder can choose how much of the cache file to reuse
	// (although in practice, the Finder will probably choose to ignore the entire file anyway)
	Config cacheConfig
}
135
// Logger is the destination for the Finder's verbose progress and error messages.
// Its single method matches the signature of (*log.Logger).Output.
type Logger interface {
	Output(calldepth int, s string) error
}
139
// the Finder is the main struct that callers will want to use
type Finder struct {
	// configuration
	DbPath              string // location of the on-disk cache db
	numDbLoadingThreads int
	numSearchingThreads int
	cacheMetadata       cacheMetadata
	logger              Logger
	filesystem          fs.FileSystem

	// temporary state (only non-nil/used while loading)
	threadPool        *threadPool
	mutex             sync.Mutex // guards the Finder during queries
	fsErrs            []fsErr    // filesystem errors collected while scanning
	errlock           sync.Mutex // guards fsErrs
	shutdownWaitgroup sync.WaitGroup

	// non-temporary state
	modifiedFlag int32   // atomic flag: nonzero if the cache differs from the db on disk
	nodes        pathMap // root of the in-memory directory tree
}
161
Jeff Gastond3119522017-08-22 14:11:15 -0700162var defaultNumThreads = runtime.NumCPU() * 2
163
// New creates a new Finder for use.
// It loads (or regenerates) the cache db at dbPath; if any filesystem error
// occurs while scanning, it returns a nil Finder and a non-nil error.
func New(cacheParams CacheParams, filesystem fs.FileSystem,
	logger Logger, dbPath string) (f *Finder, err error) {
	return newImpl(cacheParams, filesystem, logger, dbPath, defaultNumThreads)
}
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700169
Jeff Gastond3119522017-08-22 14:11:15 -0700170// newImpl is like New but accepts more params
171func newImpl(cacheParams CacheParams, filesystem fs.FileSystem,
172 logger Logger, dbPath string, numThreads int) (f *Finder, err error) {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700173 numDbLoadingThreads := numThreads
174 numSearchingThreads := numThreads
175
176 metadata := cacheMetadata{
177 Version: versionString,
178 Config: cacheConfig{
179 CacheParams: cacheParams,
180 FilesystemView: filesystem.ViewId(),
181 },
182 }
183
Jeff Gastonb629e182017-08-14 16:49:18 -0700184 f = &Finder{
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700185 numDbLoadingThreads: numDbLoadingThreads,
186 numSearchingThreads: numSearchingThreads,
187 cacheMetadata: metadata,
188 logger: logger,
189 filesystem: filesystem,
190
191 nodes: *newPathMap("/"),
192 DbPath: dbPath,
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700193
194 shutdownWaitgroup: sync.WaitGroup{},
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700195 }
196
Jeff Gastonb629e182017-08-14 16:49:18 -0700197 f.loadFromFilesystem()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700198
Jeff Gastonb629e182017-08-14 16:49:18 -0700199 // check for any filesystem errors
200 err = f.getErr()
201 if err != nil {
202 return nil, err
203 }
204
205 // confirm that every path mentioned in the CacheConfig exists
206 for _, path := range cacheParams.RootDirs {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700207 if !filepath.IsAbs(path) {
208 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
209 }
Jeff Gastonb629e182017-08-14 16:49:18 -0700210 node := f.nodes.GetNode(filepath.Clean(path), false)
211 if node == nil || node.ModTime == 0 {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700212 return nil, fmt.Errorf("path %v was specified to be included in the cache but does not exist\n", path)
Jeff Gastonb629e182017-08-14 16:49:18 -0700213 }
214 }
215
216 return f, nil
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700217}
218
// FindAll returns every file in the cache.
func (f *Finder) FindAll() []string {
	return f.FindAt("/")
}
223
// FindAt returns every cached file under <rootDir>.
func (f *Finder) FindAt(rootDir string) []string {
	// accept every file and descend into every directory
	filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
		return entries.DirNames, entries.FileNames
	}
	return f.FindMatching(rootDir, filter)
}
231
// FindNamed searches the entire cache for every file named <fileName>.
func (f *Finder) FindNamed(fileName string) []string {
	return f.FindNamedAt("/", fileName)
}
236
237// FindNamedAt searches under <rootPath> for every file named <fileName>
238// The reason a caller might use FindNamedAt instead of FindNamed is if they want
239// to limit their search to a subset of the cache
240func (f *Finder) FindNamedAt(rootPath string, fileName string) []string {
241 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
242 matches := []string{}
243 for _, foundName := range entries.FileNames {
244 if foundName == fileName {
245 matches = append(matches, foundName)
246 }
247 }
248 return entries.DirNames, matches
249
250 }
251 return f.FindMatching(rootPath, filter)
252}
253
// FindFirstNamed searches the whole cache for every file named <fileName>.
// Whenever it finds a match, it stops searching that directory's subdirectories.
func (f *Finder) FindFirstNamed(fileName string) []string {
	return f.FindFirstNamedAt("/", fileName)
}
259
// FindFirstNamedAt searches under <rootPath> for every file named <fileName>.
// Whenever it finds a match, it stops searching that directory's subdirectories.
func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string {
	filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
		matches := []string{}
		for _, foundName := range entries.FileNames {
			if foundName == fileName {
				matches = append(matches, foundName)
			}
		}

		// a match in this dir prunes the walk of its subdirectories
		if len(matches) > 0 {
			return []string{}, matches
		}
		return entries.DirNames, matches
	}
	return f.FindMatching(rootPath, filter)
}
278
279// FindMatching is the most general exported function for searching for files in the cache
280// The WalkFunc will be invoked repeatedly and is expected to modify the provided DirEntries
281// in place, removing file paths and directories as desired.
282// WalkFunc will be invoked potentially many times in parallel, and must be threadsafe.
283func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string {
284 // set up some parameters
285 scanStart := time.Now()
286 var isRel bool
287 workingDir := f.cacheMetadata.Config.WorkingDirectory
288
289 isRel = !filepath.IsAbs(rootPath)
290 if isRel {
291 rootPath = filepath.Join(workingDir, rootPath)
292 }
293
294 rootPath = filepath.Clean(rootPath)
295
296 // ensure nothing else is using the Finder
297 f.verbosef("FindMatching waiting for finder to be idle\n")
298 f.lock()
299 defer f.unlock()
300
301 node := f.nodes.GetNode(rootPath, false)
302 if node == nil {
303 f.verbosef("No data for path %v ; apparently not included in cache params: %v\n",
304 rootPath, f.cacheMetadata.Config.CacheParams)
305 // path is not found; don't do a search
306 return []string{}
307 }
308
309 // search for matching files
310 f.verbosef("Finder finding %v using cache\n", rootPath)
311 results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads)
312
313 // format and return results
314 if isRel {
315 for i := 0; i < len(results); i++ {
316 results[i] = strings.Replace(results[i], workingDir+"/", "", 1)
317 }
318 }
319 sort.Strings(results)
320 f.verbosef("Found %v files under %v in %v using cache\n",
321 len(results), rootPath, time.Since(scanStart))
322 return results
323}
324
// Shutdown declares that the finder is no longer needed and waits for its cleanup to complete.
// Currently, that only entails waiting for the database dump to complete.
func (f *Finder) Shutdown() {
	f.WaitForDbDump()
}
330
// WaitForDbDump returns once the database has been written to f.DbPath.
// If no dump was started (because nothing was modified), it returns immediately.
func (f *Finder) WaitForDbDump() {
	f.shutdownWaitgroup.Wait()
}
335
336// End of public api
337
338func (f *Finder) goDumpDb() {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700339 if f.wasModified() {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700340 f.shutdownWaitgroup.Add(1)
341 go func() {
342 err := f.dumpDb()
343 if err != nil {
344 f.verbosef("%v\n", err)
345 }
346 f.shutdownWaitgroup.Done()
347 }()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700348 } else {
349 f.verbosef("Skipping dumping unmodified db\n")
350 }
351}
352
// joinCleanPaths is like filepath.Join but is faster because
// joinCleanPaths doesn't have to support paths ending in "/" or containing "..".
func joinCleanPaths(base string, leaf string) string {
	switch {
	case base == "":
		return leaf
	case base == "/":
		return base + leaf
	case leaf == "":
		return base
	default:
		return base + "/" + leaf
	}
}
367
// verbosef logs a formatted message to the Finder's logger.
// The calldepth of 2 skips this wrapper when the logger computes the call site
// (as with log.Logger.Output).
func (f *Finder) verbosef(format string, args ...interface{}) {
	f.logger.Output(2, fmt.Sprintf(format, args...))
}
371
// loadFromFilesystem populates the in-memory cache based on the contents of the filesystem.
// It first tries to seed the cache from the db file, falling back to a full scan
// if the db is missing or invalid, then kicks off a background db dump.
func (f *Finder) loadFromFilesystem() {
	// the threadPool is only needed while loading, so it is discarded afterwards
	f.threadPool = newThreadPool(f.numDbLoadingThreads)

	err := f.startFromExternalCache()
	if err != nil {
		f.startWithoutExternalCache()
	}

	f.goDumpDb()

	f.threadPool = nil
}
385
// startFind creates a node for <path> in the tree (resolving relative paths
// against the configured working directory) and asynchronously stats it.
func (f *Finder) startFind(path string) {
	if !filepath.IsAbs(path) {
		path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
	}
	node := f.nodes.GetNode(path, true)
	f.statDirAsync(node)
}
393
// lock acquires exclusive use of the Finder's shared state.
func (f *Finder) lock() {
	f.mutex.Lock()
}

// unlock releases the lock acquired by lock().
func (f *Finder) unlock() {
	f.mutex.Unlock()
}
401
// a statResponse is the relevant portion of the response from the filesystem to a Stat call.
// A ModTime of 0 is used throughout this file to indicate a path that doesn't exist.
type statResponse struct {
	ModTime int64
	Inode   uint64
	Device  uint64
}

// a pathAndStats stores a path and its stats
type pathAndStats struct {
	statResponse

	Path string
}

// a dirFullInfo stores all of the relevant information we know about a directory:
// its path, its stats, and its relevant contained file names.
type dirFullInfo struct {
	pathAndStats

	FileNames []string
}
422
// a PersistedDirInfo is the information about a dir that we save to our cache on disk
type PersistedDirInfo struct {
	// These field names are short because they are repeated many times in the output json file
	P string   // path (relative to the enclosing PersistedDirs.Root)
	T int64    // modification time
	I uint64   // inode number
	F []string // relevant filenames contained
}

// a PersistedDirs is the information that we persist for a group of dirs
// sharing a device; grouping avoids repeating the device id per directory.
type PersistedDirs struct {
	// the device on which each directory is stored
	Device uint64
	// the common root path to which all contained dirs are relative
	Root string
	// the directories themselves
	Dirs []PersistedDirInfo
}

// a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time
type CacheEntry []PersistedDirs
444
// a DirEntries lists the files and directories contained directly within a specific directory
type DirEntries struct {
	Path string

	// elements of DirNames are just the dir names; they don't include any '/' character
	DirNames []string
	// elements of FileNames are just the file names; they don't include '/' character
	FileNames []string
}

// a WalkFunc is the type that is passed into various Find functions for determining which
// directories the caller wishes be walked. The WalkFunc is expected to decide which
// directories to walk and which files to consider as matches to the original query.
// It may be invoked from many goroutines at once and must be threadsafe.
type WalkFunc func(DirEntries) (dirs []string, files []string)
459
// a mapNode stores the relevant stats about a directory to be stored in a pathMap
type mapNode struct {
	statResponse
	FileNames []string
}

// a pathMap implements the directory tree structure of nodes
type pathMap struct {
	mapNode

	path string

	children map[string]*pathMap

	// number of descendant nodes, including self; "approximate" because it is
	// only refreshed when UpdateNumDescendents is called
	approximateNumDescendents int
}
477
478func newPathMap(path string) *pathMap {
479 result := &pathMap{path: path, children: make(map[string]*pathMap, 4),
480 approximateNumDescendents: 1}
481 return result
482}
483
484// GetNode returns the node at <path>
485func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap {
486 if len(path) > 0 && path[0] == '/' {
487 path = path[1:]
488 }
489
490 node := m
491 for {
492 if path == "" {
493 return node
494 }
495
496 index := strings.Index(path, "/")
497 var firstComponent string
498 if index >= 0 {
499 firstComponent = path[:index]
500 path = path[index+1:]
501 } else {
502 firstComponent = path
503 path = ""
504 }
505
506 child, found := node.children[firstComponent]
507
508 if !found {
509 if createIfNotFound {
510 child = node.newChild(firstComponent)
511 } else {
512 return nil
513 }
514 }
515
516 node = child
517 }
518}
519
520func (m *pathMap) newChild(name string) (child *pathMap) {
521 path := joinCleanPaths(m.path, name)
522 newChild := newPathMap(path)
523 m.children[name] = newChild
524
525 return m.children[name]
526}
527
528func (m *pathMap) UpdateNumDescendents() int {
529 count := 1
530 for _, child := range m.children {
531 count += child.approximateNumDescendents
532 }
533 m.approximateNumDescendents = count
534 return count
535}
536
// UpdateNumDescendentsRecursive refreshes approximateNumDescendents for this
// node and every node beneath it (children first, so parent totals are accurate).
func (m *pathMap) UpdateNumDescendentsRecursive() {
	for _, child := range m.children {
		child.UpdateNumDescendentsRecursive()
	}
	m.UpdateNumDescendents()
}
543
// MergeIn incorporates another pathMap tree into this one.
// Children present in both trees are merged recursively; children present
// only in <other> are adopted directly. If <other> carries stat data
// (ModTime != 0), its stats and file list overwrite this node's.
func (m *pathMap) MergeIn(other *pathMap) {
	for key, theirs := range other.children {
		ours, found := m.children[key]
		if found {
			ours.MergeIn(theirs)
		} else {
			m.children[key] = theirs
		}
	}
	if other.ModTime != 0 {
		m.mapNode = other.mapNode
	}
	m.UpdateNumDescendents()
}
558
// DumpAll returns a flat list of the full info of every dir in this tree,
// with each path made absolute.
func (m *pathMap) DumpAll() []dirFullInfo {
	results := []dirFullInfo{}
	m.dumpInto("", &results)
	return results
}

// dumpInto appends this node's info (reported at <path>) and, recursively,
// all of its descendants' info to <results>.
func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) {
	*results = append(*results,
		dirFullInfo{
			pathAndStats{statResponse: m.statResponse, Path: path},
			m.FileNames},
	)
	for key, child := range m.children {
		childPath := joinCleanPaths(path, key)
		if len(childPath) == 0 || childPath[0] != '/' {
			// ensure every emitted path is absolute
			childPath = "/" + childPath
		}
		child.dumpInto(childPath, results)
	}
}
579
// a semaphore can be locked by up to <capacity> callers at once
type semaphore struct {
	pool chan bool
}

// newSemaphore returns a semaphore allowing <capacity> concurrent holders.
func newSemaphore(capacity int) *semaphore {
	return &semaphore{pool: make(chan bool, capacity)}
}

// Lock blocks until one of the <capacity> slots becomes available.
func (l *semaphore) Lock() {
	l.pool <- true
}

// Unlock releases one previously acquired slot.
func (l *semaphore) Unlock() {
	<-l.pool
}
596
// A threadPool runs goroutines and supports throttling and waiting.
// Without throttling, Go may exhaust the maximum number of various resources, such as
// threads or file descriptors, and crash the program.
type threadPool struct {
	// receivedRequests tracks every Run call, whether or not it has started yet
	receivedRequests sync.WaitGroup
	// activeRequests limits how many submitted functions may run concurrently
	activeRequests semaphore
}

// newThreadPool returns a threadPool that runs at most
// <maxNumConcurrentThreads> submitted functions at once.
func newThreadPool(maxNumConcurrentThreads int) *threadPool {
	return &threadPool{
		receivedRequests: sync.WaitGroup{},
		activeRequests:   *newSemaphore(maxNumConcurrentThreads),
	}
}
611
// Run requests to run the given function in its own goroutine.
// Run returns immediately; use Wait to block until every submitted function finishes.
func (p *threadPool) Run(function func()) {
	p.receivedRequests.Add(1)
	// If Run() was called from within a goroutine spawned by this threadPool,
	// then we may need to return from Run() before having capacity to actually
	// run <function>.
	//
	// It's possible that the body of <function> contains a statement (such as a syscall)
	// that will cause Go to pin it to a thread, or will contain a statement that uses
	// another resource that is in short supply (such as a file descriptor), so we can't
	// actually run <function> until we have capacity.
	//
	// However, the semaphore used for synchronization is implemented via a channel and
	// shouldn't require a new thread for each access.
	go func() {
		p.activeRequests.Lock()
		function()
		p.activeRequests.Unlock()
		p.receivedRequests.Done()
	}()
}
633
// Wait waits until all goroutines are done, just like sync.WaitGroup's Wait.
func (p *threadPool) Wait() {
	p.receivedRequests.Wait()
}
638
// a fsErr records a filesystem error together with the path at which it occurred
type fsErr struct {
	path string
	err  error
}
643
644func (e fsErr) String() string {
645 return e.path + ": " + e.err.Error()
646}
647
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700648func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) {
649 // group each dirFullInfo by its Device, to avoid having to repeat it in the output
650 dirsByDevice := map[uint64][]PersistedDirInfo{}
651 for _, entry := range dirInfos {
652 _, found := dirsByDevice[entry.Device]
653 if !found {
654 dirsByDevice[entry.Device] = []PersistedDirInfo{}
655 }
656 dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device],
657 PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames})
658 }
659
660 cacheEntry := CacheEntry{}
661
662 for device, infos := range dirsByDevice {
663 // find common prefix
664 prefix := ""
665 if len(infos) > 0 {
666 prefix = infos[0].P
667 }
668 for _, info := range infos {
669 for !strings.HasPrefix(info.P+"/", prefix+"/") {
670 prefix = filepath.Dir(prefix)
Jeff Gaston996716a2017-08-22 13:33:19 -0700671 if prefix == "/" {
672 break
673 }
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700674 }
675 }
676 // remove common prefix
677 for i := range infos {
678 suffix := strings.Replace(infos[i].P, prefix, "", 1)
679 if len(suffix) > 0 && suffix[0] == '/' {
680 suffix = suffix[1:]
681 }
682 infos[i].P = suffix
683 }
684
685 // turn the map (keyed by device) into a list of structs with labeled fields
686 // this is to improve readability of the output
687 cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos})
688 }
689
690 // convert to json.
691 // it would save some space to use a different format than json for the db file,
692 // but the space and time savings are small, and json is easy for humans to read
693 bytes, err := json.Marshal(cacheEntry)
694 return bytes, err
695}
696
697func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) {
698 var cacheEntry CacheEntry
699 err := json.Unmarshal(bytes, &cacheEntry)
700 if err != nil {
701 return nil, err
702 }
703
704 // convert from a CacheEntry to a []dirFullInfo (by copying a few fields)
705 capacity := 0
706 for _, element := range cacheEntry {
707 capacity += len(element.Dirs)
708 }
709 nodes := make([]dirFullInfo, capacity)
710 count := 0
711 for _, element := range cacheEntry {
712 for _, dir := range element.Dirs {
713 path := joinCleanPaths(element.Root, dir.P)
714
715 nodes[count] = dirFullInfo{
716 pathAndStats: pathAndStats{
717 statResponse: statResponse{
718 ModTime: dir.T, Inode: dir.I, Device: element.Device,
719 },
720 Path: path},
721 FileNames: dir.F}
722 count++
723 }
724 }
725 return nodes, nil
726}
727
// We use the following separator byte to distinguish individually parseable blocks of json
// because we know this separator won't appear in the json that we're parsing.
//
// The newline byte can only appear in a UTF-8 stream if the newline character appears, because:
// - The newline character is encoded as "0000 1010" in binary ("0a" in hex)
// - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte
//   character.
//
// We know that the newline character will never appear in our json string, because:
// - If a newline character appears as part of a data string, then json encoding will
//   emit two characters instead: '\' and 'n'.
// - The json encoder that we use doesn't emit the optional newlines between any of its
//   other outputs.
const lineSeparator = byte('\n')

// readLine returns the next separator-terminated block of json from <reader>,
// including the trailing separator byte itself.
func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) {
	return reader.ReadBytes(lineSeparator)
}
746
747// validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder
748func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool {
749 cacheVersionBytes, err := f.readLine(cacheReader)
750 if err != nil {
751 f.verbosef("Failed to read database header; database is invalid\n")
752 return false
753 }
754 if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator {
755 cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1]
756 }
757 cacheVersionString := string(cacheVersionBytes)
758 currentVersion := f.cacheMetadata.Version
759 if cacheVersionString != currentVersion {
760 f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion)
761 return false
762 }
763
764 cacheParamBytes, err := f.readLine(cacheReader)
765 if err != nil {
766 f.verbosef("Failed to read database search params; database is invalid\n")
767 return false
768 }
769
770 if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator {
771 cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1]
772 }
773
774 currentParamBytes, err := f.cacheMetadata.Config.Dump()
775 if err != nil {
776 panic("Finder failed to serialize its parameters")
777 }
778 cacheParamString := string(cacheParamBytes)
779 currentParamString := string(currentParamBytes)
780 if cacheParamString != currentParamString {
781 f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString)
782 return false
783 }
784 return true
785}
786
787// loadBytes compares the cache info in <data> to the state of the filesystem
788// loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked
789func (f *Finder) loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) {
790
791 helperStartTime := time.Now()
792
793 cachedNodes, err := f.parseCacheEntry(data)
794 if err != nil {
795 return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error())
796 }
797
798 unmarshalDate := time.Now()
799 f.verbosef("Unmarshaled %v objects for %v in %v\n",
800 len(cachedNodes), id, unmarshalDate.Sub(helperStartTime))
801
802 tempMap := newPathMap("/")
803 stats := make([]statResponse, len(cachedNodes))
804
805 for i, node := range cachedNodes {
806 // check the file system for an updated timestamp
807 stats[i] = f.statDirSync(node.Path)
808 }
809
810 dirsToWalk = []string{}
811 for i, cachedNode := range cachedNodes {
812 updated := stats[i]
813 // save the cached value
814 container := tempMap.GetNode(cachedNode.Path, true)
815 container.mapNode = mapNode{statResponse: updated}
816
817 // if the metadata changed and the directory still exists, then
818 // make a note to walk it later
819 if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 {
820 f.setModified()
821 // make a note that the directory needs to be walked
822 dirsToWalk = append(dirsToWalk, cachedNode.Path)
823 } else {
824 container.mapNode.FileNames = cachedNode.FileNames
825 }
826 }
827 // count the number of nodes to improve our understanding of the shape of the tree,
828 // thereby improving parallelism of subsequent searches
829 tempMap.UpdateNumDescendentsRecursive()
830
831 f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate))
832 return tempMap, dirsToWalk, nil
833}
834
835// startFromExternalCache loads the cache database from disk
836// startFromExternalCache waits to return until the load of the cache db is complete, but
// startFromExternalCache does not wait for every listDir() or statDir() request to complete
838func (f *Finder) startFromExternalCache() (err error) {
839 startTime := time.Now()
840 dbPath := f.DbPath
841
842 // open cache file and validate its header
843 reader, err := f.filesystem.Open(dbPath)
844 if err != nil {
845 return errors.New("No data to load from database\n")
846 }
847 bufferedReader := bufio.NewReader(reader)
848 if !f.validateCacheHeader(bufferedReader) {
849 return errors.New("Cache header does not match")
850 }
851 f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath)
852
853 // read the file and spawn threads to process it
854 nodesToWalk := [][]*pathMap{}
855 mainTree := newPathMap("/")
856
857 // read the blocks and stream them into <blockChannel>
858 type dataBlock struct {
859 id int
860 err error
861 data []byte
862 }
863 blockChannel := make(chan dataBlock, f.numDbLoadingThreads)
864 readBlocks := func() {
865 index := 0
866 for {
867 // It takes some time to unmarshal the input from json, so we want
868 // to unmarshal it in parallel. In order to find valid places to
869 // break the input, we scan for the line separators that we inserted
870 // (for this purpose) when we dumped the database.
871 data, err := f.readLine(bufferedReader)
872 var response dataBlock
873 done := false
874 if err != nil && err != io.EOF {
875 response = dataBlock{id: index, err: err, data: nil}
876 done = true
877 } else {
878 done = (err == io.EOF)
879 response = dataBlock{id: index, err: nil, data: data}
880 }
881 blockChannel <- response
882 index++
883 duration := time.Since(startTime)
884 f.verbosef("Read block %v after %v\n", index, duration)
885 if done {
886 f.verbosef("Read %v blocks in %v\n", index, duration)
887 close(blockChannel)
888 return
889 }
890 }
891 }
892 go readBlocks()
893
894 // Read from <blockChannel> and stream the responses into <resultChannel>.
895 type workResponse struct {
896 id int
897 err error
898 tree *pathMap
899 updatedDirs []string
900 }
901 resultChannel := make(chan workResponse)
902 processBlocks := func() {
903 numProcessed := 0
904 threadPool := newThreadPool(f.numDbLoadingThreads)
905 for {
906 // get a block to process
907 block, received := <-blockChannel
908 if !received {
909 break
910 }
911
912 if block.err != nil {
913 resultChannel <- workResponse{err: block.err}
914 break
915 }
916 numProcessed++
917 // wait until there is CPU available to process it
918 threadPool.Run(
919 func() {
920 processStartTime := time.Now()
921 f.verbosef("Starting to process block %v after %v\n",
922 block.id, processStartTime.Sub(startTime))
923 tempMap, updatedDirs, err := f.loadBytes(block.id, block.data)
924 var response workResponse
925 if err != nil {
926 f.verbosef(
927 "Block %v failed to parse with error %v\n",
928 block.id, err)
929 response = workResponse{err: err}
930 } else {
931 response = workResponse{
932 id: block.id,
933 err: nil,
934 tree: tempMap,
935 updatedDirs: updatedDirs,
936 }
937 }
938 f.verbosef("Processed block %v in %v\n",
939 block.id, time.Since(processStartTime),
940 )
941 resultChannel <- response
942 },
943 )
944 }
945 threadPool.Wait()
946 f.verbosef("Finished processing %v blocks in %v\n",
947 numProcessed, time.Since(startTime))
948 close(resultChannel)
949 }
950 go processBlocks()
951
952 // Read from <resultChannel> and use the results
953 combineResults := func() (err error) {
954 for {
955 result, received := <-resultChannel
956 if !received {
957 break
958 }
959 if err != nil {
960 // In case of an error, wait for work to complete before
961 // returning the error. This ensures that any subsequent
962 // work doesn't need to compete for resources (and possibly
963 // fail due to, for example, a filesystem limit on the number of
964 // concurrently open files) with past work.
965 continue
966 }
967 if result.err != nil {
968 err = result.err
969 continue
970 }
971 // update main tree
972 mainTree.MergeIn(result.tree)
973 // record any new directories that we will need to Stat()
974 updatedNodes := make([]*pathMap, len(result.updatedDirs))
975 for j, dir := range result.updatedDirs {
976 node := mainTree.GetNode(dir, false)
977 updatedNodes[j] = node
978 }
979 nodesToWalk = append(nodesToWalk, updatedNodes)
980 }
981 return err
982 }
983 err = combineResults()
984 if err != nil {
985 return err
986 }
987
988 f.nodes = *mainTree
989
990 // after having loaded the entire db and therefore created entries for
991 // the directories we know of, now it's safe to start calling ReadDir on
992 // any updated directories
993 for i := range nodesToWalk {
994 f.listDirsAsync(nodesToWalk[i])
995 }
Jeff Gastonb629e182017-08-14 16:49:18 -0700996 f.verbosef("Loaded db and statted known dirs in %v\n", time.Since(startTime))
997 f.threadPool.Wait()
998 f.verbosef("Loaded db and statted all dirs in %v\n", time.Now().Sub(startTime))
999
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001000 return err
1001}
1002
1003// startWithoutExternalCache starts scanning the filesystem according to the cache config
1004// startWithoutExternalCache should be called if startFromExternalCache is not applicable
1005func (f *Finder) startWithoutExternalCache() {
Jeff Gastonb629e182017-08-14 16:49:18 -07001006 startTime := time.Now()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001007 configDirs := f.cacheMetadata.Config.RootDirs
1008
1009 // clean paths
1010 candidates := make([]string, len(configDirs))
1011 for i, dir := range configDirs {
1012 candidates[i] = filepath.Clean(dir)
1013 }
1014 // remove duplicates
1015 dirsToScan := make([]string, 0, len(configDirs))
1016 for _, candidate := range candidates {
1017 include := true
1018 for _, included := range dirsToScan {
1019 if included == "/" || strings.HasPrefix(candidate+"/", included+"/") {
1020 include = false
1021 break
1022 }
1023 }
1024 if include {
1025 dirsToScan = append(dirsToScan, candidate)
1026 }
1027 }
1028
1029 // start searching finally
1030 for _, path := range dirsToScan {
1031 f.verbosef("Starting find of %v\n", path)
1032 f.startFind(path)
1033 }
Jeff Gastonb629e182017-08-14 16:49:18 -07001034
1035 f.threadPool.Wait()
1036
1037 f.verbosef("Scanned filesystem (not using cache) in %v\n", time.Now().Sub(startTime))
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001038}
1039
1040// isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid
1041func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) {
1042 if old.Inode != new.Inode {
1043 return false
1044 }
1045 if old.ModTime != new.ModTime {
1046 return false
1047 }
1048 if old.Device != new.Device {
1049 return false
1050 }
1051 return true
1052}
1053
1054func (f *Finder) wasModified() bool {
1055 return atomic.LoadInt32(&f.modifiedFlag) > 0
1056}
1057
1058func (f *Finder) setModified() {
1059 var newVal int32
1060 newVal = 1
1061 atomic.StoreInt32(&f.modifiedFlag, newVal)
1062}
1063
1064// sortedDirEntries exports directory entries to facilitate dumping them to the external cache
1065func (f *Finder) sortedDirEntries() []dirFullInfo {
1066 startTime := time.Now()
1067 nodes := make([]dirFullInfo, 0)
1068 for _, node := range f.nodes.DumpAll() {
1069 if node.ModTime != 0 {
1070 nodes = append(nodes, node)
1071 }
1072 }
1073 discoveryDate := time.Now()
1074 f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime))
1075 less := func(i int, j int) bool {
1076 return nodes[i].Path < nodes[j].Path
1077 }
1078 sort.Slice(nodes, less)
1079 sortDate := time.Now()
1080 f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate))
1081
1082 return nodes
1083}
1084
// serializeDb converts the cache database into a form to save to disk
// (a header block followed by numDbLoadingThreads data blocks, joined by
// lineSeparator so the loader can split and parse them in parallel).
func (f *Finder) serializeDb() ([]byte, error) {
	// sort dir entries
	var entryList = f.sortedDirEntries()

	// Generate an output file that can be conveniently loaded using the same number of threads
	// as were used in this execution (because presumably that will be the number of threads
	// used in the next execution too)

	// generate header
	header := []byte{}
	header = append(header, []byte(f.cacheMetadata.Version)...)
	header = append(header, lineSeparator)
	configDump, err := f.cacheMetadata.Config.Dump()
	if err != nil {
		return nil, err
	}
	header = append(header, configDump...)

	// serialize individual blocks in parallel
	// Cap the number of blocks at the number of entries so no block is empty.
	// (If entryList is empty, numBlocks is 0 and the loop below never runs.)
	numBlocks := f.numDbLoadingThreads
	if numBlocks > len(entryList) {
		numBlocks = len(entryList)
	}
	blocks := make([][]byte, 1+numBlocks)
	blocks[0] = header
	blockMin := 0
	wg := sync.WaitGroup{}
	// errLock guards the shared err variable written by the worker goroutines;
	// if several blocks fail, the last writer wins, which is acceptable since
	// any one error aborts the dump.
	var errLock sync.Mutex

	for i := 1; i <= numBlocks; i++ {
		// identify next block
		// (integer arithmetic distributes the entries as evenly as possible)
		blockMax := len(entryList) * i / numBlocks
		block := entryList[blockMin:blockMax]

		// process block
		wg.Add(1)
		go func(index int, block []dirFullInfo) {
			byteBlock, subErr := f.serializeCacheEntry(block)
			f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock))
			if subErr != nil {
				f.verbosef("%v\n", subErr.Error())
				errLock.Lock()
				err = subErr
				errLock.Unlock()
			} else {
				// each goroutine writes a distinct index, so blocks itself
				// needs no locking
				blocks[index] = byteBlock
			}
			wg.Done()
		}(i, block)

		blockMin = blockMax
	}

	wg.Wait()

	if err != nil {
		return nil, err
	}

	content := bytes.Join(blocks, []byte{lineSeparator})

	return content, nil
}
1149
1150// dumpDb saves the cache database to disk
1151func (f *Finder) dumpDb() error {
1152 startTime := time.Now()
1153 f.verbosef("Dumping db\n")
1154
1155 tempPath := f.DbPath + ".tmp"
1156
1157 bytes, err := f.serializeDb()
1158 if err != nil {
1159 return err
1160 }
1161 serializeDate := time.Now()
1162 f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime))
1163 // dump file and atomically move
1164 err = f.filesystem.WriteFile(tempPath, bytes, 0777)
1165 if err != nil {
1166 return err
1167 }
1168 err = f.filesystem.Rename(tempPath, f.DbPath)
1169 if err != nil {
1170 return err
1171 }
1172
1173 f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate))
1174 return nil
Jeff Gastonb629e182017-08-14 16:49:18 -07001175
1176}
1177
1178// canIgnoreFsErr checks for certain classes of filesystem errors that are safe to ignore
1179func (f *Finder) canIgnoreFsErr(err error) bool {
1180 pathErr, isPathErr := err.(*os.PathError)
1181 if !isPathErr {
1182 // Don't recognize this error
1183 return false
1184 }
Jeff Gaston821271b2017-08-23 17:53:38 -07001185 if os.IsPermission(pathErr) {
Jeff Gastonb629e182017-08-14 16:49:18 -07001186 // Permission errors are ignored:
1187 // https://issuetracker.google.com/37553659
1188 // https://github.com/google/kati/pull/116
1189 return true
1190 }
1191 if pathErr.Err == os.ErrNotExist {
1192 // If a directory doesn't exist, that generally means the cache is out-of-date
1193 return true
1194 }
1195 // Don't recognize this error
1196 return false
1197}
1198
1199// onFsError should be called whenever a potentially fatal error is returned from a filesystem call
1200func (f *Finder) onFsError(path string, err error) {
1201 if !f.canIgnoreFsErr(err) {
1202 // We could send the errors through a channel instead, although that would cause this call
1203 // to block unless we preallocated a sufficient buffer or spawned a reader thread.
1204 // Although it wouldn't be too complicated to spawn a reader thread, it's still slightly
1205 // more convenient to use a lock. Only in an unusual situation should this code be
1206 // invoked anyway.
1207 f.errlock.Lock()
1208 f.fsErrs = append(f.fsErrs, fsErr{path: path, err: err})
1209 f.errlock.Unlock()
1210 }
1211}
1212
1213// discardErrsForPrunedPaths removes any errors for paths that are no longer included in the cache
1214func (f *Finder) discardErrsForPrunedPaths() {
1215 // This function could be somewhat inefficient due to being single-threaded,
1216 // but the length of f.fsErrs should be approximately 0, so it shouldn't take long anyway.
1217 relevantErrs := make([]fsErr, 0, len(f.fsErrs))
1218 for _, fsErr := range f.fsErrs {
1219 path := fsErr.path
1220 node := f.nodes.GetNode(path, false)
1221 if node != nil {
1222 // The path in question wasn't pruned due to a failure to process a parent directory.
1223 // So, the failure to process this path is important
1224 relevantErrs = append(relevantErrs, fsErr)
1225 }
1226 }
1227 f.fsErrs = relevantErrs
1228}
1229
1230// getErr returns an error based on previous calls to onFsErr, if any
1231func (f *Finder) getErr() error {
1232 f.discardErrsForPrunedPaths()
1233
1234 numErrs := len(f.fsErrs)
1235 if numErrs < 1 {
1236 return nil
1237 }
1238
1239 maxNumErrsToInclude := 10
1240 message := ""
1241 if numErrs > maxNumErrsToInclude {
1242 message = fmt.Sprintf("finder encountered %v errors: %v...", numErrs, f.fsErrs[:maxNumErrsToInclude])
1243 } else {
1244 message = fmt.Sprintf("finder encountered %v errors: %v", numErrs, f.fsErrs)
1245 }
1246
1247 return errors.New(message)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001248}
1249
// statDirAsync queues a task that re-stats <dir> and, if the stats differ from
// the cached ones, clears the node's cached file list and (if the directory
// still exists) schedules a re-listing of its contents.
func (f *Finder) statDirAsync(dir *pathMap) {
	// capture the node and its path before entering the closure
	node := dir
	path := dir.path
	f.threadPool.Run(
		func() {
			updatedStats := f.statDirSync(path)

			if !f.isInfoUpToDate(node.statResponse, updatedStats) {
				// The directory changed since it was cached: replace its
				// stats and discard the cached file names.
				node.mapNode = mapNode{
					statResponse: updatedStats,
					FileNames:    []string{},
				}
				f.setModified()
				// ModTime == 0 means statDirSync failed (directory treated
				// as missing), in which case there is nothing to list.
				if node.statResponse.ModTime != 0 {
					// modification time was updated, so re-scan for
					// child directories
					f.listDirAsync(dir)
				}
			}
		},
	)
}
1272
1273func (f *Finder) statDirSync(path string) statResponse {
1274
1275 fileInfo, err := f.filesystem.Lstat(path)
1276
1277 var stats statResponse
1278 if err != nil {
Jeff Gastonb629e182017-08-14 16:49:18 -07001279 // possibly record this error
1280 f.onFsError(path, err)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001281 // in case of a failure to stat the directory, treat the directory as missing (modTime = 0)
1282 return stats
1283 }
1284 modTime := fileInfo.ModTime()
1285 stats = statResponse{}
1286 inode, err := f.filesystem.InodeNumber(fileInfo)
1287 if err != nil {
1288 panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error()))
1289 }
1290 stats.Inode = inode
1291 device, err := f.filesystem.DeviceNumber(fileInfo)
1292 if err != nil {
1293 panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error()))
1294 }
1295 stats.Device = device
1296 permissionsChangeTime, err := f.filesystem.PermTime(fileInfo)
1297
1298 if err != nil {
1299 panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error()))
1300 }
1301 // We're only interested in knowing whether anything about the directory
1302 // has changed since last check, so we use the latest of the two
1303 // modification times (content modification (mtime) and
1304 // permission modification (ctime))
1305 if permissionsChangeTime.After(modTime) {
1306 modTime = permissionsChangeTime
1307 }
1308 stats.ModTime = modTime.UnixNano()
1309
1310 return stats
1311}
1312
1313// pruneCacheCandidates removes the items that we don't want to include in our persistent cache
1314func (f *Finder) pruneCacheCandidates(items *DirEntries) {
1315
1316 for _, fileName := range items.FileNames {
1317 for _, abortedName := range f.cacheMetadata.Config.PruneFiles {
1318 if fileName == abortedName {
1319 items.FileNames = []string{}
1320 items.DirNames = []string{}
1321 return
1322 }
1323 }
1324 }
1325
1326 // remove any files that aren't the ones we want to include
1327 writeIndex := 0
1328 for _, fileName := range items.FileNames {
1329 // include only these files
1330 for _, includedName := range f.cacheMetadata.Config.IncludeFiles {
1331 if fileName == includedName {
1332 items.FileNames[writeIndex] = fileName
1333 writeIndex++
1334 break
1335 }
1336 }
1337 }
1338 // resize
1339 items.FileNames = items.FileNames[:writeIndex]
1340
1341 writeIndex = 0
1342 for _, dirName := range items.DirNames {
1343 items.DirNames[writeIndex] = dirName
1344 // ignore other dirs that are known to not be inputs to the build process
1345 include := true
1346 for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs {
1347 if dirName == excludedName {
1348 // don't include
1349 include = false
1350 break
1351 }
1352 }
1353 if include {
1354 writeIndex++
1355 }
1356 }
1357 // resize
1358 items.DirNames = items.DirNames[:writeIndex]
1359}
1360
1361func (f *Finder) listDirsAsync(nodes []*pathMap) {
1362 f.threadPool.Run(
1363 func() {
1364 for i := range nodes {
1365 f.listDirSync(nodes[i])
1366 }
1367 },
1368 )
1369}
1370
1371func (f *Finder) listDirAsync(node *pathMap) {
1372 f.threadPool.Run(
1373 func() {
1374 f.listDirSync(node)
1375 },
1376 )
1377}
1378
// listDirSync reads the contents of <dir> from the filesystem, classifies each
// entry as a file or subdirectory (resolving symlinks), prunes entries per the
// cache config, and updates the node's cached FileNames and children.
// New subdirectories are scheduled for an async Stat.
func (f *Finder) listDirSync(dir *pathMap) {
	path := dir.path
	children, err := f.filesystem.ReadDir(path)

	if err != nil {
		// possibly record this error
		f.onFsError(path, err)
		// if listing the contents of the directory fails (presumably due to
		// permission denied), then treat the directory as empty
		children = nil
	}

	var subdirs []string
	var subfiles []string

	for _, child := range children {
		// ReadDir uses Lstat-style info, so the symlink bit is visible here
		linkBits := child.Mode() & os.ModeSymlink
		isLink := linkBits != 0
		if isLink {
			// Stat (not Lstat) follows the link to classify its target
			childPath := filepath.Join(path, child.Name())
			childStat, err := f.filesystem.Stat(childPath)
			if err != nil {
				// If stat fails this is probably a broken or dangling symlink, treat it as a file.
				subfiles = append(subfiles, child.Name())
			} else if childStat.IsDir() {
				// Skip symlink dirs.
				// We don't have to support symlink dirs because
				// that would cause duplicates.
			} else {
				// We do have to support symlink files because the link name might be
				// different than the target name
				// (for example, Android.bp -> build/soong/root.bp)
				subfiles = append(subfiles, child.Name())
			}
		} else if child.IsDir() {
			subdirs = append(subdirs, child.Name())
		} else {
			subfiles = append(subfiles, child.Name())
		}

	}
	parentNode := dir

	// apply the cache config's include/exclude/prune rules before caching
	entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles}
	f.pruneCacheCandidates(entry)

	// create a pathMap node for each relevant subdirectory
	relevantChildren := map[string]*pathMap{}
	for _, subdirName := range entry.DirNames {
		childNode, found := parentNode.children[subdirName]
		// if we already knew of this directory, then we already have a request pending to Stat it
		// if we didn't already know of this directory, then we must Stat it now
		if !found {
			childNode = parentNode.newChild(subdirName)
			f.statDirAsync(childNode)
		}
		relevantChildren[subdirName] = childNode
	}
	// Note that in rare cases, it's possible that we're reducing the set of
	// children via this statement, if these are all true:
	// 1. we previously had a cache that knew about subdirectories of parentNode
	// 2. the user created a prune-file (described in pruneCacheCandidates)
	//    inside <parentNode>, which specifies that the contents of parentNode
	//    are to be ignored.
	// The fact that it's possible to remove children here means that *pathMap structs
	// must not be looked up from f.nodes by filepath (and instead must be accessed by
	// direct pointer) until after every listDirSync completes
	parentNode.FileNames = entry.FileNames
	parentNode.children = relevantChildren

}
1450
1451// listMatches takes a node and a function that specifies which subdirectories and
1452// files to include, and listMatches returns the matches
1453func (f *Finder) listMatches(node *pathMap,
1454 filter WalkFunc) (subDirs []*pathMap, filePaths []string) {
1455 entries := DirEntries{
1456 FileNames: node.FileNames,
1457 }
1458 entries.DirNames = make([]string, 0, len(node.children))
1459 for childName := range node.children {
1460 entries.DirNames = append(entries.DirNames, childName)
1461 }
1462
1463 dirNames, fileNames := filter(entries)
1464
1465 subDirs = []*pathMap{}
1466 filePaths = make([]string, 0, len(fileNames))
1467 for _, fileName := range fileNames {
1468 filePaths = append(filePaths, joinCleanPaths(node.path, fileName))
1469 }
1470 subDirs = make([]*pathMap, 0, len(dirNames))
1471 for _, childName := range dirNames {
1472 child, ok := node.children[childName]
1473 if ok {
1474 subDirs = append(subDirs, child)
1475 }
1476 }
1477
1478 return subDirs, filePaths
1479}
1480
// findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache.
// The thread budget <approxNumThreads> is divided among the subdirectories in
// proportion to their approximate number of descendents; once a subtree's
// budget drops below 2, it is searched synchronously.
func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc,
	approxNumThreads int) []string {

	if approxNumThreads < 2 {
		// Done spawning threads; process remaining directories
		return f.findInCacheSinglethreaded(node, filter)
	}

	// total descendent count across all children, used to apportion threads
	totalWork := 0
	for _, child := range node.children {
		totalWork += child.approximateNumDescendents
	}
	// buffered with one slot per child so no sender ever blocks
	childrenResults := make(chan []string, len(node.children))

	subDirs, filePaths := f.listMatches(node, filter)

	// process child directories
	for _, child := range subDirs {
		numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork
		childProcessor := func(child *pathMap) {
			childResults := f.findInCacheMultithreaded(child, filter, numChildThreads)
			childrenResults <- childResults
		}
		// If we're allowed to use more than 1 thread to process this directory,
		// then instead we use 1 thread for each subdirectory.
		// It would be strange to spawn threads for only some subdirectories.
		go childProcessor(child)
	}

	// collect results
	// (exactly one channel send per spawned goroutine, so this drains them all)
	for i := 0; i < len(subDirs); i++ {
		childResults := <-childrenResults
		filePaths = append(filePaths, childResults...)
	}
	close(childrenResults)

	return filePaths
}
1520
1521// findInCacheSinglethreaded synchronously searches the cache for all matching file paths
1522// note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive
1523func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string {
1524 if node == nil {
1525 return []string{}
1526 }
1527
1528 nodes := []*pathMap{node}
1529 matches := []string{}
1530
1531 for len(nodes) > 0 {
1532 currentNode := nodes[0]
1533 nodes = nodes[1:]
1534
1535 subDirs, filePaths := f.listMatches(currentNode, filter)
1536
1537 nodes = append(nodes, subDirs...)
1538
1539 matches = append(matches, filePaths...)
1540 }
1541 return matches
1542}