1// Copyright 2017 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package finder
16
17import (
18 "bufio"
19 "bytes"
20 "encoding/json"
21 "errors"
22 "fmt"
23 "io"
24 "os"
25 "path/filepath"
26 "runtime"
27 "sort"
28 "strings"
29 "sync"
30 "sync/atomic"
31 "time"
32
33 "android/soong/finder/fs"
34)
35
36// This file provides a Finder struct that can quickly search for files satisfying
37// certain criteria.
38// This Finder gets its speed partially from parallelism and partially from caching.
39// If a Stat call returns the same result as last time, then it means Finder
40// can skip the ReadDir call for that dir.
41
42// The primary data structure used by the finder is the field Finder.nodes ,
43// which is a tree of nodes of type *pathMap .
44// Each node represents a directory on disk, along with its stats, subdirectories,
45// and contained files.
46
47// The common use case for the Finder is that the caller creates a Finder and gives
48// it the same query that was given to it in the previous execution.
49// In this situation, the major events that take place are:
50// 1. The Finder begins to load its db
51// 2. The Finder begins to stat the directories mentioned in its db (using multiple threads)
52// Calling Stat on each of these directories is generally a large fraction of the total time
53// 3. The Finder begins to construct a separate tree of nodes in each of its threads
54// 4. The Finder merges the individual node trees into the main node tree
55// 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date
56// These ReadDir calls might prompt additional Stat calls, etc
57// 6. The Finder waits for all loading to complete
58// 7. The Finder searches the cache for files matching the user's query (using multiple threads)
59
60// These are the invariants regarding concurrency:
61// 1. The public methods of Finder are threadsafe.
62// The public methods are only performance-optimized for one caller at a time, however.
63// For the moment, multiple concurrent callers shouldn't expect any better performance than
64// multiple serial callers.
65// 2. While building the node tree, only one thread may ever access the <children> collection of a
66// *pathMap at once.
67// a) The thread that accesses the <children> collection is the thread that discovers the
68// children (by reading them from the cache or by having received a response to ReadDir).
69// 1) Consequently, the thread that discovers the children also spawns requests to stat
70// subdirs.
71// b) Consequently, while building the node tree, no thread may do a lookup of its
72// *pathMap via filepath because another thread may be adding children to the
73// <children> collection of an ancestor node. Additionally, in rare cases, another thread
74// may be removing children from an ancestor node if the children were only discovered to
75// be irrelevant after calling ReadDir (which happens if a prune-file was just added).
76// 3. No query will begin to be serviced until all loading (both reading the db
77// and scanning the filesystem) is complete.
78// Tests indicate that it only takes about 10% as long to search the in-memory cache as to
79// generate it, making this not a huge loss in performance.
80// 4. The parsing of the db and the initial setup of the pathMap tree must complete before
81// beginning to call listDirSync (because listDirSync can create new entries in the pathMap)
82
83// see cmd/finder.go or finder_test.go for usage examples
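//
// As a rough usage sketch (cmd/finder.go and finder_test.go remain the authoritative
// examples; fs.OsFs and all concrete values below are illustrative assumptions):
//
//     cacheParams := CacheParams{
//         WorkingDirectory: "/repo",
//         RootDirs:         []string{"."},
//         ExcludeDirs:      []string{".git"},
//         IncludeFiles:     []string{"Android.bp"},
//     }
//     logger := log.New(os.Stderr, "", log.LstdFlags)
//     f, err := New(cacheParams, fs.OsFs, logger, "/tmp/finder-db")
//     if err != nil {
//         // handle the error
//     }
//     matches := f.FindNamedAt(".", "Android.bp")
//     f.Shutdown() // wait for the updated cache db to be written
//     _ = matches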
84
85// Update versionString whenever making a backwards-incompatible change to the cache file format
86const versionString = "Android finder version 1"
87
88// a CacheParams specifies which files and directories the user wants scanned and
89// potentially added to the cache
90type CacheParams struct {
91 // WorkingDirectory is used as a base for any relative file paths given to the Finder
92 WorkingDirectory string
93
94 // RootDirs are the root directories used to initiate the search
95 RootDirs []string
96
97 // Whether symlinks to directories are followed. If set, symlinks that point back
98 // to their own parent directory are not supported.
99 FollowSymlinks bool
100
101 // ExcludeDirs are directory names that if encountered are removed from the search
102 ExcludeDirs []string
103
104 // PruneFiles are file names that if encountered prune their entire directory
105 // (including siblings)
106 PruneFiles []string
107
108 // IncludeFiles are file names to include as matches
109 IncludeFiles []string
110
111 // IncludeSuffixes are filename suffixes to include as matches.
112 IncludeSuffixes []string
113}
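// To illustrate the intent of the fields above (hypothetical values, not defaults):
// with IncludeFiles = ["Android.bp"], IncludeSuffixes = [".bzl"], ExcludeDirs = [".git"]
// and PruneFiles = [".find-ignore"], a directory containing
//
//     .git/  src/  Android.bp  build.bzl  notes.txt
//
// is cached as {DirNames: [src], FileNames: [Android.bp, build.bzl]}, while a
// directory that contains a .find-ignore file is dropped entirely, siblings included.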
114
115// a cacheConfig stores the inputs that determine what should be included in the cache
116type cacheConfig struct {
117 CacheParams
118
119 // FilesystemView is a unique identifier telling which parts of which file systems
120 // are readable by the Finder. In practice its value is essentially username@hostname.
121 // FilesystemView is set to ensure that a cache file copied to another host or
122 // found by another user doesn't inadvertently get reused.
123 FilesystemView string
124}
125
126func (p *cacheConfig) Dump() ([]byte, error) {
127 bytes, err := json.Marshal(p)
128 return bytes, err
129}
130
131// a cacheMetadata stores version information about the cache
132type cacheMetadata struct {
133 // The Version enables the Finder to determine whether it can even parse the file
134 // If the version changes, the entire cache file must be regenerated
135 Version string
136
137 // The CacheParams enables the Finder to determine whether the parameters match
138 // If the CacheParams change, the Finder can choose how much of the cache file to reuse
139 // (although in practice, the Finder will probably choose to ignore the entire file anyway)
140 Config cacheConfig
141}
142
143type Logger interface {
144 Output(calldepth int, s string) error
145}
146
147// the Finder is the main struct that callers will want to use
148type Finder struct {
149 // configuration
150 DbPath string
151 numDbLoadingThreads int
152 numSearchingThreads int
153 cacheMetadata cacheMetadata
154 logger Logger
155 filesystem fs.FileSystem
156
157 // temporary state
158 threadPool *threadPool
159 mutex sync.Mutex
160 fsErrs []fsErr
161 errlock sync.Mutex
162 shutdownWaitgroup sync.WaitGroup
163
164 // non-temporary state
165 modifiedFlag int32
166 nodes pathMap
167}
168
169var defaultNumThreads = runtime.NumCPU() * 2
170
171// New creates a new Finder for use
172func New(cacheParams CacheParams, filesystem fs.FileSystem,
173 logger Logger, dbPath string) (f *Finder, err error) {
174 return newImpl(cacheParams, filesystem, logger, dbPath, defaultNumThreads)
175}
176
177// newImpl is like New but accepts more params
178func newImpl(cacheParams CacheParams, filesystem fs.FileSystem,
179 logger Logger, dbPath string, numThreads int) (f *Finder, err error) {
180 numDbLoadingThreads := numThreads
181 numSearchingThreads := numThreads
182
183 metadata := cacheMetadata{
184 Version: versionString,
185 Config: cacheConfig{
186 CacheParams: cacheParams,
187 FilesystemView: filesystem.ViewId(),
188 },
189 }
190
191 f = &Finder{
192 numDbLoadingThreads: numDbLoadingThreads,
193 numSearchingThreads: numSearchingThreads,
194 cacheMetadata: metadata,
195 logger: logger,
196 filesystem: filesystem,
197
198 nodes: *newPathMap("/"),
199 DbPath: dbPath,
200
201 shutdownWaitgroup: sync.WaitGroup{},
202 }
203
204 f.loadFromFilesystem()
205
206 // check for any filesystem errors
207 err = f.getErr()
208 if err != nil {
209 return nil, err
210 }
211
212 // confirm that every path mentioned in the CacheConfig exists
213 for _, path := range cacheParams.RootDirs {
214 if !filepath.IsAbs(path) {
215 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
216 }
217 node := f.nodes.GetNode(filepath.Clean(path), false)
218 if node == nil || node.ModTime == 0 {
219 return nil, fmt.Errorf("path %v was specified to be included in the cache but does not exist\n", path)
220 }
221 }
222
223 return f, nil
224}
225
226// FindAll searches for every cached file
227func (f *Finder) FindAll() []string {
228 return f.FindAt("/")
229}
230
231// FindAt searches for every cached file under <rootDir>
232func (f *Finder) FindAt(rootDir string) []string {
233 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
234 return entries.DirNames, entries.FileNames
235 }
236 return f.FindMatching(rootDir, filter)
237}
238
239// FindNamed searches for every cached file named <fileName>
240func (f *Finder) FindNamed(fileName string) []string {
241 return f.FindNamedAt("/", fileName)
242}
243
244// FindNamedAt searches under <rootPath> for every file named <fileName>
245// The reason a caller might use FindNamedAt instead of FindNamed is if they want
246// to limit their search to a subset of the cache
247func (f *Finder) FindNamedAt(rootPath string, fileName string) []string {
248 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
249 matches := []string{}
250 for _, foundName := range entries.FileNames {
251 if foundName == fileName {
252 matches = append(matches, foundName)
253 }
254 }
255 return entries.DirNames, matches
256
257 }
258 return f.FindMatching(rootPath, filter)
259}
260
261// FindFirstNamed searches for every file named <fileName>
262// Whenever it finds a match, it stops searching subdirectories
263func (f *Finder) FindFirstNamed(fileName string) []string {
264 return f.FindFirstNamedAt("/", fileName)
265}
266
267// FindFirstNamedAt searches for every file named <fileName>
268// Whenever it finds a match, it stops searching subdirectories
269func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string {
270 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
271 matches := []string{}
272 for _, foundName := range entries.FileNames {
273 if foundName == fileName {
274 matches = append(matches, foundName)
275 }
276 }
277
278 if len(matches) > 0 {
279 return []string{}, matches
280 }
281 return entries.DirNames, matches
282 }
283 return f.FindMatching(rootPath, filter)
284}
285
286// FindMatching is the most general exported function for searching for files in the cache
287// The WalkFunc will be invoked repeatedly and is expected to return, from the provided
288// DirEntries, the directories to descend into and the file paths to report as matches.
289// WalkFunc will be invoked potentially many times in parallel, and must be threadsafe.
290func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string {
291 // set up some parameters
292 scanStart := time.Now()
293 var isRel bool
294 workingDir := f.cacheMetadata.Config.WorkingDirectory
295
296 isRel = !filepath.IsAbs(rootPath)
297 if isRel {
298 rootPath = filepath.Join(workingDir, rootPath)
299 }
300
301 rootPath = filepath.Clean(rootPath)
302
303 // ensure nothing else is using the Finder
304 f.verbosef("FindMatching waiting for finder to be idle\n")
305 f.lock()
306 defer f.unlock()
307
308 node := f.nodes.GetNode(rootPath, false)
309 if node == nil {
310 f.verbosef("No data for path %v ; apparently not included in cache params: %v\n",
311 rootPath, f.cacheMetadata.Config.CacheParams)
312 // path is not found; don't do a search
313 return []string{}
314 }
315
316 // search for matching files
317 f.verbosef("Finder finding %v using cache\n", rootPath)
318 results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads)
319
320 // format and return results
321 if isRel {
322 for i := 0; i < len(results); i++ {
323 results[i] = strings.Replace(results[i], workingDir+"/", "", 1)
324 }
325 }
326 sort.Strings(results)
327 f.verbosef("Found %v files under %v in %v using cache\n",
328 len(results), rootPath, time.Since(scanStart))
329 return results
330}
331
332// Shutdown declares that the finder is no longer needed and waits for its cleanup to complete
333// Currently, that only entails waiting for the database dump to complete.
334func (f *Finder) Shutdown() {
335 f.WaitForDbDump()
336}
337
338// WaitForDbDump returns once the database has been written to f.DbPath.
339func (f *Finder) WaitForDbDump() {
340 f.shutdownWaitgroup.Wait()
341}
342
343// End of public api
344
345func (f *Finder) goDumpDb() {
346 if f.wasModified() {
347 f.shutdownWaitgroup.Add(1)
348 go func() {
349 err := f.dumpDb()
350 if err != nil {
351 f.verbosef("%v\n", err)
352 }
353 f.shutdownWaitgroup.Done()
354 }()
355 } else {
356 f.verbosef("Skipping dumping unmodified db\n")
357 }
358}
359
360// joinCleanPaths is like filepath.Join but is faster because
361// joinCleanPaths doesn't have to support paths ending in "/" or containing ".."
362func joinCleanPaths(base string, leaf string) string {
363 if base == "" {
364 return leaf
365 }
366 if base == "/" {
367 return base + leaf
368 }
369 if leaf == "" {
370 return base
371 }
372 return base + "/" + leaf
373}
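// For example, given the restrictions above:
//
//     joinCleanPaths("", "a")   == "a"
//     joinCleanPaths("/", "a")  == "/a"
//     joinCleanPaths("/a", "b") == "/a/b"
//     joinCleanPaths("/a", "")  == "/a"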
374
375func (f *Finder) verbosef(format string, args ...interface{}) {
376 f.logger.Output(2, fmt.Sprintf(format, args...))
377}
378
379// loadFromFilesystem populates the in-memory cache based on the contents of the filesystem
380func (f *Finder) loadFromFilesystem() {
381 f.threadPool = newThreadPool(f.numDbLoadingThreads)
382
383 err := f.startFromExternalCache()
384 if err != nil {
385 f.startWithoutExternalCache()
386 }
387
388 f.goDumpDb()
389
390 f.threadPool = nil
391}
392
393func (f *Finder) startFind(path string) {
394 if !filepath.IsAbs(path) {
395 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
396 }
397 node := f.nodes.GetNode(path, true)
398 f.statDirAsync(node)
399}
400
401func (f *Finder) lock() {
402 f.mutex.Lock()
403}
404
405func (f *Finder) unlock() {
406 f.mutex.Unlock()
407}
408
409// a statResponse is the relevant portion of the response from the filesystem to a Stat call
410type statResponse struct {
411 ModTime int64
412 Inode uint64
413 Device uint64
414}
415
416// a pathAndStats stores a path and its stats
417type pathAndStats struct {
418 statResponse
419
420 Path string
421}
422
423// a dirFullInfo stores all of the relevant information we know about a directory
424type dirFullInfo struct {
425 pathAndStats
426
427 FileNames []string
428}
429
430// a PersistedDirInfo is the information about a dir that we save to our cache on disk
431type PersistedDirInfo struct {
432 // These field names are short because they are repeated many times in the output json file
433 P string // path
434 T int64 // modification time
435 I uint64 // inode number
436 F []string // relevant filenames contained
437}
438
439// a PersistedDirs is the information that we persist for a group of dirs
440type PersistedDirs struct {
441 // the device on which each directory is stored
442 Device uint64
443 // the common root path to which all contained dirs are relative
444 Root string
445 // the directories themselves
446 Dirs []PersistedDirInfo
447}
448
449// a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time
450type CacheEntry []PersistedDirs
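// As a sketch of the on-disk shape (field values invented for illustration), one
// serialized CacheEntry might look like:
//
//     [{"Device":2049,"Root":"/repo/frameworks",
//       "Dirs":[{"P":"base","T":1510000000000000000,"I":42,"F":["Android.bp"]}]}]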
451
452// a DirEntries lists the files and directories contained directly within a specific directory
453type DirEntries struct {
454 Path string
455
456 // elements of DirNames are just the dir names; they don't include any '/' character
457 DirNames []string
458 // elements of FileNames are just the file names; they don't include any '/' character
459 FileNames []string
460}
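// For example, a DirEntries for a directory containing subdirs "runtime" and "test"
// plus a file "Android.bp" (names chosen for illustration) would be:
//
//     DirEntries{Path: "/repo/art", DirNames: []string{"runtime", "test"},
//         FileNames: []string{"Android.bp"}}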
461
462// a WalkFunc is the type that is passed into various Find functions for determining which
463// directories the caller wants walked. The WalkFunc is expected to decide which
464// directories to walk and which files to consider as matches to the original query.
465type WalkFunc func(DirEntries) (dirs []string, files []string)
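//
// A minimal sketch of a WalkFunc (the "out" and ".bp" names are illustrative) that
// skips directories named "out" and matches files ending in ".bp":
//
//     filter := func(entries DirEntries) (dirs []string, files []string) {
//         for _, d := range entries.DirNames {
//             if d != "out" {
//                 dirs = append(dirs, d)
//             }
//         }
//         for _, name := range entries.FileNames {
//             if strings.HasSuffix(name, ".bp") {
//                 files = append(files, name)
//             }
//         }
//         return dirs, files
//     }
//     // paths := f.FindMatching("/repo", filter), where f is a *Finder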
466
467// a mapNode stores the relevant stats about a directory to be stored in a pathMap
468type mapNode struct {
469 statResponse
470 FileNames []string
471}
472
473// a pathMap implements the directory tree structure of nodes
474type pathMap struct {
475 mapNode
476
477 path string
478
479 children map[string]*pathMap
480
481 // number of descendent nodes, including self
482 approximateNumDescendents int
483}
484
485func newPathMap(path string) *pathMap {
486 result := &pathMap{path: path, children: make(map[string]*pathMap, 4),
487 approximateNumDescendents: 1}
488 return result
489}
490
491// GetNode returns the node at <path>
492func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap {
493 if len(path) > 0 && path[0] == '/' {
494 path = path[1:]
495 }
496
497 node := m
498 for {
499 if path == "" {
500 return node
501 }
502
503 index := strings.Index(path, "/")
504 var firstComponent string
505 if index >= 0 {
506 firstComponent = path[:index]
507 path = path[index+1:]
508 } else {
509 firstComponent = path
510 path = ""
511 }
512
513 child, found := node.children[firstComponent]
514
515 if !found {
516 if createIfNotFound {
517 child = node.newChild(firstComponent)
518 } else {
519 return nil
520 }
521 }
522
523 node = child
524 }
525}
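// For example (assuming m is the root node for "/"):
//
//     node := m.GetNode("/a/b", true)  // creates the intermediate node "a" and then "b" if absent
//     miss := m.GetNode("/a/c", false) // returns nil if "c" has never been added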
526
527func (m *pathMap) newChild(name string) (child *pathMap) {
528 path := joinCleanPaths(m.path, name)
529 newChild := newPathMap(path)
530 m.children[name] = newChild
531
532 return m.children[name]
533}
534
535func (m *pathMap) UpdateNumDescendents() int {
536 count := 1
537 for _, child := range m.children {
538 count += child.approximateNumDescendents
539 }
540 m.approximateNumDescendents = count
541 return count
542}
543
544func (m *pathMap) UpdateNumDescendentsRecursive() {
545 for _, child := range m.children {
546 child.UpdateNumDescendentsRecursive()
547 }
548 m.UpdateNumDescendents()
549}
550
551func (m *pathMap) MergeIn(other *pathMap) {
552 for key, theirs := range other.children {
553 ours, found := m.children[key]
554 if found {
555 ours.MergeIn(theirs)
556 } else {
557 m.children[key] = theirs
558 }
559 }
560 if other.ModTime != 0 {
561 m.mapNode = other.mapNode
562 }
563 m.UpdateNumDescendents()
564}
565
566func (m *pathMap) DumpAll() []dirFullInfo {
567 results := []dirFullInfo{}
568 m.dumpInto("", &results)
569 return results
570}
571
572func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) {
573 *results = append(*results,
574 dirFullInfo{
575 pathAndStats{statResponse: m.statResponse, Path: path},
576 m.FileNames},
577 )
578 for key, child := range m.children {
579 childPath := joinCleanPaths(path, key)
580 if len(childPath) == 0 || childPath[0] != '/' {
581 childPath = "/" + childPath
582 }
583 child.dumpInto(childPath, results)
584 }
585}
586
587// a semaphore can be locked by up to <capacity> callers at once
588type semaphore struct {
589 pool chan bool
590}
591
592func newSemaphore(capacity int) *semaphore {
593 return &semaphore{pool: make(chan bool, capacity)}
594}
595
596func (l *semaphore) Lock() {
597 l.pool <- true
598}
599
600func (l *semaphore) Unlock() {
601 <-l.pool
602}
603
604// A threadPool runs goroutines and supports throttling and waiting.
605// Without throttling, Go may exhaust the maximum number of various resources, such as
606// threads or file descriptors, and crash the program.
607type threadPool struct {
608 receivedRequests sync.WaitGroup
609 activeRequests semaphore
610}
611
612func newThreadPool(maxNumConcurrentThreads int) *threadPool {
613 return &threadPool{
614 receivedRequests: sync.WaitGroup{},
615 activeRequests: *newSemaphore(maxNumConcurrentThreads),
616 }
617}
618
619// Run requests to run the given function in its own goroutine
620func (p *threadPool) Run(function func()) {
621 p.receivedRequests.Add(1)
622 // If Run() was called from within a goroutine spawned by this threadPool,
623 // then we may need to return from Run() before having capacity to actually
624 // run <function>.
625 //
626 // It's possible that the body of <function> contains a statement (such as a syscall)
627 // that will cause Go to pin it to a thread, or will contain a statement that uses
628 // another resource that is in short supply (such as a file descriptor), so we can't
629 // actually run <function> until we have capacity.
630 //
631 // However, the semaphore used for synchronization is implemented via a channel and
632 // shouldn't require a new thread for each access.
633 go func() {
634 p.activeRequests.Lock()
635 function()
636 p.activeRequests.Unlock()
637 p.receivedRequests.Done()
638 }()
639}
640
641// Wait waits until all goroutines are done, just like sync.WaitGroup's Wait
642func (p *threadPool) Wait() {
643 p.receivedRequests.Wait()
644}
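// A minimal usage sketch (dirs and process are assumed to exist elsewhere):
//
//     pool := newThreadPool(8)
//     for _, dir := range dirs {
//         dir := dir // capture the loop variable for the closure
//         pool.Run(func() { process(dir) })
//     }
//     pool.Wait()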
645
646type fsErr struct {
647 path string
648 err error
649}
650
651func (e fsErr) String() string {
652 return e.path + ": " + e.err.Error()
653}
654
655func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) {
656 // group each dirFullInfo by its Device, to avoid having to repeat it in the output
657 dirsByDevice := map[uint64][]PersistedDirInfo{}
658 for _, entry := range dirInfos {
659 _, found := dirsByDevice[entry.Device]
660 if !found {
661 dirsByDevice[entry.Device] = []PersistedDirInfo{}
662 }
663 dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device],
664 PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames})
665 }
666
667 cacheEntry := CacheEntry{}
668
669 for device, infos := range dirsByDevice {
670 // find common prefix
671 prefix := ""
672 if len(infos) > 0 {
673 prefix = infos[0].P
674 }
675 for _, info := range infos {
676 for !strings.HasPrefix(info.P+"/", prefix+"/") {
677 prefix = filepath.Dir(prefix)
678 if prefix == "/" {
679 break
680 }
681 }
682 }
683 // remove common prefix
684 for i := range infos {
685 suffix := strings.Replace(infos[i].P, prefix, "", 1)
686 if len(suffix) > 0 && suffix[0] == '/' {
687 suffix = suffix[1:]
688 }
689 infos[i].P = suffix
690 }
691
692 // turn the map (keyed by device) into a list of structs with labeled fields
693 // this is to improve readability of the output
694 cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos})
695 }
696
697 // convert to json.
698 // it would save some space to use a different format than json for the db file,
699 // but the space and time savings are small, and json is easy for humans to read
700 bytes, err := json.Marshal(cacheEntry)
701 return bytes, err
702}
703
704func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) {
705 var cacheEntry CacheEntry
706 err := json.Unmarshal(bytes, &cacheEntry)
707 if err != nil {
708 return nil, err
709 }
710
711 // convert from a CacheEntry to a []dirFullInfo (by copying a few fields)
712 capacity := 0
713 for _, element := range cacheEntry {
714 capacity += len(element.Dirs)
715 }
716 nodes := make([]dirFullInfo, capacity)
717 count := 0
718 for _, element := range cacheEntry {
719 for _, dir := range element.Dirs {
720 path := joinCleanPaths(element.Root, dir.P)
721
722 nodes[count] = dirFullInfo{
723 pathAndStats: pathAndStats{
724 statResponse: statResponse{
725 ModTime: dir.T, Inode: dir.I, Device: element.Device,
726 },
727 Path: path},
728 FileNames: dir.F}
729 count++
730 }
731 }
732 return nodes, nil
733}
734
735// We use the following separator byte to distinguish individually parseable blocks of json
736// because we know this separator won't appear in the json that we're parsing.
737//
738// The newline byte can only appear in a UTF-8 stream if the newline character appears, because:
739// - The newline character is encoded as "0000 1010" in binary ("0a" in hex)
740// - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte
741// character.
742//
743// We know that the newline character will never appear in our json string, because:
744// - If a newline character appears as part of a data string, then json encoding will
745// emit two characters instead: '\' and 'n'.
746// - The json encoder that we use doesn't emit the optional newlines between any of its
747// other outputs.
748const lineSeparator = byte('\n')
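// As a sketch of the resulting file layout (json contents elided), the db written
// by serializeDb below looks like:
//
//     Android finder version 1
//     {"WorkingDirectory":"...","RootDirs":["..."], ...}
//     [{"Device":...,"Root":"...","Dirs":[...]}]
//     [{"Device":...,"Root":"...","Dirs":[...]}]
//
// with one json block per line, separated by lineSeparator.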
749
750func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) {
751 return reader.ReadBytes(lineSeparator)
752}
753
754// validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder
755func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool {
756 cacheVersionBytes, err := f.readLine(cacheReader)
757 if err != nil {
758 f.verbosef("Failed to read database header; database is invalid\n")
759 return false
760 }
761 if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator {
762 cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1]
763 }
764 cacheVersionString := string(cacheVersionBytes)
765 currentVersion := f.cacheMetadata.Version
766 if cacheVersionString != currentVersion {
767 f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion)
768 return false
769 }
770
771 cacheParamBytes, err := f.readLine(cacheReader)
772 if err != nil {
773 f.verbosef("Failed to read database search params; database is invalid\n")
774 return false
775 }
776
777 if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator {
778 cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1]
779 }
780
781 currentParamBytes, err := f.cacheMetadata.Config.Dump()
782 if err != nil {
783 panic("Finder failed to serialize its parameters")
784 }
785 cacheParamString := string(cacheParamBytes)
786 currentParamString := string(currentParamBytes)
787 if cacheParamString != currentParamString {
788 f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString)
789 return false
790 }
791 return true
792}
793
794// loadBytes compares the cache info in <data> to the state of the filesystem
795// loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked
796func (f *Finder) loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) {
797
798 helperStartTime := time.Now()
799
800 cachedNodes, err := f.parseCacheEntry(data)
801 if err != nil {
802 return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error())
803 }
804
805 unmarshalDate := time.Now()
806 f.verbosef("Unmarshaled %v objects for %v in %v\n",
807 len(cachedNodes), id, unmarshalDate.Sub(helperStartTime))
808
809 tempMap := newPathMap("/")
810 stats := make([]statResponse, len(cachedNodes))
811
812 for i, node := range cachedNodes {
813 // check the file system for an updated timestamp
814 stats[i] = f.statDirSync(node.Path)
815 }
816
817 dirsToWalk = []string{}
818 for i, cachedNode := range cachedNodes {
819 updated := stats[i]
820 // save the cached value
821 container := tempMap.GetNode(cachedNode.Path, true)
822 container.mapNode = mapNode{statResponse: updated}
823
824 // if the metadata changed and the directory still exists, then
825 // make a note to walk it later
826 if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 {
827 f.setModified()
828 // make a note that the directory needs to be walked
829 dirsToWalk = append(dirsToWalk, cachedNode.Path)
830 } else {
831 container.mapNode.FileNames = cachedNode.FileNames
832 }
833 }
834 // count the number of nodes to improve our understanding of the shape of the tree,
835 // thereby improving parallelism of subsequent searches
836 tempMap.UpdateNumDescendentsRecursive()
837
838 f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate))
839 return tempMap, dirsToWalk, nil
840}
841
842// startFromExternalCache loads the cache database from disk
843// startFromExternalCache waits to return until the load of the cache db is complete, but
844// startFromExternalCache does not wait for every listDir() or statDir() request to complete
845func (f *Finder) startFromExternalCache() (err error) {
846 startTime := time.Now()
847 dbPath := f.DbPath
848
849 // open cache file and validate its header
850 reader, err := f.filesystem.Open(dbPath)
851 if err != nil {
852 return errors.New("No data to load from database\n")
853 }
854 defer reader.Close()
855 bufferedReader := bufio.NewReader(reader)
856 if !f.validateCacheHeader(bufferedReader) {
857 return errors.New("Cache header does not match")
858 }
859 f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath)
860
861 // read the file and spawn threads to process it
862 nodesToWalk := [][]*pathMap{}
863 mainTree := newPathMap("/")
864
865 // read the blocks and stream them into <blockChannel>
866 type dataBlock struct {
867 id int
868 err error
869 data []byte
870 }
871 blockChannel := make(chan dataBlock, f.numDbLoadingThreads)
872 readBlocks := func() {
873 index := 0
874 for {
875 // It takes some time to unmarshal the input from json, so we want
876 // to unmarshal it in parallel. In order to find valid places to
877 // break the input, we scan for the line separators that we inserted
878 // (for this purpose) when we dumped the database.
879 data, err := f.readLine(bufferedReader)
880 var response dataBlock
881 done := false
882 if err != nil && err != io.EOF {
883 response = dataBlock{id: index, err: err, data: nil}
884 done = true
885 } else {
886 done = (err == io.EOF)
887 response = dataBlock{id: index, err: nil, data: data}
888 }
889 blockChannel <- response
890 index++
891 duration := time.Since(startTime)
892 f.verbosef("Read block %v after %v\n", index, duration)
893 if done {
894 f.verbosef("Read %v blocks in %v\n", index, duration)
895 close(blockChannel)
896 return
897 }
898 }
899 }
900 go readBlocks()
901
902 // Read from <blockChannel> and stream the responses into <resultChannel>.
903 type workResponse struct {
904 id int
905 err error
906 tree *pathMap
907 updatedDirs []string
908 }
909 resultChannel := make(chan workResponse)
910 processBlocks := func() {
911 numProcessed := 0
912 threadPool := newThreadPool(f.numDbLoadingThreads)
913 for {
914 // get a block to process
915 block, received := <-blockChannel
916 if !received {
917 break
918 }
919
920 if block.err != nil {
921 resultChannel <- workResponse{err: block.err}
922 break
923 }
924 numProcessed++
925 // wait until there is CPU available to process it
926 threadPool.Run(
927 func() {
928 processStartTime := time.Now()
929 f.verbosef("Starting to process block %v after %v\n",
930 block.id, processStartTime.Sub(startTime))
931 tempMap, updatedDirs, err := f.loadBytes(block.id, block.data)
932 var response workResponse
933 if err != nil {
934 f.verbosef(
935 "Block %v failed to parse with error %v\n",
936 block.id, err)
937 response = workResponse{err: err}
938 } else {
939 response = workResponse{
940 id: block.id,
941 err: nil,
942 tree: tempMap,
943 updatedDirs: updatedDirs,
944 }
945 }
946 f.verbosef("Processed block %v in %v\n",
947 block.id, time.Since(processStartTime),
948 )
949 resultChannel <- response
950 },
951 )
952 }
953 threadPool.Wait()
954 f.verbosef("Finished processing %v blocks in %v\n",
955 numProcessed, time.Since(startTime))
956 close(resultChannel)
957 }
958 go processBlocks()
959
960 // Read from <resultChannel> and use the results
961 combineResults := func() (err error) {
962 for {
963 result, received := <-resultChannel
964 if !received {
965 break
966 }
967 if err != nil {
968 // In case of an error, wait for work to complete before
969 // returning the error. This ensures that any subsequent
970 // work doesn't need to compete for resources (and possibly
971 // fail due to, for example, a filesystem limit on the number of
972 // concurrently open files) with past work.
973 continue
974 }
975 if result.err != nil {
976 err = result.err
977 continue
978 }
979 // update main tree
980 mainTree.MergeIn(result.tree)
981 // record any new directories that we will need to Stat()
982 updatedNodes := make([]*pathMap, len(result.updatedDirs))
983 for j, dir := range result.updatedDirs {
984 node := mainTree.GetNode(dir, false)
985 updatedNodes[j] = node
986 }
987 nodesToWalk = append(nodesToWalk, updatedNodes)
988 }
989 return err
990 }
991 err = combineResults()
992 if err != nil {
993 return err
994 }
995
996 f.nodes = *mainTree
997
998 // after having loaded the entire db and therefore created entries for
999 // the directories we know of, now it's safe to start calling ReadDir on
1000 // any updated directories
1001 for i := range nodesToWalk {
1002 f.listDirsAsync(nodesToWalk[i])
1003 }
1004 f.verbosef("Loaded db and statted known dirs in %v\n", time.Since(startTime))
1005 f.threadPool.Wait()
1006 f.verbosef("Loaded db and statted all dirs in %v\n", time.Now().Sub(startTime))
1007
1008 return err
1009}
1010
1011// startWithoutExternalCache starts scanning the filesystem according to the cache config
1012// startWithoutExternalCache should be called if startFromExternalCache is not applicable
1013func (f *Finder) startWithoutExternalCache() {
1014 startTime := time.Now()
1015 configDirs := f.cacheMetadata.Config.RootDirs
1016
1017 // clean paths
1018 candidates := make([]string, len(configDirs))
1019 for i, dir := range configDirs {
1020 candidates[i] = filepath.Clean(dir)
1021 }
1022 // remove duplicates
1023 dirsToScan := make([]string, 0, len(configDirs))
1024 for _, candidate := range candidates {
1025 include := true
1026 for _, included := range dirsToScan {
1027 if included == "/" || strings.HasPrefix(candidate+"/", included+"/") {
1028 include = false
1029 break
1030 }
1031 }
1032 if include {
1033 dirsToScan = append(dirsToScan, candidate)
1034 }
1035 }
1036
1037 // start searching finally
1038 for _, path := range dirsToScan {
1039 f.verbosef("Starting find of %v\n", path)
1040 f.startFind(path)
1041 }
1042
1043 f.threadPool.Wait()
1044
1045 f.verbosef("Scanned filesystem (not using cache) in %v\n", time.Now().Sub(startTime))
1046}
1047
1048// isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid
1049func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) {
1050 if old.Inode != new.Inode {
1051 return false
1052 }
1053 if old.ModTime != new.ModTime {
1054 return false
1055 }
1056 if old.Device != new.Device {
1057 return false
1058 }
1059 return true
1060}
1061
1062func (f *Finder) wasModified() bool {
1063 return atomic.LoadInt32(&f.modifiedFlag) > 0
1064}
1065
1066func (f *Finder) setModified() {
1067 var newVal int32
1068 newVal = 1
1069 atomic.StoreInt32(&f.modifiedFlag, newVal)
1070}
1071
1072// sortedDirEntries exports directory entries to facilitate dumping them to the external cache
1073func (f *Finder) sortedDirEntries() []dirFullInfo {
1074 startTime := time.Now()
1075 nodes := make([]dirFullInfo, 0)
1076 for _, node := range f.nodes.DumpAll() {
1077 if node.ModTime != 0 {
1078 nodes = append(nodes, node)
1079 }
1080 }
1081 discoveryDate := time.Now()
1082 f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime))
1083 less := func(i int, j int) bool {
1084 return nodes[i].Path < nodes[j].Path
1085 }
1086 sort.Slice(nodes, less)
1087 sortDate := time.Now()
1088 f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate))
1089
1090 return nodes
1091}
1092
1093// serializeDb converts the cache database into a form to save to disk
1094func (f *Finder) serializeDb() ([]byte, error) {
1095 // sort dir entries
1096 var entryList = f.sortedDirEntries()
1097
1098 // Generate an output file that can be conveniently loaded using the same number of threads
1099 // as were used in this execution (because presumably that will be the number of threads
1100 // used in the next execution too)
1101
1102 // generate header
1103 header := []byte{}
1104 header = append(header, []byte(f.cacheMetadata.Version)...)
1105 header = append(header, lineSeparator)
1106 configDump, err := f.cacheMetadata.Config.Dump()
1107 if err != nil {
1108 return nil, err
1109 }
1110 header = append(header, configDump...)
1111
1112 // serialize individual blocks in parallel
1113 numBlocks := f.numDbLoadingThreads
1114 if numBlocks > len(entryList) {
1115 numBlocks = len(entryList)
1116 }
1117 blocks := make([][]byte, 1+numBlocks)
1118 blocks[0] = header
1119 blockMin := 0
1120 wg := sync.WaitGroup{}
1121 var errLock sync.Mutex
1122
1123 for i := 1; i <= numBlocks; i++ {
1124 // identify next block
1125 blockMax := len(entryList) * i / numBlocks
1126 block := entryList[blockMin:blockMax]
1127
1128 // process block
1129 wg.Add(1)
1130 go func(index int, block []dirFullInfo) {
1131 byteBlock, subErr := f.serializeCacheEntry(block)
1132 f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock))
1133 if subErr != nil {
1134 f.verbosef("%v\n", subErr.Error())
1135 errLock.Lock()
1136 err = subErr
1137 errLock.Unlock()
1138 } else {
1139 blocks[index] = byteBlock
1140 }
1141 wg.Done()
1142 }(i, block)
1143
1144 blockMin = blockMax
1145 }
1146
1147 wg.Wait()
1148
1149 if err != nil {
1150 return nil, err
1151 }
1152
1153 content := bytes.Join(blocks, []byte{lineSeparator})
1154
1155 return content, nil
1156}
1157
1158// dumpDb saves the cache database to disk
1159func (f *Finder) dumpDb() error {
1160 startTime := time.Now()
1161 f.verbosef("Dumping db\n")
1162
1163 tempPath := f.DbPath + ".tmp"
1164
1165 bytes, err := f.serializeDb()
1166 if err != nil {
1167 return err
1168 }
1169 serializeDate := time.Now()
1170 f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime))
1171 // dump file and atomically move
1172 err = f.filesystem.WriteFile(tempPath, bytes, 0777)
1173 if err != nil {
1174 return err
1175 }
1176 err = f.filesystem.Rename(tempPath, f.DbPath)
1177 if err != nil {
1178 return err
1179 }
1180
1181 f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate))
1182 return nil
1183
1184}
1185
1186// canIgnoreFsErr checks for certain classes of filesystem errors that are safe to ignore
1187func (f *Finder) canIgnoreFsErr(err error) bool {
1188 pathErr, isPathErr := err.(*os.PathError)
1189 if !isPathErr {
1190 // Don't recognize this error
1191 return false
1192 }
1193 if os.IsPermission(pathErr) {
1194 // Permission errors are ignored:
1195 // https://issuetracker.google.com/37553659
1196 // https://github.com/google/kati/pull/116
1197 return true
1198 }
1199 if pathErr.Err == os.ErrNotExist {
1200 // If a directory doesn't exist, that generally means the cache is out-of-date
1201 return true
1202 }
1203 // Don't recognize this error
1204 return false
1205}
1206
1207// onFsError should be called whenever a potentially fatal error is returned from a filesystem call
1208func (f *Finder) onFsError(path string, err error) {
1209 if !f.canIgnoreFsErr(err) {
1210 // We could send the errors through a channel instead, although that would cause this call
1211 // to block unless we preallocated a sufficient buffer or spawned a reader thread.
1212 // Although it wouldn't be too complicated to spawn a reader thread, it's still slightly
1213 // more convenient to use a lock. Only in an unusual situation should this code be
1214 // invoked anyway.
1215 f.errlock.Lock()
1216 f.fsErrs = append(f.fsErrs, fsErr{path: path, err: err})
1217 f.errlock.Unlock()
1218 }
1219}
1220
1221// discardErrsForPrunedPaths removes any errors for paths that are no longer included in the cache
1222func (f *Finder) discardErrsForPrunedPaths() {
1223 // This function could be somewhat inefficient due to being single-threaded,
1224 // but the length of f.fsErrs should be approximately 0, so it shouldn't take long anyway.
1225 relevantErrs := make([]fsErr, 0, len(f.fsErrs))
1226 for _, fsErr := range f.fsErrs {
1227 path := fsErr.path
1228 node := f.nodes.GetNode(path, false)
1229 if node != nil {
1230 // The path in question wasn't pruned due to a failure to process a parent directory.
1231 // So, the failure to process this path is important
1232 relevantErrs = append(relevantErrs, fsErr)
1233 }
1234 }
1235 f.fsErrs = relevantErrs
1236}
1237
1238// getErr returns an error based on previous calls to onFsErr, if any
1239func (f *Finder) getErr() error {
1240 f.discardErrsForPrunedPaths()
1241
1242 numErrs := len(f.fsErrs)
1243 if numErrs < 1 {
1244 return nil
1245 }
1246
1247 maxNumErrsToInclude := 10
1248 message := ""
1249 if numErrs > maxNumErrsToInclude {
1250 message = fmt.Sprintf("finder encountered %v errors: %v...", numErrs, f.fsErrs[:maxNumErrsToInclude])
1251 } else {
1252 message = fmt.Sprintf("finder encountered %v errors: %v", numErrs, f.fsErrs)
1253 }
1254
1255 return errors.New(message)
1256}
1257
1258func (f *Finder) statDirAsync(dir *pathMap) {
1259 node := dir
1260 path := dir.path
1261 f.threadPool.Run(
1262 func() {
1263 updatedStats := f.statDirSync(path)
1264
1265 if !f.isInfoUpToDate(node.statResponse, updatedStats) {
1266 node.mapNode = mapNode{
1267 statResponse: updatedStats,
1268 FileNames: []string{},
1269 }
1270 f.setModified()
1271 if node.statResponse.ModTime != 0 {
1272 // modification time was updated, so re-scan for
1273 // child directories
1274 f.listDirAsync(dir)
1275 }
1276 }
1277 },
1278 )
1279}
1280
1281func (f *Finder) statDirSync(path string) statResponse {
1282
1283 fileInfo, err := f.filesystem.Lstat(path)
1284
1285 var stats statResponse
1286 if err != nil {
1287 // possibly record this error
1288 f.onFsError(path, err)
1289 // in case of a failure to stat the directory, treat the directory as missing (modTime = 0)
1290 return stats
1291 }
1292 modTime := fileInfo.ModTime()
1293 stats = statResponse{}
1294 inode, err := f.filesystem.InodeNumber(fileInfo)
1295 if err != nil {
1296 panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error()))
1297 }
1298 stats.Inode = inode
1299 device, err := f.filesystem.DeviceNumber(fileInfo)
1300 if err != nil {
1301 panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error()))
1302 }
1303 stats.Device = device
1304 permissionsChangeTime, err := f.filesystem.PermTime(fileInfo)
1305
1306 if err != nil {
1307 panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error()))
1308 }
1309 // We're only interested in knowing whether anything about the directory
1310 // has changed since last check, so we use the latest of the two
1311 // modification times (content modification (mtime) and
1312 // permission modification (ctime))
1313 if permissionsChangeTime.After(modTime) {
1314 modTime = permissionsChangeTime
1315 }
1316 stats.ModTime = modTime.UnixNano()
1317
1318 return stats
1319}
1320
1321func (f *Finder) shouldIncludeFile(fileName string) bool {
1322 for _, includedName := range f.cacheMetadata.Config.IncludeFiles {
1323 if fileName == includedName {
1324 return true
1325 }
1326 }
1327 for _, includeSuffix := range f.cacheMetadata.Config.IncludeSuffixes {
1328 if strings.HasSuffix(fileName, includeSuffix) {
1329 return true
1330 }
1331 }
1332 return false
1333}
1334
1335// pruneCacheCandidates removes the items that we don't want to include in our persistent cache
1336func (f *Finder) pruneCacheCandidates(items *DirEntries) {
1337
1338 for _, fileName := range items.FileNames {
1339 for _, abortedName := range f.cacheMetadata.Config.PruneFiles {
1340 if fileName == abortedName {
1341 items.FileNames = []string{}
1342 items.DirNames = []string{}
1343 return
1344 }
1345 }
1346 }
1347
1348 // remove any files that aren't the ones we want to include
1349 writeIndex := 0
1350 for _, fileName := range items.FileNames {
1351 if f.shouldIncludeFile(fileName) {
1352 items.FileNames[writeIndex] = fileName
1353 writeIndex++
1354 }
1355 }
1356 // resize
1357 items.FileNames = items.FileNames[:writeIndex]
1358
1359 writeIndex = 0
1360 for _, dirName := range items.DirNames {
1361 items.DirNames[writeIndex] = dirName
1362 // ignore other dirs that are known to not be inputs to the build process
1363 include := true
1364 for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs {
1365 if dirName == excludedName {
1366 // don't include
1367 include = false
1368 break
1369 }
1370 }
1371 if include {
1372 writeIndex++
1373 }
1374 }
1375 // resize
1376 items.DirNames = items.DirNames[:writeIndex]
1377}
1378
1379func (f *Finder) listDirsAsync(nodes []*pathMap) {
1380 f.threadPool.Run(
1381 func() {
1382 for i := range nodes {
1383 f.listDirSync(nodes[i])
1384 }
1385 },
1386 )
1387}
1388
1389func (f *Finder) listDirAsync(node *pathMap) {
1390 f.threadPool.Run(
1391 func() {
1392 f.listDirSync(node)
1393 },
1394 )
1395}
1396
1397func (f *Finder) listDirSync(dir *pathMap) {
1398 path := dir.path
1399 children, err := f.filesystem.ReadDir(path)
1400
1401 if err != nil {
1402 // possibly record this error
1403 f.onFsError(path, err)
1404 // if listing the contents of the directory fails (presumably due to
1405 // permission denied), then treat the directory as empty
1406 children = nil
1407 }
1408
1409 var subdirs []string
1410 var subfiles []string
1411
1412 for _, child := range children {
1413 linkBits := child.Mode() & os.ModeSymlink
1414 isLink := linkBits != 0
1415 if isLink {
1416 childPath := filepath.Join(path, child.Name())
1417 childStat, err := f.filesystem.Stat(childPath)
1418 if err != nil {
1419 // If stat fails this is probably a broken or dangling symlink, treat it as a file.
1420 subfiles = append(subfiles, child.Name())
1421 } else if childStat.IsDir() {
1422 // Skip symlink dirs if not requested otherwise. Android has a number
1423 // of symlinks creating infinite source trees which would otherwise get
1424 // us in an infinite loop.
1425 // TODO(b/197349722): Revisit this once symlink loops are banned in the
1426 // source tree.
1427 if f.cacheMetadata.Config.FollowSymlinks {
1428 subdirs = append(subdirs, child.Name())
1429 }
1430 } else {
1431 // We do have to support symlink files because the link name might be
1432 // different than the target name
1433 // (for example, Android.bp -> build/soong/root.bp)
1434 subfiles = append(subfiles, child.Name())
1435 }
1436 } else if child.IsDir() {
1437 subdirs = append(subdirs, child.Name())
1438 } else {
1439 subfiles = append(subfiles, child.Name())
1440 }
1441
1442 }
1443 parentNode := dir
1444
1445 entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles}
1446 f.pruneCacheCandidates(entry)
1447
1448 // create a pathMap node for each relevant subdirectory
1449 relevantChildren := map[string]*pathMap{}
1450 for _, subdirName := range entry.DirNames {
1451 childNode, found := parentNode.children[subdirName]
1452 // if we already knew of this directory, then we already have a request pending to Stat it
1453 // if we didn't already know of this directory, then we must Stat it now
1454 if !found {
1455 childNode = parentNode.newChild(subdirName)
1456 f.statDirAsync(childNode)
1457 }
1458 relevantChildren[subdirName] = childNode
1459 }
1460 // Note that in rare cases, it's possible that we're reducing the set of
1461 // children via this statement, if these are all true:
1462 // 1. we previously had a cache that knew about subdirectories of parentNode
1463 // 2. the user created a prune-file (described in pruneCacheCandidates)
1464 // inside <parentNode>, which specifies that the contents of parentNode
1465 // are to be ignored.
1466 // The fact that it's possible to remove children here means that *pathMap structs
1467 // must not be looked up from f.nodes by filepath (and instead must be accessed by
1468 // direct pointer) until after every listDirSync completes
1469 parentNode.FileNames = entry.FileNames
1470 parentNode.children = relevantChildren
1471
1472}
1473
1474// listMatches takes a node and a function that specifies which subdirectories and
1475// files to include, and listMatches returns the matches
1476func (f *Finder) listMatches(node *pathMap,
1477 filter WalkFunc) (subDirs []*pathMap, filePaths []string) {
1478 entries := DirEntries{
1479 FileNames: node.FileNames,
1480 }
1481 entries.DirNames = make([]string, 0, len(node.children))
1482 for childName := range node.children {
1483 entries.DirNames = append(entries.DirNames, childName)
1484 }
1485
1486 dirNames, fileNames := filter(entries)
1487
1488 subDirs = []*pathMap{}
1489 filePaths = make([]string, 0, len(fileNames))
1490 for _, fileName := range fileNames {
1491 filePaths = append(filePaths, joinCleanPaths(node.path, fileName))
1492 }
1493 subDirs = make([]*pathMap, 0, len(dirNames))
1494 for _, childName := range dirNames {
1495 child, ok := node.children[childName]
1496 if ok {
1497 subDirs = append(subDirs, child)
1498 }
1499 }
1500
1501 return subDirs, filePaths
1502}
1503
1504// findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache.
1505func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc,
1506 approxNumThreads int) []string {
1507
1508 if approxNumThreads < 2 {
1509 // Done spawning threads; process remaining directories
1510 return f.findInCacheSinglethreaded(node, filter)
1511 }
1512
1513 totalWork := 0
1514 for _, child := range node.children {
1515 totalWork += child.approximateNumDescendents
1516 }
1517 childrenResults := make(chan []string, len(node.children))
1518
1519 subDirs, filePaths := f.listMatches(node, filter)
1520
1521 // process child directories
1522 for _, child := range subDirs {
1523 numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork
1524 childProcessor := func(child *pathMap) {
1525 childResults := f.findInCacheMultithreaded(child, filter, numChildThreads)
1526 childrenResults <- childResults
1527 }
1528 // If we're allowed to use more than 1 thread to process this directory,
1529 // then instead we use 1 thread for each subdirectory.
1530 // It would be strange to spawn threads for only some subdirectories.
1531 go childProcessor(child)
1532 }
1533
1534 // collect results
1535 for i := 0; i < len(subDirs); i++ {
1536 childResults := <-childrenResults
1537 filePaths = append(filePaths, childResults...)
1538 }
1539 close(childrenResults)
1540
1541 return filePaths
1542}
1543
1544// findInCacheSinglethreaded synchronously searches the cache for all matching file paths
1545// note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive
1546func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string {
1547 if node == nil {
1548 return []string{}
1549 }
1550
1551 nodes := []*pathMap{node}
1552 matches := []string{}
1553
1554 for len(nodes) > 0 {
1555 currentNode := nodes[0]
1556 nodes = nodes[1:]
1557
1558 subDirs, filePaths := f.listMatches(currentNode, filter)
1559
1560 nodes = append(nodes, subDirs...)
1561
1562 matches = append(matches, filePaths...)
1563 }
1564 return matches
1565}