verify.js 6.57 KB
'use strict'

const util = require('util')

const pMap = require('p-map')
const contentPath = require('./content/path')
const fixOwner = require('./util/fix-owner')
const fs = require('@npmcli/fs')
const fsm = require('fs-minipass')
const glob = util.promisify(require('glob'))
const index = require('./entry-index')
const path = require('path')
const rimraf = util.promisify(require('rimraf'))
const ssri = require('ssri')

const globify = pattern => pattern.split('\\').join('/')

const hasOwnProperty = (obj, key) =>
  Object.prototype.hasOwnProperty.call(obj, key)

const verifyOpts = (opts) => ({
  concurrency: 20,
  log: { silly () {} },
  ...opts,
})

module.exports = verify

async function verify (cache, opts) {
  opts = verifyOpts(opts)
  opts.log.silly('verify', 'verifying cache at', cache)

  const steps = [
    markStartTime,
    fixPerms,
    garbageCollect,
    rebuildIndex,
    cleanTmp,
    writeVerifile,
    markEndTime,
  ]

  const stats = {}
  for (const step of steps) {
    const label = step.name
    const start = new Date()
    const s = await step(cache, opts)
    if (s) {
      Object.keys(s).forEach((k) => {
        stats[k] = s[k]
      })
    }
    const end = new Date()
    if (!stats.runTime) {
      stats.runTime = {}
    }
    stats.runTime[label] = end - start
  }
  stats.runTime.total = stats.endTime - stats.startTime
  opts.log.silly(
    'verify',
    'verification finished for',
    cache,
    'in',
    `${stats.runTime.total}ms`
  )
  return stats
}

async function markStartTime (cache, opts) {
  return { startTime: new Date() }
}

async function markEndTime (cache, opts) {
  return { endTime: new Date() }
}

async function fixPerms (cache, opts) {
  opts.log.silly('verify', 'fixing cache permissions')
  await fixOwner.mkdirfix(cache, cache)
  // TODO - fix file permissions too
  await fixOwner.chownr(cache, cache)
  return null
}

// Implements a naive mark-and-sweep tracing garbage collector.
//
// The algorithm is basically as follows:
// 1. Read (and filter) all index entries ("pointers")
// 2. Mark each integrity value as "live"
// 3. Read entire filesystem tree in `content-vX/` dir
// 4. If content is live, verify its checksum and delete it if it fails
// 5. If content is not marked as live, rimraf it.
//
async function garbageCollect (cache, opts) {
  opts.log.silly('verify', 'garbage collecting content')
  const indexStream = index.lsStream(cache)
  const liveContent = new Set()
  indexStream.on('data', (entry) => {
    if (opts.filter && !opts.filter(entry)) {
      return
    }

    liveContent.add(entry.integrity.toString())
  })
  await new Promise((resolve, reject) => {
    indexStream.on('end', resolve).on('error', reject)
  })
  const contentDir = contentPath.contentDir(cache)
  const files = await glob(globify(path.join(contentDir, '**')), {
    follow: false,
    nodir: true,
    nosort: true,
  })
  const stats = {
    verifiedContent: 0,
    reclaimedCount: 0,
    reclaimedSize: 0,
    badContentCount: 0,
    keptSize: 0,
  }
  await pMap(
    files,
    async (f) => {
      const split = f.split(/[/\\]/)
      const digest = split.slice(split.length - 3).join('')
      const algo = split[split.length - 4]
      const integrity = ssri.fromHex(digest, algo)
      if (liveContent.has(integrity.toString())) {
        const info = await verifyContent(f, integrity)
        if (!info.valid) {
          stats.reclaimedCount++
          stats.badContentCount++
          stats.reclaimedSize += info.size
        } else {
          stats.verifiedContent++
          stats.keptSize += info.size
        }
      } else {
        // No entries refer to this content. We can delete.
        stats.reclaimedCount++
        const s = await fs.stat(f)
        await rimraf(f)
        stats.reclaimedSize += s.size
      }
      return stats
    },
    { concurrency: opts.concurrency }
  )
  return stats
}

async function verifyContent (filepath, sri) {
  const contentInfo = {}
  try {
    const { size } = await fs.stat(filepath)
    contentInfo.size = size
    contentInfo.valid = true
    await ssri.checkStream(new fsm.ReadStream(filepath), sri)
  } catch (err) {
    if (err.code === 'ENOENT') {
      return { size: 0, valid: false }
    }
    if (err.code !== 'EINTEGRITY') {
      throw err
    }

    await rimraf(filepath)
    contentInfo.valid = false
  }
  return contentInfo
}

async function rebuildIndex (cache, opts) {
  opts.log.silly('verify', 'rebuilding index')
  const entries = await index.ls(cache)
  const stats = {
    missingContent: 0,
    rejectedEntries: 0,
    totalEntries: 0,
  }
  const buckets = {}
  for (const k in entries) {
    /* istanbul ignore else */
    if (hasOwnProperty(entries, k)) {
      const hashed = index.hashKey(k)
      const entry = entries[k]
      const excluded = opts.filter && !opts.filter(entry)
      excluded && stats.rejectedEntries++
      if (buckets[hashed] && !excluded) {
        buckets[hashed].push(entry)
      } else if (buckets[hashed] && excluded) {
        // skip
      } else if (excluded) {
        buckets[hashed] = []
        buckets[hashed]._path = index.bucketPath(cache, k)
      } else {
        buckets[hashed] = [entry]
        buckets[hashed]._path = index.bucketPath(cache, k)
      }
    }
  }
  await pMap(
    Object.keys(buckets),
    (key) => {
      return rebuildBucket(cache, buckets[key], stats, opts)
    },
    { concurrency: opts.concurrency }
  )
  return stats
}

async function rebuildBucket (cache, bucket, stats, opts) {
  await fs.truncate(bucket._path)
  // This needs to be serialized because cacache explicitly
  // lets very racy bucket conflicts clobber each other.
  for (const entry of bucket) {
    const content = contentPath(cache, entry.integrity)
    try {
      await fs.stat(content)
      await index.insert(cache, entry.key, entry.integrity, {
        metadata: entry.metadata,
        size: entry.size,
      })
      stats.totalEntries++
    } catch (err) {
      if (err.code === 'ENOENT') {
        stats.rejectedEntries++
        stats.missingContent++
      } else {
        throw err
      }
    }
  }
}

function cleanTmp (cache, opts) {
  opts.log.silly('verify', 'cleaning tmp directory')
  return rimraf(path.join(cache, 'tmp'))
}

function writeVerifile (cache, opts) {
  const verifile = path.join(cache, '_lastverified')
  opts.log.silly('verify', 'writing verifile to ' + verifile)
  try {
    return fs.writeFile(verifile, `${Date.now()}`)
  } finally {
    fixOwner.chownr.sync(cache, verifile)
  }
}

module.exports.lastRun = lastRun

async function lastRun (cache) {
  const data = await fs.readFile(path.join(cache, '_lastverified'), { encoding: 'utf8' })
  return new Date(+data)
}