Skip to content

Commit

Permalink
Changes to enable download of snapshot in validator.
Browse files Browse the repository at this point in the history
  • Loading branch information
dom96 committed Oct 7, 2024
1 parent 2178684 commit 290e49d
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 45 deletions.
43 changes: 11 additions & 32 deletions src/pyodide/internal/snapshot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -208,27 +208,10 @@ function recordDsoHandles(Module: Module): DylinkInfo {
// the linear memory snapshot has them already initialized.
// Can get this list by starting Python and filtering sys.modules for modules
// whose importer is not FrozenImporter or BuiltinImporter.
const SNAPSHOT_IMPORTS = [
'_pyodide.docstring',
'_pyodide._core_docs',
'traceback',
'collections.abc',
// Asyncio is the really slow one here. In native Python on my machine, `import asyncio` takes ~50
// ms.
'asyncio',
'inspect',
'tarfile',
'importlib.metadata',
're',
'shutil',
'sysconfig',
'importlib.machinery',
'pathlib',
'site',
'tempfile',
'typing',
'zipfile',
];
//
const SNAPSHOT_IMPORTS: string[] =
// @ts-ignore getSnapshotImports is a static method.
ArtifactBundler.constructor.getSnapshotImports();

/**
* Python modules do a lot of work the first time they are imported. The memory
Expand Down Expand Up @@ -276,17 +259,13 @@ function memorySnapshotDoImports(Module: Module): Array<string> {
MetadataReader.getNames()
);
const SNAPSHOT_IMPORTS_SET = new Set(SNAPSHOT_IMPORTS);
const importedModules: Array<string> = ArtifactBundler.constructor
// @ts-ignore parsePythonScriptImports is a static method.
.parsePythonScriptImports(MetadataReader.getWorkerFiles('py'))
.filter((module: string) => {
const moduleFilename = module.replace('.', '/') + '.py';
return (
!localModulePaths.has(moduleFilename) &&
module != 'js' &&
!SNAPSHOT_IMPORTS_SET.has(module)
);
});
// Gather every module imported by the worker's Python files and let the native
// helper filter that list. `filterPythonScriptImportsJs` takes two arguments,
// `(locals, imports)`: the names of the worker's own modules, followed by the
// parsed import list.
const importedModules: Array<string> =
  // @ts-ignore filterPythonScriptImportsJs is a static method.
  ArtifactBundler.constructor.filterPythonScriptImportsJs(
    // NOTE(review): assumes getNames() returns a plain array of module file
    // names (e.g. `foo/bar.py`) — confirm against MetadataReader.
    MetadataReader.getNames(),
    ArtifactBundler.constructor
      // @ts-ignore parsePythonScriptImports is a static method.
      .parsePythonScriptImports(MetadataReader.getWorkerFiles('py'))
  );

const deduplicatedModules = [...new Set(importedModules)];

Expand Down
73 changes: 68 additions & 5 deletions src/workerd/api/pyodide/pyodide.c++
Original file line number Diff line number Diff line change
Expand Up @@ -300,13 +300,76 @@ kj::Array<kj::String> ArtifactBundler::parsePythonScriptImports(kj::Array<kj::St
}
}

// XXX: jsg doesn't support kj::Vector return types, so this seems to be the only way to do this.
auto builder = kj::heapArrayBuilder<kj::String>(result.size());
for (auto i = 0; i < result.size(); i++) {
builder.add(kj::mv(result[i]));
return result.releaseAsArray();
}

// Maps a dotted import name to the module file name it would be loaded from,
// e.g. "foo.bar" -> "foo/bar.py".
//
// Every '.' in `pkgImport` is replaced with '/', then ".py" is appended. The result is always
// exactly `pkgImport.size() + 3` characters long.
kj::String importToModuleFilename(kj::StringPtr pkgImport) {
  const kj::StringPtr suffix = ".py"_kj;
  const size_t stemSize = pkgImport.size();

  // Allocate the final length up front; both loops below together write every character of it.
  auto result = kj::heapString(stemSize + suffix.size());

  for (size_t i = 0; i < stemSize; i++) {
    result[i] = pkgImport[i] == '.' ? '/' : pkgImport[i];
  }

  // Append the extension after the stem. Each suffix character gets its own slot, so the '.' is
  // not overwritten and the final character is not left unset.
  for (size_t i = 0; i < suffix.size(); i++) {
    result[stemSize + i] = suffix[i];
  }

  return result;
}

// Reduces `imports` to the modules that still need to be imported.
//
// An import is dropped when any of the following holds:
//   - it was already accepted earlier in the list (duplicate),
//   - its module file name (see importToModuleFilename) is present in `locals`,
//   - it is the `js` module,
//   - it is one of the names returned by getSnapshotImports().
//
// The surviving names are returned in the iteration order of the intermediate set.
kj::Array<kj::String> ArtifactBundler::filterPythonScriptImports(
    kj::HashSet<kj::String> locals, kj::Array<kj::String> imports) {
  // Lookup set of the names returned by getSnapshotImports().
  kj::HashSet<kj::StringPtr> baseline;
  for (auto& name: ArtifactBundler::getSnapshotImports()) {
    baseline.insert(kj::mv(name));
  }

  kj::HashSet<kj::String> accepted;
  for (auto& candidate: imports) {
    if (accepted.contains(candidate)) [[unlikely]] {
      // Already accepted once; inserting again would be a duplicate.
      continue;
    }

    auto candidateFile = importToModuleFilename(candidate);
    const bool isLocal = locals.contains(candidateFile);
    const bool isJs = candidate == "js";
    if (isLocal || isJs || baseline.contains(candidate)) {
      continue;
    }

    accepted.insert(kj::mv(candidate));
  }

  // Move the accepted names out of the set into the array that is handed back to the caller.
  auto out = kj::heapArrayBuilder<kj::String>(accepted.size());
  for (auto& name: accepted) {
    out.add(kj::mv(name));
  }
  return out.finish();
}

// Array-based wrapper around filterPythonScriptImports: collects `locals` into a hash set and
// forwards both arguments. This is the variant registered with JSG_STATIC_METHOD.
kj::Array<kj::String> ArtifactBundler::filterPythonScriptImportsJs(
    kj::Array<kj::String> locals, kj::Array<kj::String> imports) {
  kj::HashSet<kj::String> localPaths;
  for (kj::String& path: locals) {
    localPaths.insert(kj::mv(path));
  }

  return filterPythonScriptImports(kj::mv(localPaths), kj::mv(imports));
}

// Returns the fixed list of Python module names that make up the snapshot import list.
// filterPythonScriptImports uses this list to drop imports that are already covered.
kj::Array<kj::StringPtr> ArtifactBundler::getSnapshotImports() {
  const kj::StringPtr names[] = {
    "_pyodide.docstring"_kj,
    "_pyodide._core_docs"_kj,
    "traceback"_kj,
    "collections.abc"_kj,
    // Asyncio is the really slow one here. In native Python on my machine, `import asyncio`
    // takes ~50 ms.
    "asyncio"_kj,
    "inspect"_kj,
    "tarfile"_kj,
    "importlib.metadata"_kj,
    "re"_kj,
    "shutil"_kj,
    "sysconfig"_kj,
    "importlib.machinery"_kj,
    "pathlib"_kj,
    "site"_kj,
    "tempfile"_kj,
    "typing"_kj,
    "zipfile"_kj,
  };

  // Copy the literals, in declaration order, into a heap array the caller owns.
  auto out = kj::heapArrayBuilder<kj::StringPtr>(kj::size(names));
  for (const auto& name: names) {
    out.add(name);
  }
  return out.finish();
}

jsg::Ref<PyodideMetadataReader> makePyodideMetadataReader(
Expand Down
20 changes: 12 additions & 8 deletions src/workerd/api/pyodide/pyodide.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,16 +224,11 @@ class ArtifactBundler: public jsg::Object {
kj::Maybe<MemorySnapshotResult> storedSnapshot;

ArtifactBundler(kj::Maybe<const PyodidePackageManager&> packageManager,
kj::Maybe<kj::Array<kj::byte>> existingSnapshot)
kj::Maybe<kj::Array<const kj::byte>> existingSnapshot,
bool isValidating = false)
: packageManager(packageManager),
storedSnapshot(kj::none),
existingSnapshot(kj::mv(existingSnapshot)),
isValidating(false) {};

ArtifactBundler(bool isValidating = false)
: packageManager(kj::none),
storedSnapshot(kj::none),
existingSnapshot(kj::none),
isValidating(isValidating) {};

void storeMemorySnapshot(jsg::Lock& js, MemorySnapshotResult snapshot) {
Expand Down Expand Up @@ -306,6 +301,13 @@ class ArtifactBundler: public jsg::Object {
//
// Package relative imports are ignored.
static kj::Array<kj::String> parsePythonScriptImports(kj::Array<kj::String> files);
// Filters a list of imported module names: drops duplicates, the `js` module, imports that
// resolve to one of the `locals` module files, and imports already in the snapshot import list.
static kj::Array<kj::String> filterPythonScriptImports(
kj::HashSet<kj::String> locals, kj::Array<kj::String> imports);
static kj::Array<kj::String> filterPythonScriptImportsJs(
kj::Array<kj::String> locals, kj::Array<kj::String> imports);
static kj::Array<kj::StringPtr> getSnapshotImports();

JSG_RESOURCE_TYPE(ArtifactBundler) {
JSG_METHOD(hasMemorySnapshot);
Expand All @@ -317,12 +319,14 @@ class ArtifactBundler: public jsg::Object {
JSG_METHOD(isEnabled);
JSG_METHOD(getPackage);
JSG_STATIC_METHOD(parsePythonScriptImports);
JSG_STATIC_METHOD(filterPythonScriptImportsJs);
JSG_STATIC_METHOD(getSnapshotImports);
}

private:
// A memory snapshot of the state of the Python interpreter after initialisation. Used to speed
// up cold starts.
kj::Maybe<kj::Array<kj::byte>> existingSnapshot;
kj::Maybe<kj::Array<const kj::byte>> existingSnapshot;
bool isValidating;
};

Expand Down

0 comments on commit 290e49d

Please sign in to comment.