common : remove hf cache migration (#23266)

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
Adrien Gallouët 2026-05-18 17:11:47 +02:00 committed by GitHub
parent 3a9c1b854d
commit 45b455e66f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 0 additions and 285 deletions

View file

@ -4,7 +4,6 @@
#include "chat.h"
#include "common.h"
#include "download.h"
#include "hf-cache.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
@ -586,12 +585,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// parse the first time to get -hf option (used for remote preset)
parse_cli_args();
// TODO: Remove later
try {
hf_cache::migrate_old_cache_to_hf_cache(params.hf_token, params.offline);
} catch (const std::exception & e) {
LOG_WRN("HF cache migration failed: %s\n", e.what());
}
// export_graph_ops loads only metadata
const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;

View file

@ -11,7 +11,6 @@
#include <filesystem>
#include <fstream>
#include <atomic>
#include <regex> // migration only
#include <string>
#include <string_view>
#include <stdexcept>
@ -336,15 +335,9 @@ hf_files get_repo_files(const std::string & repo_id,
if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) {
file.oid = item["lfs"]["oid"].get<std::string>();
}
if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) {
file.size = item["lfs"]["size"].get<size_t>();
}
} else if (item.contains("oid") && item["oid"].is_string()) {
file.oid = item["oid"].get<std::string>();
}
if (file.size == 0 && item.contains("size") && item["size"].is_number()) {
file.size = item["size"].get<size_t>();
}
if (!file.oid.empty() && !is_valid_oid(file.oid)) {
LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str());
@ -502,271 +495,4 @@ std::string finalize_file(const hf_file & file) {
return file.final_path;
}
// delete everything after this line, one day
// copied from download.cpp without the tag part
// Result of splitting a GGUF file path into its shard components.
// copied from download.cpp without the tag part
//
// A value-initialized instance (index == 0, count == 0) is what
// get_gguf_split_info() returns for a path that does not end in ".gguf".
struct gguf_split_info {
    std::string prefix;     // tag included; path without ".gguf" and without the "-NNNNN-of-NNNNN" shard suffix
    int         index = 0;  // shard number parsed from the name (1 for a non-split file)
    int         count = 0;  // total number of shards (1 for a non-split file)
};
// Decompose a GGUF path into {prefix, shard index, shard count}.
//
// - Paths that do not end in ".gguf" yield a value-initialized result.
// - "<prefix>-NNNNN-of-NNNNN.gguf" yields the parsed index/count and <prefix>.
// - Any other "*.gguf" path is treated as a single file: index 1 of 1, with the
//   prefix being the path minus the ".gguf" extension.
static gguf_split_info get_gguf_split_info(const std::string & path) {
    std::string stem = path;

    if (!string_remove_suffix(stem, ".gguf")) {
        return {};
    }

    static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);

    std::smatch parts;
    if (!std::regex_match(stem, parts, re_split)) {
        // not a shard name: a lone file counts as part 1 of 1
        return {std::move(stem), 1, 1};
    }

    // exactly five digits each, so std::stoi cannot overflow here
    const int shard_index = std::stoi(parts[2].str());
    const int shard_count = std::stoi(parts[3].str());

    return {parts[1].str(), shard_index, shard_count};
}
// Extract the two '='-separated fields that follow "manifest=" in a legacy
// manifest file name of the form "manifest=<a>=<b>=<anything>.json".
// The caller treats the pair as {owner, repo}.
//
// Returns a pair of empty strings when the name does not match that pattern.
// The argument is only read, so it is taken by const reference; this lets
// callers pass const strings and temporaries as well as mutable lvalues.
static std::pair<std::string, std::string> parse_manifest_name(const std::string & filename) {
    static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)");

    std::smatch match;
    if (std::regex_match(filename, match, re)) {
        return {match[1].str(), match[2].str()};
    }
    return {};
}
// Build the flat file name used by the legacy cache for a repo file:
// "<owner>_<repo>_<filename>", with every "/" then replaced by "_".
static std::string make_old_cache_filename(const std::string & owner,
                                           const std::string & repo,
                                           const std::string & filename) {
    std::string flat_name;
    flat_name.reserve(owner.size() + repo.size() + filename.size() + 2);

    flat_name.append(owner).append("_").append(repo).append("_").append(filename);

    // the replacement runs over the whole name, not just the filename part
    string_replace_all(flat_name, "/", "_");
    return flat_name;
}
// One legacy-cache file that passed validation and is queued to be moved.
// Scalar and pointer members carry default initializers so that a
// default-constructed entry never holds indeterminate values.
struct migrate_file {
    std::string     path;            // path of the file inside the repo
    std::string     sha256;          // hash taken from the manifest; may be empty (e.g. non-first split shards)
    size_t          size = 0;        // size reported for the repo file (0 when unknown)
    fs::path        old_path;        // location of the file in the legacy cache
    fs::path        etag_path;       // old_path with ".etag" appended
    const hf_file * file = nullptr;  // non-owning; points into the hf_files list used during collection
};

using migrate_files = std::vector<migrate_file>;
// Validate a single legacy-cache file against the current repo listing and,
// if it is usable, append it to `to_migrate`.
//
// Returns true when the file was queued, or when it is absent from the legacy
// cache but its counterpart already exists at the repo entry's final_path.
// Returns false (after logging a warning) when the file is missing from both
// places, is not listed in `files`, has a sha256 that differs from the repo
// oid, or has a size that differs from the repo size.
//
// Note: fs::exists / fs::file_size are the throwing overloads.
static bool collect_file(const fs::path    & old_cache,
                         const std::string & owner,
                         const std::string & repo,
                         const std::string & path,
                         const std::string & sha256,
                         const hf_files    & files,
                         migrate_files     & to_migrate) {
    // first repo entry whose path matches, if any
    const hf_file * match = nullptr;
    for (auto it = files.begin(); it != files.end() && match == nullptr; ++it) {
        if (it->path == path) {
            match = &*it;
        }
    }

    const std::string legacy_name = make_old_cache_filename(owner, repo, path);
    const fs::path    legacy_path = old_cache / legacy_name;
    const fs::path    etag_path   = legacy_path.string() + ".etag";

    if (!fs::exists(legacy_path)) {
        // nothing to move; acceptable only if the file is already in the new location
        const bool already_present = match && fs::exists(match->final_path);
        if (!already_present) {
            LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, legacy_name.c_str());
        }
        return already_present;
    }

    if (match == nullptr) {
        LOG_WRN("%s: %s not found in current repo\n", __func__, legacy_name.c_str());
        return false;
    }

    // the hash can only be compared when both sides provide one
    const bool can_compare_hash = !sha256.empty() && !match->oid.empty();
    if (can_compare_hash && sha256 != match->oid) {
        LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, legacy_name.c_str());
        return false;
    }

    // a repo size of 0 means "unknown": skip the size check
    if (match->size > 0) {
        const size_t on_disk = fs::file_size(legacy_path);
        if (on_disk != match->size) {
            LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, legacy_name.c_str(), on_disk, match->size);
            return false;
        }
    }

    to_migrate.push_back({path, sha256, match->size, legacy_path, etag_path, match});
    return true;
}
// Queue every file referenced by one manifest node (e.g. "ggufFile").
//
// Nodes lacking "rfilename" or "lfs.sha256" are ignored (returns true).
// A non-split file is validated on its own; for a split GGUF, every shard in
// `files` sharing the same prefix and shard count is validated, and the whole
// set is rejected if the number of shards found differs from the expected count
// or any shard fails validation.
static bool collect_files(const fs::path    & old_cache,
                          const std::string & owner,
                          const std::string & repo,
                          const nl::json    & node,
                          const hf_files    & files,
                          migrate_files     & to_migrate) {
    const bool has_required_fields = node.contains("rfilename") &&
                                     node.contains("lfs") &&
                                     node["lfs"].contains("sha256");
    if (!has_required_fields) {
        return true;
    }

    std::string path   = node["rfilename"];
    std::string sha256 = node["lfs"]["sha256"];

    const auto wanted = get_gguf_split_info(path);
    if (wanted.count <= 1) {
        return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate);
    }

    // (repo path, sha256) for each shard belonging to the same split set
    std::vector<std::pair<std::string, std::string>> shards;
    for (const auto & candidate : files) {
        const auto info = get_gguf_split_info(candidate.path);
        if (info.count != wanted.count || info.prefix != wanted.prefix) {
            continue;
        }
        // sadly the manifest only provides the sha256 of the first file (index == 1)
        // the rest will be verified using the size...
        shards.emplace_back(candidate.path, info.index == 1 ? sha256 : std::string());
    }

    const int n_found = static_cast<int>(shards.size());
    if (n_found != wanted.count) {
        LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, wanted.count, n_found);
        return false;
    }

    for (const auto & shard : shards) {
        if (!collect_file(old_cache, owner, repo, shard.first, shard.second, files, to_migrate)) {
            return false;
        }
    }
    return true;
}
static bool migrate_file(const migrate_file & file) {
std::error_code ec;
fs::path new_path(file.file->local_path);
fs::create_directories(new_path.parent_path(), ec);
if (!fs::exists(new_path, ec)) {
fs::rename(file.old_path, new_path, ec);
if (ec) {
fs::copy_file(file.old_path, new_path, ec);
if (ec) {
LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str());
return false;
}
}
fs::remove(file.old_path, ec);
}
fs::remove(file.etag_path, ec);
std::string filename = finalize_file(*file.file);
LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str());
return true;
}
void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) {
fs::path old_cache = fs_get_cache_directory();
if (!fs::exists(old_cache)) {
return;
}
if (offline) {
LOG_WRN("%s: skipping migration in offline mode (will run when online)\n", __func__);
return; // -hf is not going to work
}
bool warned = false;
for (const auto & entry : fs::directory_iterator(old_cache)) {
if (!entry.is_regular_file()) {
continue;
}
auto filename = entry.path().filename().string();
auto [owner, repo] = parse_manifest_name(filename);
if (owner.empty() || repo.empty()) {
continue;
}
if (!warned) {
warned = true;
LOG_WRN("================================================================================\n"
"WARNING: Migrating cache to HuggingFace cache directory\n"
" Old cache: %s\n"
" New cache: %s\n"
"This one-time migration moves models previously downloaded with -hf\n"
"from the legacy llama.cpp cache to the standard HuggingFace cache.\n"
"Models downloaded with --model-url are not affected.\n"
"================================================================================\n",
old_cache.string().c_str(), get_cache_directory().string().c_str());
}
auto repo_id = owner + "/" + repo;
auto files = get_repo_files(repo_id, token);
if (files.empty()) {
LOG_WRN("%s: could not get repo files for %s, skipping\n", __func__, repo_id.c_str());
continue;
}
migrate_files to_migrate;
bool ok = true;
try {
std::ifstream manifest(entry.path());
auto json = nl::json::parse(manifest);
for (const char * key : {"ggufFile", "mmprojFile"}) {
if (json.contains(key)) {
if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) {
ok = false;
break;
}
}
}
} catch (const std::exception & e) {
LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what());
continue;
}
if (!ok) {
LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__);
continue;
}
for (const auto & file : to_migrate) {
if (!migrate_file(file)) {
ok = false;
break;
}
}
if (!ok) {
LOG_WRN("%s: migration failed: could not migrate all files\n", __func__);
continue;
}
LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str());
fs::remove(entry.path());
}
}
} // namespace hf_cache

View file

@ -14,7 +14,6 @@ struct hf_file {
std::string final_path;
std::string oid;
std::string repo_id;
size_t size = 0; // only for the migration
};
using hf_files = std::vector<hf_file>;
@ -30,7 +29,4 @@ hf_files get_cached_files(const std::string & repo_id = {});
// Create snapshot path (link or move/copy) and return it
std::string finalize_file(const hf_file & file);
// TODO: Remove later
void migrate_old_cache_to_hf_cache(const std::string & token, bool offline = false);
} // namespace hf_cache