Implementation of the MaxViT architecture described in the paper "MaxViT: Multi-Axis Vision Transformer". The model performs image classification and, by default, returns logits for the 1000 ImageNet classes.
model_maxvit(pretrained = FALSE, progress = TRUE, num_classes = 1000, ...)
Other classification_model:
model_alexnet()
,
model_convnext
,
model_efficientnet
,
model_efficientnet_v2
,
model_inception_v3()
,
model_mobilenet_v2()
,
model_mobilenet_v3
,
model_resnet
,
model_vgg
,
model_vit
if (FALSE) { # \dontrun{
# Example: classify one image with a pretrained MaxViT model and print the
# five most likely ImageNet labels together with their probabilities.
library(magrittr)

# 1. Load the basketball image
img_url <- "https://upload.wikimedia.org/wikipedia/commons/7/7a/Basketball.png"
img <- base_loader(img_url)

# 2. Define normalization (ImageNet)
norm_mean <- c(0.485, 0.456, 0.406)
norm_std <- c(0.229, 0.224, 0.225)

# 3. Preprocess: convert to tensor and resize
resized <- img %>%
  transform_to_tensor() %>%
  transform_resize(c(400, 400))

# 4. Display the image before normalization
# (the un-normalized tensor is shown; the normalized one is what the model sees)
tensor_image_browse(resized)

# 5. Normalize and build the batch
input <- transform_normalize(resized, norm_mean, norm_std)
batch <- input$unsqueeze(1) # Add batch dimension (1, 3, H, W)

# 6. Load MaxViT model
model <- model_maxvit(pretrained = TRUE)
model$eval()

# 7. Run inference
# No gradients are needed for prediction, so skip building the autograd graph.
torch::with_no_grad({
  output <- model(batch)
})
# The model returns logits; softmax over the class dimension turns them into
# probabilities so the values printed below really are percentages.
probs <- torch::nnf_softmax(output, dim = 2)
topk <- probs$topk(k = 5, dim = 2)
indices <- as.integer(topk[[2]][1, ])
scores <- 100 * as.numeric(topk[[1]][1, ])

# 8. Show Top-5 predictions
glue::glue("{seq_along(indices)}. {imagenet_label(indices)} ({round(scores, 2)}%)")
} # }