import json

import torch
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)

# Device on which to run the model
# Set to cuda to load on GPU
device = "cpu"
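# (Optional, not part of the original example) pick the GPU automatically when
# one is available instead of hard-coding "cpu":
# device = "cuda" if torch.cuda.is_available() else "cpu"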
# Pick a pretrained model and load the pretrained weights.
# Use torch.hub.list("facebookresearch/pytorchvideo:main") to see all available models.
# Note that each model expects its own input pre-processing (frame count, sampling
# rate, crop size, and normalization).
model_name = "mvit_base_16x4"
model = torch.hub.load("facebookresearch/pytorchvideo", model=model_name, pretrained=True)
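# (Optional) the hub entry-point can also enumerate the available model names,
# as mentioned above; the call hits the network, so it is left commented out:
# print(torch.hub.list("facebookresearch/pytorchvideo:main"))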
# Set to eval mode and move to desired device
model = model.to(device)
model = model.eval()

# kinetics_classnames.json is a local file mapping Kinetics-400 class names to
# integer class ids.
with open("kinetics_classnames.json", "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")
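# (Optional, not in the original example) quick sanity check on the mapping;
# Kinetics-400 has 400 classes, so 400 entries are expected:
# print(len(kinetics_id_to_classname))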
####################
# MViT transform
####################
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 224
num_frames = 16
sampling_rate = 4
frames_per_second = 30

transform = ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x / 255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=side_size),
            CenterCropVideo(crop_size),
        ]
    ),
)
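# A minimal sketch (not part of the original example) of what the transform does,
# using an assumed dummy clip: a (C, T, H, W) uint8 video is subsampled to
# num_frames frames, scaled to [0, 1], normalized, resized so its short side is
# side_size, and center-cropped to crop_size x crop_size.
dummy_clip = {"video": torch.randint(0, 255, (3, 32, 360, 640), dtype=torch.uint8)}
print(transform(dummy_clip)["video"].shape)  # torch.Size([3, 16, 224, 224])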
# The duration of the input clip is also specific to the model.
clip_duration = (num_frames * sampling_rate) / frames_per_second  # 16 * 4 / 30 ~= 2.13 seconds

video_path = "archery.mp4"

# Select the duration of the clip to load by specifying the start and end duration.
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class
video = EncodedVideo.from_path(video_path)

# Load the desired clip
# get_clip returns a dict; the decoded frames live under the "video" key
# as a (C, T, H, W) tensor.
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
video_data = transform(video_data)

# Move the inputs to the desired device
inputs = video_data["video"]
inputs = inputs.to(device)

# Pass the input clip through the model; inputs[None, ...] adds a batch dimension.
preds = model(inputs[None, ...])

# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes[0]]
print("Predicted labels: %s" % ", ".join(pred_class_names))
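# Optionally (not part of the original example), print the top-5 softmax scores
# alongside the labels; preds already holds probabilities after the Softmax above.
top_scores = preds.topk(k=5).values
for name, score in zip(pred_class_names, top_scores[0].tolist()):
    print("%s: %.3f" % (name, score))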