{"id":24636,"date":"2025-11-13T17:08:04","date_gmt":"2025-11-14T01:08:04","guid":{"rendered":"https:\/\/embeddedvisionsummit.com\/2026\/?page_id=24636"},"modified":"2026-05-07T07:45:48","modified_gmt":"2026-05-07T14:45:48","slug":"vlm-training","status":"publish","type":"page","link":"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/","title":{"rendered":"Vision-Language Model Training"},"content":{"rendered":"\t\t<div data-elementor-type=\"wp-page\" data-elementor-id=\"24636\" class=\"elementor elementor-24636\" data-elementor-post-type=\"page\">\n\t\t\t\t\t\t<section class=\"elementor-section elementor-top-section elementor-element elementor-element-98ce238 elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"98ce238\" data-element_type=\"section\" data-e-type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-100 elementor-top-column elementor-element elementor-element-22dd413\" data-id=\"22dd413\" data-element_type=\"column\" data-e-type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-0ca9ecd elementor-widget elementor-widget-heading\" data-id=\"0ca9ecd\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">We're excited to offer <u>two<\/u> half-day in-person vision-language model (VLM) trainings!<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-9a2ed0f elementor-widget elementor-widget-heading\" data-id=\"9a2ed0f\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">Note these trainings will be held at Cadence, 2655 Seely Ave, San Jose, CA 95131, about a 15-minute drive from the Santa Clara Convention Center.<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-e4ad11e elementor-widget-divider--view-line elementor-widget elementor-widget-divider\" data-id=\"e4ad11e\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"divider.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<div class=\"elementor-divider\">\n\t\t\t<span class=\"elementor-divider-separator\">\n\t\t\t\t\t\t<\/span>\n\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<section class=\"elementor-section elementor-top-section elementor-element elementor-element-fb7118d elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"fb7118d\" data-element_type=\"section\" data-e-type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-100 elementor-top-column elementor-element elementor-element-aa468a1\" data-id=\"aa468a1\" data-element_type=\"column\" data-e-type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<section class=\"elementor-section elementor-inner-section elementor-element elementor-element-edd04d2 elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"edd04d2\" data-element_type=\"section\" data-e-type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-inner-column elementor-element elementor-element-2cf94ae\" data-id=\"2cf94ae\" data-element_type=\"column\" data-e-type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-ee2ca01 elementor-widget elementor-widget-heading\" data-id=\"ee2ca01\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\"><i>Intro Course<\/i><br>Vision-Language Models for Computer Vision Applications: A Hands-On Introduction<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-20bfca3 elementor-widget elementor-widget-heading\" data-id=\"20bfca3\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h3 class=\"elementor-heading-title elementor-size-default\">Wednesday, May 13, 2026, 9 am - noon <\/h3>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-inner-column elementor-element elementor-element-09e53a0\" data-id=\"09e53a0\" data-element_type=\"column\" data-e-type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-eeead92 elementor-widget elementor-widget-heading\" data-id=\"eeead92\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\"><i>Advanced Course<\/i><br>Vision-Language Models for Video Understanding and Agentic AI<br><\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-9839399 elementor-widget elementor-widget-heading\" data-id=\"9839399\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h3 class=\"elementor-heading-title elementor-size-default\">Wednesday, May 13, 2026, 1:30 - 5:00 pm<\/h3>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<section class=\"elementor-section elementor-top-section elementor-element elementor-element-df144d9 elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"df144d9\" data-element_type=\"section\" data-e-type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-100 elementor-top-column elementor-element elementor-element-4532568\" data-id=\"4532568\" data-element_type=\"column\" data-e-type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-667fadc elementor-widget elementor-widget-text-editor\" data-id=\"667fadc\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"text-editor.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<p>Presented by:\u00a0<\/p>\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<section class=\"elementor-section elementor-inner-section elementor-element elementor-element-290b948 elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"290b948\" data-element_type=\"section\" data-e-type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-inner-column elementor-element elementor-element-4f9daec\" data-id=\"4f9daec\" data-element_type=\"column\" data-e-type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-712f4df elementor-widget elementor-widget-image\" data-id=\"712f4df\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"300\" height=\"72\" src=\"https:\/\/embeddedvisionsummit.com\/2026\/wp-content\/uploads\/sites\/16\/2020\/02\/EdgeAIVisionAlliance_logo_lg-e1741048220508-300x72.png\" class=\"attachment-medium size-medium wp-image-3515\" alt=\"\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-inner-column elementor-element elementor-element-dc63d8e\" data-id=\"dc63d8e\" data-element_type=\"column\" data-e-type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-b6084d4 elementor-widget elementor-widget-image\" data-id=\"b6084d4\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"image.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<img decoding=\"async\" width=\"300\" height=\"150\" src=\"https:\/\/embeddedvisionsummit.com\/2026\/wp-content\/uploads\/sites\/16\/2024\/04\/logo_OpenCV-300x150.png\" class=\"attachment-medium size-medium wp-image-23103\" alt=\"\" \/>\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<div class=\"elementor-element elementor-element-1b153d3 elementor-widget-divider--view-line elementor-widget elementor-widget-divider\" data-id=\"1b153d3\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"divider.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<div class=\"elementor-divider\">\n\t\t\t<span class=\"elementor-divider-separator\">\n\t\t\t\t\t\t<\/span>\n\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-ea9d0d3 elementor-widget elementor-widget-heading\" data-id=\"ea9d0d3\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"heading.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t<h2 class=\"elementor-heading-title elementor-size-default\">See below for course details (click either Intro or Advanced tab).<\/h2>\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-d077e45 elementor-align-center elementor-widget elementor-widget-button\" data-id=\"d077e45\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"button.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t\t\t<div class=\"elementor-button-wrapper\">\n\t\t\t\t\t<a class=\"elementor-button elementor-button-link elementor-size-sm\" href=\"https:\/\/edge-ai-vision.swoogo.com\/2026EVS\/begin?reg_type_id=982078\">\n\t\t\t\t\t\t<span class=\"elementor-button-content-wrapper\">\n\t\t\t\t\t\t\t\t\t<span class=\"elementor-button-text\">Register Today<\/span>\n\t\t\t\t\t<\/span>\n\t\t\t\t\t<\/a>\n\t\t\t\t<\/div>\n\t\t\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-6b104fc elementor-tabs-view-horizontal elementor-widget elementor-widget-tabs\" data-id=\"6b104fc\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"tabs.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<div class=\"elementor-tabs\">\n\t\t\t<div class=\"elementor-tabs-wrapper\" role=\"tablist\" >\n\t\t\t\t\t\t\t\t\t<div id=\"elementor-tab-title-1121\" class=\"elementor-tab-title elementor-tab-desktop-title\" aria-selected=\"true\" data-tab=\"1\" role=\"tab\" tabindex=\"0\" aria-controls=\"elementor-tab-content-1121\" aria-expanded=\"false\">Intro Course Description<\/div>\n\t\t\t\t\t\t\t\t\t<div id=\"elementor-tab-title-1122\" class=\"elementor-tab-title elementor-tab-desktop-title\" aria-selected=\"false\" data-tab=\"2\" role=\"tab\" tabindex=\"-1\" aria-controls=\"elementor-tab-content-1122\" aria-expanded=\"false\">Advanced Course Description<\/div>\n\t\t\t\t\t\t\t<\/div>\n\t\t\t<div class=\"elementor-tabs-content-wrapper\" role=\"tablist\" aria-orientation=\"vertical\">\n\t\t\t\t\t\t\t\t\t<div class=\"elementor-tab-title elementor-tab-mobile-title\" aria-selected=\"true\" data-tab=\"1\" role=\"tab\" tabindex=\"0\" aria-controls=\"elementor-tab-content-1121\" aria-expanded=\"false\">Intro Course Description<\/div>\n\t\t\t\t\t<div id=\"elementor-tab-content-1121\" class=\"elementor-tab-content elementor-clearfix\" data-tab=\"1\" role=\"tabpanel\" aria-labelledby=\"elementor-tab-title-1121\" tabindex=\"0\" hidden=\"false\"><section class=\"elementor-section elementor-top-section elementor-element elementor-element-df144d9 elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"df144d9\" data-element_type=\"section\"><div class=\"elementor-container elementor-column-gap-default\"><div class=\"elementor-column elementor-col-100 elementor-top-column elementor-element elementor-element-4532568\" data-id=\"4532568\" data-element_type=\"column\"><div class=\"elementor-widget-wrap elementor-element-populated\"><div class=\"elementor-element elementor-element-0ba8ff3 elementor-widget elementor-widget-text-editor\" data-id=\"0ba8ff3\" data-element_type=\"widget\" data-widget_type=\"text-editor.default\"><div class=\"elementor-widget-container\"><h2 data-elementor-setting-key=\"title\" data-pen-placeholder=\"Type Here...\"><span style=\"font-family: Jost, sans-serif; font-size: 30px; font-weight: 600; letter-spacing: 0px;\">Vision-Language Models for Computer Vision Applications: A Hands-On Introduction<\/span><\/h2><p>(Looking for the Advanced course?\u00a0 Click the tab &#8220;Advanced Course Description&#8221; above!)<\/p><p>Are you an engineer, developer or engineering manager eager to harness the power of generative AI for cutting-edge computer vision applications? Join us for an intensive three-hour training session designed to introduce the latest techniques in vision-language models (VLMs) and their integration with traditional computer vision methods. With a focus on the practical application of these techniques for real-world use cases, this course is tailored for professionals looking to expand their skill set in AI-driven computer vision, particularly in systems designed for deployment at the edge.<\/p><p><b>What You\u2019ll Learn<\/b><\/p><p><b>Introduction to VLMs and LLM+Computer Vision Techniques with Phil Lapsley, Vice President, Edge AI and Vision Alliance:<\/b><\/p><ul><li>We\u2019ll start with an overview of vision-language models and how they differ from conventional convolutional neural networks. We\u2019ll then discuss the advantages and potential drawbacks of integrating LLMs and VLMs with computer vision and explore real-world applications that benefit from these advanced techniques.<\/li><\/ul><p><b>Technical Deep Dive with Dr. Satya Mallick, CEO of OpenCV:<\/b><\/p><ul><li><b>Gain insights<\/b> into the basics of VLMs, including embeddings, CLIP and how different modalities (text, vision) are encoded. Learn about the types of training data required and the loss functions used in these models. This segment will provide the necessary background to tackle the practical examples that follow.<\/li><li><b>First Hands-On Example: Zero-Shot Image Classification.<\/b>\u00a0Our first practical example will be image classification with CLIP for zero-shot learning. You\u2019ll build an image classifier capable of recognizing a wide array of images without prior training. Discover how CLIP\u2019s zero-shot classification can be deployed on mobile devices and learn how to fine-tune the model for enhanced performance on specific datasets.<\/li><li><b>Second Hands-On Example: VLM with Agnostic Object Detector.<\/b>\u00a0We\u2019ll develop a VLM-based visual AI system that identifies objects and reasons about them using pre-existing world knowledge. We\u2019ll accomplish this by using a CNN-based class-agnostic object detector and integrating it with a VLM to answer complex questions about detected objects.<\/li><\/ul><p><b>Who Should Attend<br \/><\/b><span style=\"font-size: 17.3056px;\">This training is ideal for engineers, developers, engineering managers and CTOs with a basic understanding of Python, Jupyter Notebook and computer vision concepts. Whether you\u2019re working in mobile development, embedded systems or cloud applications, this course will provide you with the tools and knowledge to implement sophisticated AI solutions in your projects.<\/span><\/p><p><b>To make the most out of this training, you should have:<\/b><\/p><ul><li>Working knowledge of Python<\/li><li>Basic familiarity with Jupyter Notebook or Google Colab<\/li><li>Basic familiarity with computer vision; familiarity with OpenCV and PyTorch is helpful but not required<\/li><\/ul><p><b>Why Attend?<br \/><\/b><span style=\"font-size: 17.3056px;\">The field of generative AI and multimodal LLMs is moving at a truly breakneck pace. This course offers a great way to keep up with this rapidly evolving technical landscape. In particular, it provides a unique blend of foundational knowledge and practical applications, ensuring you leave with actionable skills and access to sample code for continued learning.<\/span><\/p><p><b><a href=\"https:\/\/edge-ai-vision.swoogo.com\/2026EVS\/begin?reg_type_id=982078\">Register Today<\/a><br \/><\/b><span style=\"font-size: 17.3056px;\">Registration is $495. Don\u2019t miss this opportunity to enhance your skills and stay at the forefront of computer vision technology. Register today to secure your spot in this transformative training session. (You can save $50 if you also register for the Embedded Vision Summit!)<\/span><\/p><p><strong>Important:<\/strong> Please note that this training will be held at <span style=\"color: #c45656;\"><strong>Cadence, 2655 Seely Ave, San Jose, CA 95131,<\/strong><\/span> about a 15-minute drive from the Santa Clara Convention Center.<\/p><\/div><\/div><\/div><\/div><\/div><\/section><section class=\"elementor-section elementor-top-section elementor-element elementor-element-6254ded elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"6254ded\" data-element_type=\"section\"><div class=\"elementor-container elementor-column-gap-default\"><div class=\"elementor-column elementor-col-100 elementor-top-column elementor-element elementor-element-61e8845\" data-id=\"61e8845\" data-element_type=\"column\"><div class=\"elementor-widget-wrap elementor-element-populated\"><div class=\"elementor-element elementor-element-20676b7 elementor-widget elementor-widget-spacer\" data-id=\"20676b7\" data-element_type=\"widget\" data-widget_type=\"spacer.default\"><div class=\"elementor-widget-container\"><div class=\"elementor-spacer\">\u00a0<\/div><\/div><\/div><\/div><\/div><\/div><\/section><\/div>\n\t\t\t\t\t\t\t\t\t<div class=\"elementor-tab-title elementor-tab-mobile-title\" aria-selected=\"false\" data-tab=\"2\" role=\"tab\" tabindex=\"-1\" aria-controls=\"elementor-tab-content-1122\" aria-expanded=\"false\">Advanced Course Description<\/div>\n\t\t\t\t\t<div id=\"elementor-tab-content-1122\" class=\"elementor-tab-content elementor-clearfix\" data-tab=\"2\" role=\"tabpanel\" aria-labelledby=\"elementor-tab-title-1122\" tabindex=\"0\" hidden=\"hidden\"><h2 data-elementor-setting-key=\"title\" data-pen-placeholder=\"Type Here...\">Vision-Language Models for Video Understanding and Agentic AI<\/h2><div><p style=\"font-weight: 400; font-style: normal; font-size: 17.3056px; font-family: Roboto, Helvetica, Arial, sans-serif;\">(Looking for the Intro course?\u00a0 Click the tab &#8220;Intro Course Description&#8221; above!)<\/p><\/div><p>Ready to move beyond CLIP-style \u201cimage + caption\u201d alignment and build vision-language systems that reason over time and take actions? This live, hands-on advanced training dives into modern VLMs designed for video understanding, multimodal reasoning, and agentic behavior. You\u2019ll learn how today\u2019s \u201cthinking\u201d vision models differ from classic VLM pipelines\u2014and you\u2019ll implement practical patterns for building applications that observe, reason, decide, and adapt.<\/p><p>This session is taught by Satya Mallick, CEO of OpenCV, and focuses exclusively on advanced, real-world capabilities using state-of-the-art models such as Qwen3.5, GLM-4.1V-Thinking, and LLaVA-NeXT.<\/p><p>This is an advanced course. If you haven\u2019t taken the introductory VLM session, we strongly recommend it\u2014or ensure you meet the prerequisites below.<\/p><p><b style=\"font-family: inherit; font-size: 17.3056px; font-style: inherit;\">What You\u2019ll Learn<\/b><\/p><p><b>Advanced VLMs: From Perception to Reasoning to Decision<br \/><\/b><span style=\"font-size: 17.3056px;\">We\u2019ll start by reframing what \u201cmodern VLMs\u201d are actually good at\u2014especially where classic approaches break:<\/span><\/p><ul><li>Why video + time changes the problem (and why frame-by-frame logic fails)<\/li><li>When to use reasoning VLMs vs. conventional CV pipelines<\/li><li>Practical capability vs. cost tradeoffs in \u201cthinking\u201d models<\/li><\/ul><p><b>Reasoning VLMs (and What \u201cThinking\u201d Really Means)<br \/><\/b><span style=\"font-size: 17.3056px;\">Get a clear, engineer-focused understanding of:<\/span><\/p><ul><li>Visual chain-of-thought concepts (latent vs. explicit reasoning)<\/li><li>How different model families behave in practice<\/li><li>How to choose between Qwen3-VL (workhorse), GLM-4.1V (reasoning contrast), and LLaVA-NeXT (architecture\/efficiency)<\/li><\/ul><p><b>Video VLMs and Temporal Reasoning<br \/><\/b><span style=\"font-size: 17.3056px;\">You\u2019ll learn and implement techniques to handle the hard parts of video understanding:<\/span><\/p><ul><li>Temporal grounding, event segmentation, and \u201cwhat changed and when?\u201d<\/li><li>Avoiding causal hallucinations and frame-sampling pitfalls<\/li><li>Building a Video Q&amp;A demo that answers \u201cwhy,\u201d not just \u201cwhat\u201d<\/li><\/ul><p><b>Agentic VLMs: Observe \u2192 Reason \u2192 Decide \u2192 Act \u2192 Remember<br \/><\/b><span style=\"font-size: 17.3056px;\">Go from \u201cmodel outputs\u201d to systems with behavior:<\/span><\/p><ul><li>Why agentic VLMs replace brittle rules with adaptive policies<\/li><li>Persistent state, memory buffers, tool usage, and feedback loops<\/li><li>Building a monitoring application that watches a video stream and decides when to raise alerts\u2014driven by language-based decision logic, not hard-coded rules:<\/li><\/ul><ol><li>detect abnormal behavior<\/li><li>explain decisions<\/li><li>modify behavior based on past observations (e.g., temporary exceptions via an admin instruction)<\/li><\/ol><p><b>Who Should Attend<br \/><\/b><span style=\"font-size: 17.3056px;\">This workshop is ideal for engineers, developers, and technical leads who want to build the next generation of:<\/span><\/p><ul><li>video understanding and \u201cwhat happened \/ why \/ what next\u201d applications<\/li><li>intelligent monitoring and safety systems<\/li><li>adaptive, agent-like multimodal products<\/li><\/ul><p><b>Prerequisites<br \/><\/b><span style=\"font-size: 17.3056px;\">To get the most from the hands-on work, you should be:<\/span><\/p><ul><li>comfortable with Python<\/li><li>familiar with basic VLM concepts (e.g., CLIP and embeddings at a high level)<\/li><li>interested in reasoning, temporal understanding, and agentic systems<\/li><\/ul><p><b>Why Attend?<br \/><\/b><span style=\"font-size: 17.3056px;\">Multimodal AI is rapidly shifting from static perception to temporal reasoning and autonomous decision-making. This course helps you keep pace with where VLMs are going\u2014while staying grounded in implementable techniques, real tradeoffs, and working applications you can build on after the session.<\/span><\/p><p><b><a href=\"https:\/\/edge-ai-vision.swoogo.com\/2026EVS\/begin?reg_type_id=982078\">Register Today<\/a><br \/><\/b>Registration is $495. Don\u2019t miss this opportunity to enhance your skills and stay at the forefront of computer vision technology. Register today to secure your spot in this transformative training session. (You can save $50 if you also register for the Embedded Vision Summit!)<\/p><p><strong>Important:<\/strong> Please note that this training will be held at <span style=\"color: #c45656;\"><strong>Cadence, 2655 Seely Ave, San Jose, CA 95131,<\/strong><\/span> about a 15-minute drive from the Santa Clara Convention Center.<\/p><\/div>\n\t\t\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<section class=\"elementor-section elementor-inner-section elementor-element elementor-element-26f3559 elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"26f3559\" data-element_type=\"section\" data-e-type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-inner-column elementor-element elementor-element-16afe64\" data-id=\"16afe64\" data-element_type=\"column\" data-e-type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap\">\n\t\t\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t<div class=\"elementor-column elementor-col-50 elementor-inner-column elementor-element elementor-element-14e2939\" data-id=\"14e2939\" data-element_type=\"column\" data-e-type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap\">\n\t\t\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<section class=\"elementor-section elementor-top-section elementor-element elementor-element-6254ded elementor-section-boxed elementor-section-height-default elementor-section-height-default\" data-id=\"6254ded\" data-element_type=\"section\" data-e-type=\"section\">\n\t\t\t\t\t\t<div class=\"elementor-container elementor-column-gap-default\">\n\t\t\t\t\t<div class=\"elementor-column elementor-col-100 elementor-top-column elementor-element elementor-element-61e8845\" data-id=\"61e8845\" data-element_type=\"column\" data-e-type=\"column\">\n\t\t\t<div class=\"elementor-widget-wrap elementor-element-populated\">\n\t\t\t\t\t\t<div class=\"elementor-element elementor-element-9c59305 elementor-widget elementor-widget-spacer\" data-id=\"9c59305\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"spacer.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<div class=\"elementor-spacer\">\n\t\t\t<div class=\"elementor-spacer-inner\"><\/div>\n\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t<div class=\"elementor-element elementor-element-20676b7 elementor-widget elementor-widget-spacer\" data-id=\"20676b7\" data-element_type=\"widget\" data-e-type=\"widget\" data-widget_type=\"spacer.default\">\n\t\t\t\t<div class=\"elementor-widget-container\">\n\t\t\t\t\t\t\t<div class=\"elementor-spacer\">\n\t\t\t<div class=\"elementor-spacer-inner\"><\/div>\n\t\t<\/div>\n\t\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t<\/section>\n\t\t\t\t<\/div>\n\t\t","protected":false},"excerpt":{"rendered":"<p>We&#8217;re excited to offer two half-day in-person vision-language model (VLM) trainings! Note these trainings will be held at Cadence, 2655 Seely Ave, San Jose, CA 95131, about a 15-minute drive from the Santa Clara Convention Center. Intro CourseVision-Language Models for [&hellip;]<\/p>\n","protected":false},"author":13,"featured_media":26227,"parent":0,"menu_order":0,"comment_status":"closed","ping_status":"closed","template":"","meta":{"content-type":"","footnotes":""},"class_list":["post-24636","page","type-page","status-publish","has-post-thumbnail","hentry"],"yoast_head":"<!-- This site is optimized with the Yoast SEO plugin v27.5 - https:\/\/yoast.com\/product\/yoast-seo-wordpress\/ -->\n<title>Vision-Language Model Training - 2026 Summit<\/title>\n<meta name=\"description\" content=\"Choose from two training sessions designed to introduce the latest techniques in vision-language models (VLMs) plus their integration with traditional computer vision methods, as well as video understanding and agentic visual AI. With a focus on the practical application of these techniques for real-world applications, these courses are tailored for professionals looking to expand their skill set in AI-driven computer vision, particularly in systems designed for deployment at the edge.\" \/>\n<meta name=\"robots\" content=\"index, follow, max-snippet:-1, max-image-preview:large, max-video-preview:-1\" \/>\n<link rel=\"canonical\" href=\"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/\" \/>\n<meta property=\"og:locale\" content=\"en_US\" \/>\n<meta property=\"og:type\" content=\"article\" \/>\n<meta property=\"og:title\" content=\"Vision-Language Model Training - 2026 Summit\" \/>\n<meta property=\"og:description\" content=\"Choose from two training sessions designed to introduce the latest techniques in vision-language models (VLMs) plus their integration with traditional computer vision methods, as well as video understanding and agentic visual AI. With a focus on the practical application of these techniques for real-world applications, these courses are tailored for professionals looking to expand their skill set in AI-driven computer vision, particularly in systems designed for deployment at the edge.\" \/>\n<meta property=\"og:url\" content=\"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/\" \/>\n<meta property=\"og:site_name\" content=\"2026 Summit\" \/>\n<meta property=\"article:modified_time\" content=\"2026-05-07T14:45:48+00:00\" \/>\n<meta property=\"og:image\" content=\"https:\/\/embeddedvisionsummit.com\/2026\/wp-content\/uploads\/sites\/16\/2025\/11\/2026_SummitGeneral_1200x630_V3-2.png\" \/>\n\t<meta property=\"og:image:width\" content=\"1200\" \/>\n\t<meta property=\"og:image:height\" content=\"630\" \/>\n\t<meta property=\"og:image:type\" content=\"image\/png\" \/>\n<meta name=\"twitter:card\" content=\"summary_large_image\" \/>\n<meta name=\"twitter:label1\" content=\"Est. reading time\" \/>\n\t<meta name=\"twitter:data1\" content=\"9 minutes\" \/>\n<script type=\"application\/ld+json\" class=\"yoast-schema-graph\">{\"@context\":\"https:\\\/\\\/schema.org\",\"@graph\":[{\"@type\":\"WebPage\",\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/vlm-training\\\/\",\"url\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/vlm-training\\\/\",\"name\":\"Vision-Language Model Training - 2026 Summit\",\"isPartOf\":{\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/#website\"},\"primaryImageOfPage\":{\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/vlm-training\\\/#primaryimage\"},\"image\":{\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/vlm-training\\\/#primaryimage\"},\"thumbnailUrl\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/wp-content\\\/uploads\\\/sites\\\/16\\\/2025\\\/11\\\/2026_SummitGeneral_1200x630_V3-2.png\",\"datePublished\":\"2025-11-14T01:08:04+00:00\",\"dateModified\":\"2026-05-07T14:45:48+00:00\",\"description\":\"Choose from two training sessions designed to introduce the latest techniques in vision-language models (VLMs) plus their integration with traditional computer vision methods, as well as video understanding and agentic visual AI. With a focus on the practical application of these techniques for real-world applications, these courses are tailored for professionals looking to expand their skill set in AI-driven computer vision, particularly in systems designed for deployment at the edge.\",\"breadcrumb\":{\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/vlm-training\\\/#breadcrumb\"},\"inLanguage\":\"en-US\",\"potentialAction\":[{\"@type\":\"ReadAction\",\"target\":[\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/vlm-training\\\/\"]}]},{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/vlm-training\\\/#primaryimage\",\"url\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/wp-content\\\/uploads\\\/sites\\\/16\\\/2025\\\/11\\\/2026_SummitGeneral_1200x630_V3-2.png\",\"contentUrl\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/wp-content\\\/uploads\\\/sites\\\/16\\\/2025\\\/11\\\/2026_SummitGeneral_1200x630_V3-2.png\",\"width\":1200,\"height\":630},{\"@type\":\"BreadcrumbList\",\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/vlm-training\\\/#breadcrumb\",\"itemListElement\":[{\"@type\":\"ListItem\",\"position\":1,\"name\":\"Home\",\"item\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/\"},{\"@type\":\"ListItem\",\"position\":2,\"name\":\"Vision-Language Model Training\"}]},{\"@type\":\"WebSite\",\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/#website\",\"url\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/\",\"name\":\"2026 Summit\",\"description\":\"The premier conference for innovators incorporating computer vision and AI in products.\",\"publisher\":{\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/#organization\"},\"potentialAction\":[{\"@type\":\"SearchAction\",\"target\":{\"@type\":\"EntryPoint\",\"urlTemplate\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/?s={search_term_string}\"},\"query-input\":{\"@type\":\"PropertyValueSpecification\",\"valueRequired\":true,\"valueName\":\"search_term_string\"}}],\"inLanguage\":\"en-US\"},{\"@type\":\"Organization\",\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/#organization\",\"name\":\"2026 Summit\",\"url\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/\",\"logo\":{\"@type\":\"ImageObject\",\"inLanguage\":\"en-US\",\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/#\\\/schema\\\/logo\\\/image\\\/\",\"url\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/wp-content\\\/uploads\\\/sites\\\/16\\\/2022\\\/10\\\/Circle-Artwork-04-1024x958.png\",\"contentUrl\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/wp-content\\\/uploads\\\/sites\\\/16\\\/2022\\\/10\\\/Circle-Artwork-04-1024x958.png\",\"width\":1024,\"height\":958,\"caption\":\"2026 Summit\"},\"image\":{\"@id\":\"https:\\\/\\\/embeddedvisionsummit.com\\\/2026\\\/#\\\/schema\\\/logo\\\/image\\\/\"}}]}<\/script>\n<!-- \/ Yoast SEO plugin. -->","yoast_head_json":{"title":"Vision-Language Model Training - 2026 Summit","description":"Choose from two training sessions designed to introduce the latest techniques in vision-language models (VLMs) plus their integration with traditional computer vision methods, as well as video understanding and agentic visual AI. With a focus on the practical application of these techniques for real-world applications, these courses are tailored for professionals looking to expand their skill set in AI-driven computer vision, particularly in systems designed for deployment at the edge.","robots":{"index":"index","follow":"follow","max-snippet":"max-snippet:-1","max-image-preview":"max-image-preview:large","max-video-preview":"max-video-preview:-1"},"canonical":"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/","og_locale":"en_US","og_type":"article","og_title":"Vision-Language Model Training - 2026 Summit","og_description":"Choose from two training sessions designed to introduce the latest techniques in vision-language models (VLMs) plus their integration with traditional computer vision methods, as well as video understanding and agentic visual AI. With a focus on the practical application of these techniques for real-world applications, these courses are tailored for professionals looking to expand their skill set in AI-driven computer vision, particularly in systems designed for deployment at the edge.","og_url":"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/","og_site_name":"2026 Summit","article_modified_time":"2026-05-07T14:45:48+00:00","og_image":[{"width":1200,"height":630,"url":"https:\/\/embeddedvisionsummit.com\/2026\/wp-content\/uploads\/sites\/16\/2025\/11\/2026_SummitGeneral_1200x630_V3-2.png","type":"image\/png"}],"twitter_card":"summary_large_image","twitter_misc":{"Est. reading time":"9 minutes"},"schema":{"@context":"https:\/\/schema.org","@graph":[{"@type":"WebPage","@id":"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/","url":"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/","name":"Vision-Language Model Training - 2026 Summit","isPartOf":{"@id":"https:\/\/embeddedvisionsummit.com\/2026\/#website"},"primaryImageOfPage":{"@id":"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/#primaryimage"},"image":{"@id":"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/#primaryimage"},"thumbnailUrl":"https:\/\/embeddedvisionsummit.com\/2026\/wp-content\/uploads\/sites\/16\/2025\/11\/2026_SummitGeneral_1200x630_V3-2.png","datePublished":"2025-11-14T01:08:04+00:00","dateModified":"2026-05-07T14:45:48+00:00","description":"Choose from two training sessions designed to introduce the latest techniques in vision-language models (VLMs) plus their integration with traditional computer vision methods, as well as video understanding and agentic visual AI. With a focus on the practical application of these techniques for real-world applications, these courses are tailored for professionals looking to expand their skill set in AI-driven computer vision, particularly in systems designed for deployment at the edge.","breadcrumb":{"@id":"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/#breadcrumb"},"inLanguage":"en-US","potentialAction":[{"@type":"ReadAction","target":["https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/"]}]},{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/#primaryimage","url":"https:\/\/embeddedvisionsummit.com\/2026\/wp-content\/uploads\/sites\/16\/2025\/11\/2026_SummitGeneral_1200x630_V3-2.png","contentUrl":"https:\/\/embeddedvisionsummit.com\/2026\/wp-content\/uploads\/sites\/16\/2025\/11\/2026_SummitGeneral_1200x630_V3-2.png","width":1200,"height":630},{"@type":"BreadcrumbList","@id":"https:\/\/embeddedvisionsummit.com\/2026\/vlm-training\/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https:\/\/embeddedvisionsummit.com\/2026\/"},{"@type":"ListItem","position":2,"name":"Vision-Language Model Training"}]},{"@type":"WebSite","@id":"https:\/\/embeddedvisionsummit.com\/2026\/#website","url":"https:\/\/embeddedvisionsummit.com\/2026\/","name":"2026 Summit","description":"The premier conference for innovators incorporating computer vision and AI in products.","publisher":{"@id":"https:\/\/embeddedvisionsummit.com\/2026\/#organization"},"potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https:\/\/embeddedvisionsummit.com\/2026\/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"en-US"},{"@type":"Organization","@id":"https:\/\/embeddedvisionsummit.com\/2026\/#organization","name":"2026 Summit","url":"https:\/\/embeddedvisionsummit.com\/2026\/","logo":{"@type":"ImageObject","inLanguage":"en-US","@id":"https:\/\/embeddedvisionsummit.com\/2026\/#\/schema\/logo\/image\/","url":"https:\/\/embeddedvisionsummit.com\/2026\/wp-content\/uploads\/sites\/16\/2022\/10\/Circle-Artwork-04-1024x958.png","contentUrl":"https:\/\/embeddedvisionsummit.com\/2026\/wp-content\/uploads\/sites\/16\/2022\/10\/Circle-Artwork-04-1024x958.png","width":1024,"height":958,"caption":"2026 Summit"},"image":{"@id":"https:\/\/embeddedvisionsummit.com\/2026\/#\/schema\/logo\/image\/"}}]}},"_links":{"self":[{"href":"https:\/\/embeddedvisionsummit.com\/2026\/wp-json\/wp\/v2\/pages\/24636","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/embeddedvisionsummit.com\/2026\/wp-json\/wp\/v2\/pages"}],"about":[{"href":"https:\/\/embeddedvisionsummit.com\/2026\/wp-json\/wp\/v2\/types\/page"}],"author":[{"embeddable":true,"href":"https:\/\/embeddedvisionsummit.com\/2026\/wp-json\/wp\/v2\/users\/13"}],"replies":[{"embeddable":true,"href":"https:\/\/embeddedvisionsummit.com\/2026\/wp-json\/wp\/v2\/comments?post=24636"}],"version-history":[{"count":10,"href":"https:\/\/embeddedvisionsummit.com\/2026\/wp-json\/wp\/v2\/pages\/24636\/revisions"}],"predecessor-version":[{"id":28024,"href":"https:\/\/embeddedvisionsummit.com\/2026\/wp-json\/wp\/v2\/pages\/24636\/revisions\/28024"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/embeddedvisionsummit.com\/2026\/wp-json\/wp\/v2\/media\/26227"}],"wp:attachment":[{"href":"https:\/\/embeddedvisionsummit.com\/2026\/wp-json\/wp\/v2\/media?parent=24636"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}