diff --git a/.travis.yml b/.travis.yml index 8d6ddb2f201..6b50d49e143 100644 --- a/.travis.yml +++ b/.travis.yml @@ -52,6 +52,7 @@ matrix: # Common - xz-utils - libexpat1-dev + - libx11-xcb-dev - libelf-dev - python3.5 - python3-pip @@ -120,7 +121,6 @@ matrix: - llvm-6.0-dev # Common - xz-utils - - x11proto-xf86vidmode-dev - libexpat1-dev - libx11-xcb-dev - libelf-dev @@ -150,7 +150,6 @@ matrix: - llvm-6.0-dev # Common - xz-utils - - x11proto-xf86vidmode-dev - libexpat1-dev - libx11-xcb-dev - libelf-dev @@ -183,7 +182,6 @@ matrix: - llvm-3.9-dev # Common - xz-utils - - x11proto-xf86vidmode-dev - libexpat1-dev - libx11-xcb-dev - libelf-dev @@ -222,7 +220,6 @@ matrix: - libclang-3.9-dev # Common - xz-utils - - x11proto-xf86vidmode-dev - libexpat1-dev - libx11-xcb-dev - libelf-dev @@ -258,7 +255,6 @@ matrix: - libclang-4.0-dev # Common - xz-utils - - x11proto-xf86vidmode-dev - libexpat1-dev - libx11-xcb-dev - libelf-dev @@ -294,7 +290,6 @@ matrix: - libclang-5.0-dev # Common - xz-utils - - x11proto-xf86vidmode-dev - libexpat1-dev - libx11-xcb-dev - libelf-dev @@ -327,7 +322,6 @@ matrix: - libclang-6.0-dev # Common - xz-utils - - x11proto-xf86vidmode-dev - libexpat1-dev - libx11-xcb-dev - libelf-dev @@ -361,7 +355,6 @@ matrix: - libclang-7-dev # Common - xz-utils - - x11proto-xf86vidmode-dev - libexpat1-dev - libx11-xcb-dev - libelf-dev @@ -397,7 +390,6 @@ matrix: - libedit-dev # Common - xz-utils - - x11proto-xf86vidmode-dev - libexpat1-dev - libx11-xcb-dev - libelf-dev @@ -427,7 +419,6 @@ matrix: - llvm-6.0-dev # Common - xz-utils - - x11proto-xf86vidmode-dev - libexpat1-dev - libx11-xcb-dev - libelf-dev diff --git a/Android.common.mk b/Android.common.mk index aa1b266a393..d7c5f20fabc 100644 --- a/Android.common.mk +++ b/Android.common.mk @@ -31,6 +31,7 @@ LOCAL_C_INCLUDES += \ MESA_VERSION := $(shell cat $(MESA_TOP)/VERSION) LOCAL_CFLAGS += \ + -O3 \ -Wno-error \ -Wno-unused-parameter \ -Wno-pointer-arith \ @@ -78,14 +79,23 @@ LOCAL_CFLAGS += \ -fvisibility=hidden \ 
-fno-math-errno \ -fno-trapping-math \ - -Wno-sign-compare + -Wno-sign-compare \ + -Wno-self-assign \ + -Wno-constant-logical-operand \ + -Wno-format \ + -Wno-incompatible-pointer-types \ + -Wno-enum-conversion LOCAL_CPPFLAGS += \ -D__STDC_CONSTANT_MACROS \ -D__STDC_FORMAT_MACROS \ -D__STDC_LIMIT_MACROS \ -Wno-error=non-virtual-dtor \ - -Wno-non-virtual-dtor + -Wno-non-virtual-dtor \ + -Wno-delete-non-virtual-dtor \ + -Wno-overloaded-virtual \ + -Wno-missing-braces \ + -Wno-deprecated-register # mesa requires at least c99 compiler LOCAL_CONLYFLAGS += \ @@ -112,7 +122,7 @@ LOCAL_CFLAGS_arm64 += -DUSE_AARCH64_ASM ifneq ($(LOCAL_IS_HOST_MODULE),true) LOCAL_CFLAGS += -DHAVE_LIBDRM -LOCAL_SHARED_LIBRARIES += libdrm +LOCAL_SHARED_LIBRARIES += libdrm_pri endif LOCAL_CFLAGS_32 += -DDEFAULT_DRIVER_DIR=\"/vendor/lib/$(MESA_DRI_MODULE_REL_PATH)\" diff --git a/Readme.md b/Readme.md new file mode 100644 index 00000000000..5df295abc3a --- /dev/null +++ b/Readme.md @@ -0,0 +1,2 @@ +Any security related issues should be reported by following the instructions here: +https://01.org/security diff --git a/VERSION b/VERSION index 8b16de0851f..a19b2d9a021 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -18.3.0-devel +18.3.2 diff --git a/bin/.cherry-ignore b/bin/.cherry-ignore new file mode 100644 index 00000000000..92456c5c938 --- /dev/null +++ b/bin/.cherry-ignore @@ -0,0 +1,4 @@ +# fixes: Commit was squashed into the respective offenders +c02390f8fcd367c7350db568feabb2f062efca14 egl/wayland: rather obvious build fix +# fixes: The commit addresses b4476138d5ad3f8d30c14ee61f2f375edfdbab2a +ff6f1dd0d3c6b4c15ca51b478b2884d14f6a1e06 meson: libfreedreno depends upon libdrm (for fence support) diff --git a/bin/get-fixes-pick-list.sh b/bin/get-fixes-pick-list.sh deleted file mode 100755 index 047ea3bec10..00000000000 --- a/bin/get-fixes-pick-list.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/sh - -# Script for generating a list of candidates [referenced by a Fixes tag] for -# cherry-picking to a 
stable branch -# -# Usage examples: -# -# $ bin/get-fixes-pick-list.sh -# $ bin/get-fixes-pick-list.sh > picklist -# $ bin/get-fixes-pick-list.sh | tee picklist - -# Use the last branchpoint as our limit for the search -latest_branchpoint=`git merge-base origin/master HEAD` - -# List all the commits between day 1 and the branch point... -git log --reverse --pretty=%H $latest_branchpoint > already_landed - -# ... and the ones cherry-picked. -git log --reverse --pretty=medium --grep="cherry picked from commit" $latest_branchpoint..HEAD |\ - grep "cherry picked from commit" |\ - sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked - -# Grep for commits with Fixes tag -git log --reverse --pretty=%H -i --grep="fixes:" $latest_branchpoint..origin/master |\ -while read sha -do - # Check to see whether the patch is on the ignore list ... - if [ -f bin/.cherry-ignore ] ; then - if grep -q ^$sha bin/.cherry-ignore ; then - continue - fi - fi - - # Skip if it has been already cherry-picked. - if grep -q ^$sha already_picked ; then - continue - fi - - # Place every "fixes:" tag on its own line and join with the next word - # on its line or a later one. - fixes=`git show --pretty=medium -s $sha | tr -d "\n" | sed -e 's/fixes:[[:space:]]*/\nfixes:/Ig' | grep "fixes:" | sed -e 's/\(fixes:[a-zA-Z0-9]*\).*$/\1/'` - - # For each one try to extract the tag - fixes_count=`echo "$fixes" | wc -l` - warn=`(test $fixes_count -gt 1 && echo $fixes_count) || echo 0` - while [ $fixes_count -gt 0 ] ; do - # Treat only the current line - id=`echo "$fixes" | tail -n $fixes_count | head -n 1 | cut -d : -f 2` - fixes_count=$(($fixes_count-1)) - - # Bail out if we cannot find suitable id. - # Any specific validation the $id is valid and not some junk, is - # implied with the follow up code - if [ "x$id" = x ] ; then - continue - fi - - # Check if the offending commit is in branch. - - # Be that cherry-picked ... - # ... or landed before the branchpoint. 
- if grep -q ^$id already_picked || - grep -q ^$id already_landed ; then - - printf "Commit \"%s\" fixes %s\n" \ - "`git log -n1 --pretty=oneline $sha`" \ - "$id" - warn=$(($warn-1)) - fi - - done - - if [ $warn -gt 0 ] ; then - printf "WARNING: Commit \"%s\" has more than one Fixes tag\n" \ - "`git log -n1 --pretty=oneline $sha`" - fi - -done - -rm -f already_picked -rm -f already_landed diff --git a/bin/get-pick-list.sh b/bin/get-pick-list.sh index 9e9a39e494b..79b7a295ea6 100755 --- a/bin/get-pick-list.sh +++ b/bin/get-pick-list.sh @@ -7,21 +7,107 @@ # $ bin/get-pick-list.sh # $ bin/get-pick-list.sh > picklist # $ bin/get-pick-list.sh | tee picklist +# +# The output is as follows: +# [nomination_type] commit_sha commit summary + +is_stable_nomination() +{ + git show --summary "$1" | grep -q -i -o "CC:.*mesa-stable" +} + +is_typod_nomination() +{ + git show --summary "$1" | grep -q -i -o "CC:.*mesa-dev" +} + +fixes= + +# Helper to handle various mistypos of the fixes tag. +# The tag string itself is passed as argument and normalised within. +# +# Resulting string in the global variable "fixes" and contains entries +# in the form "fixes:$sha" +is_sha_nomination() +{ + fixes=`git show --pretty=medium -s $1 | tr -d "\n" | \ + sed -e 's/'"$2"'/\nfixes:/Ig' | \ + grep -Eo 'fixes:[a-f0-9]{8,40}'` + + fixes_count=`echo "$fixes" | grep "fixes:" | wc -l` + if test $fixes_count -eq 0; then + return 1 + fi + + # Throw a warning for each invalid sha + while test $fixes_count -gt 0; do + # Treat only the current line + id=`echo "$fixes" | tail -n $fixes_count | head -n 1 | cut -d : -f 2` + fixes_count=$(($fixes_count-1)) + if ! git show $id &>/dev/null; then + echo WARNING: Commit $1 lists invalid sha $id + fi + done + + return 0 +} + +# Checks if at least one of offending commits, listed in the global +# "fixes", is in branch. 
+sha_in_range() +{ + fixes_count=`echo "$fixes" | grep "fixes:" | wc -l` + while test $fixes_count -gt 0; do + # Treat only the current line + id=`echo "$fixes" | tail -n $fixes_count | head -n 1 | cut -d : -f 2` + fixes_count=$(($fixes_count-1)) + + # Be that cherry-picked ... + # ... or landed before the branchpoint. + if grep -q ^$id already_picked || + grep -q ^$id already_landed ; then + return 0 + fi + done + return 1 +} + +is_fixes_nomination() +{ + is_sha_nomination "$1" "fixes:[[:space:]]*" + if test $? -eq 0; then + return 0 + fi + is_sha_nomination "$1" "fixes[[:space:]]\+" +} + +is_brokenby_nomination() +{ + is_sha_nomination "$1" "broken by" +} + +is_revert_nomination() +{ + is_sha_nomination "$1" "This reverts commit " +} # Use the last branchpoint as our limit for the search latest_branchpoint=`git merge-base origin/master HEAD` -# Grep for commits with "cherry picked from commit" in the commit message. +# List all the commits between day 1 and the branch point... +git log --reverse --pretty=%H $latest_branchpoint > already_landed + +# ... and the ones cherry-picked. git log --reverse --pretty=medium --grep="cherry picked from commit" $latest_branchpoint..HEAD |\ grep "cherry picked from commit" |\ sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked -# Grep for commits that were marked as a candidate for the stable tree. -git log --reverse --pretty=%H -i --grep='^CC:.*mesa-stable' $latest_branchpoint..origin/master |\ +# Grep for potential candidates +git log --reverse --pretty=%H -i --grep='^CC:.*mesa-stable\|^CC:.*mesa-dev\|\\|\\|This reverts commit' $latest_branchpoint..origin/master |\ while read sha do # Check to see whether the patch is on the ignore list. 
- if [ -f bin/.cherry-ignore ] ; then + if test -f bin/.cherry-ignore; then if grep -q ^$sha bin/.cherry-ignore ; then continue fi @@ -32,7 +118,33 @@ do continue fi - git log -n1 --pretty=oneline $sha | cat + if is_fixes_nomination "$sha"; then + tag=fixes + elif is_brokenby_nomination "$sha"; then + tag=brokenby + elif is_revert_nomination "$sha"; then + tag=revert + elif is_stable_nomination "$sha"; then + tag=stable + elif is_typod_nomination "$sha"; then + tag=typod + else + continue + fi + + case "$tag" in + fixes | brokenby | revert ) + if ! sha_in_range; then + continue + fi + ;; + * ) + ;; + esac + + printf "[ %8s ] " "$tag" + git --no-pager show --summary --oneline $sha done rm -f already_picked +rm -f already_landed diff --git a/bin/get-typod-pick-list.sh b/bin/get-typod-pick-list.sh deleted file mode 100755 index eb4181d66b8..00000000000 --- a/bin/get-typod-pick-list.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/sh - -# Script for generating a list of candidates which have typos in the nomination line -# -# Usage examples: -# -# $ bin/get-typod-pick-list.sh -# $ bin/get-typod-pick-list.sh > picklist -# $ bin/get-typod-pick-list.sh | tee picklist - -# NB: -# This script intentionally _never_ checks for specific version tag -# Should we consider folding it with the original get-pick-list.sh - -# Use the last branchpoint as our limit for the search -latest_branchpoint=`git merge-base origin/master HEAD` - -# Grep for commits with "cherry picked from commit" in the commit message. -git log --reverse --grep="cherry picked from commit" $latest_branchpoint..HEAD |\ - grep "cherry picked from commit" |\ - sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked - -# Grep for commits that were marked as a candidate for the stable tree. -git log --reverse --pretty=%H -i --grep='^CC:.*mesa-dev' $latest_branchpoint..origin/master |\ -while read sha -do - # Check to see whether the patch is on the ignore list. 
- if [ -f bin/.cherry-ignore ] ; then - if grep -q ^$sha bin/.cherry-ignore ; then - continue - fi - fi - - # Check to see if it has already been picked over. - if grep -q ^$sha already_picked ; then - continue - fi - - git log -n1 --pretty=oneline $sha | cat -done - -rm -f already_picked diff --git a/configure.ac b/configure.ac index d782f56205d..b1c6967afee 100644 --- a/configure.ac +++ b/configure.ac @@ -1716,6 +1716,8 @@ xdri) if test x"$enable_dri" = xyes; then dri_modules="$dri_modules xcb-dri2 >= $XCBDRI2_REQUIRED" fi + + dri_modules="$dri_modules xxf86vm" fi if test x"$dri_platform" = xapple ; then DEFINES="$DEFINES -DGLX_USE_APPLEGL" @@ -1725,12 +1727,6 @@ xdri) fi fi - # add xf86vidmode if available - PKG_CHECK_MODULES([XF86VIDMODE], [xxf86vm], HAVE_XF86VIDMODE=yes, HAVE_XF86VIDMODE=no) - if test "$HAVE_XF86VIDMODE" = yes ; then - dri_modules="$dri_modules xxf86vm" - fi - PKG_CHECK_MODULES([DRIGL], [$dri_modules]) GL_PC_REQ_PRIV="$GL_PC_REQ_PRIV $dri_modules" X11_INCLUDES="$X11_INCLUDES $DRIGL_CFLAGS" @@ -1742,10 +1738,6 @@ xdri) ;; esac -# This is outside the case (above) so that it is invoked even for non-GLX -# builds. -AM_CONDITIONAL(HAVE_XF86VIDMODE, test "x$HAVE_XF86VIDMODE" = xyes) - GLESv1_CM_LIB_DEPS="$LIBDRM_LIBS -lm $PTHREAD_LIBS $DLOPEN_LIBS" GLESv1_CM_PC_LIB_PRIV="-lm $PTHREAD_LIBS $DLOPEN_LIBS" GLESv2_LIB_DEPS="$LIBDRM_LIBS -lm $PTHREAD_LIBS $DLOPEN_LIBS" @@ -1762,8 +1754,6 @@ AC_SUBST([GLESv1_CM_PC_LIB_PRIV]) AC_SUBST([GLESv2_LIB_DEPS]) AC_SUBST([GLESv2_PC_LIB_PRIV]) -AC_SUBST([HAVE_XF86VIDMODE]) - dnl dnl More GLX setup dnl diff --git a/docs/releasing.html b/docs/releasing.html index 52e102207d1..c79a020efa7 100644 --- a/docs/releasing.html +++ b/docs/releasing.html @@ -21,6 +21,7 @@

Releasing process

  • Overview
  • Release schedule
  • Cherry-pick and test +
  • Staging branch
  • Making a branchpoint
  • Pre-release announcement
  • Making a new release @@ -209,6 +210,25 @@

    Regression/functionality testing

    idea too.

    +

    Staging branch

    + +

    +A live branch, which contains the currently merge/rejected patches is available +in the main repository under staging/X.Y. For example: +

    +
    +	staging/18.1 - WIP branch for the 18.1 series
    +	staging/18.2 - WIP branch for the 18.2 series
    +
    + +

    +Notes: +

    +
      +
    • People are encouraged to test the branch and report regressions.
    • +
    • The branch history is not stable and it will be rebased,
    • +
    +

    Making a branchpoint

    diff --git a/docs/relnotes/18.3.0.html b/docs/relnotes/18.3.0.html index 8af225a61e1..370d5e823e2 100644 --- a/docs/relnotes/18.3.0.html +++ b/docs/relnotes/18.3.0.html @@ -14,7 +14,7 @@

    The Mesa 3D Graphics Library

    -

    Mesa 18.3.0 Release Notes / TBD

    +

    Mesa 18.3.0 Release Notes / December 7, 2018

    Mesa 18.3.0 is a new development release. People who are concerned @@ -40,7 +40,8 @@

    Mesa 18.3.0 Release Notes / TBD

    SHA256 checksums

    -TBD.
    +17a124d4dbc712505d22a7815c9b0cee22214c96c8abb91539a2b1351e38a000  mesa-18.3.0.tar.gz
    +b63f947e735d6ef3dfaa30c789a9adfbae18aea671191eaacde95a18c17fc38a  mesa-18.3.0.tar.xz
     
    @@ -61,7 +62,6 @@

    New features

  • GL_EXT_vertex_attrib_64bit on i965, nvc0, radeonsi.
  • GL_EXT_window_rectangles on radeonsi.
  • GL_KHR_texture_compression_astc_sliced_3d on radeonsi.
  • -
  • GL_INTEL_fragment_shader_ordering on i965.
  • GL_NV_fragment_shader_interlock on i965.
  • EGL_EXT_device_base for all drivers.
  • EGL_EXT_device_drm for all drivers.
  • @@ -71,8 +71,206 @@

    New features

    Bug fixes

    + +
  • Bug 13728 - [G965] Some objects in Neverwinter Nights Linux version not displayed correctly
  • + +
  • Bug 91433 - piglit.spec.arb_depth_buffer_float.fbo-depth-gl_depth_component32f-copypixels fails
  • + +
  • Bug 93355 - [BXT,SKLGT4e] intermittent ext_framebuffer_multisample.accuracy fails
  • + +
  • Bug 94957 - dEQP failures on llvmpipe
  • + +
  • Bug 98699 - "float[a+++4 ? 1:1] f;" crashes glsl_compiler
  • + +
  • Bug 99507 - Corrupted frame contents with Vulkan version of DOTA2, Talos Principle and Sascha Willems' demos when they're run Vsynched in fullscreen
  • + +
  • Bug 99730 - Metro Redux game(s) needs override for midshader extension declaration
  • + +
  • Bug 100200 - Default Unreal Engine 4 frag shader fails to compile
  • + +
  • Bug 101247 - Mesa fails to link GLSL programs with unused output blocks
  • + +
  • Bug 102597 - [Regression] mpv, high rendering times (two to three times higher)
  • + +
  • Bug 103241 - Anv crashes when using 64-bit vertex inputs
  • + +
  • Bug 104602 - [apitrace] Graphical artifacts in Civilization VI on RX Vega
  • + +
  • Bug 104809 - anv: DOOM 2016 and Wolfenstein II:The New Colossus crash due to not having depthBoundsTest
  • + +
  • Bug 104926 - swrast: Mesa 17.3.3 produces: HW cursor for format 875713089 not supported
  • + +
  • Bug 105333 - [gallium-nine] missing geometry after commit ac: replace ac_build_kill with ac_build_kill_if_false
  • + +
  • Bug 105371 - r600_shader_from_tgsi - GPR limit exceeded - shader requires 360 registers
  • + +
  • Bug 105731 - linker error "fragment shader input ... has no matching output in the previous stage" when previous stage's output declaration in a separate shader object
  • + +
  • Bug 105904 - Needed to delete mesa shader cache after driver upgrade for 32 bit wine vulkan programs to work.
  • + +
  • Bug 105975 - i965 always reports 0 viewport subpixel bits
  • + +
  • Bug 106231 - llvmpipe blends produce bad code after llvm patch https://reviews.llvm.org/D44785
  • + +
  • Bug 106283 - Shader replacements works only for limited use cases
  • + +
  • Bug 106577 - broken rendering with nine and nouveau (GM107)
  • + +
  • Bug 106833 - glLinkProgram is expected to fail when vertex attribute aliasing happens on ES3.0 context or later
  • + +
  • Bug 106865 - [GLK] piglit.spec.ext_framebuffer_multisample.accuracy stencil tests fail
  • + +
  • Bug 106980 - Basemark GPU vulkan benchmark hangs on GFX9
  • + +
  • Bug 106997 - [Regression]. Dying light game is crashing on latest mesa
  • + +
  • Bug 107088 - [GEN8+] Hang when discarding a fragment if dual source blending is enabled but shader doesn't support it
  • + +
  • Bug 107098 - Segfault after munmap(kms_sw_dt->ro_mapped)
  • + +
  • Bug 107212 - Dual-Core CPU E5500 / G45: RetroArch with reicast core results in corrupted graphics
  • + +
  • Bug 107223 - [GEN9+] 50% perf drop in SynMark Fill* tests (E2E RBC gets disabled?)
  • + +
  • Bug 107276 - radv: OpBitfieldUExtract returns incorrect result when count is zero
  • + +
  • Bug 107280 - [DXVK] Batman: Arkham City with tessellation enabled hangs on SKL GT4
  • + +
  • Bug 107313 - Meson instructions on web site are non-optimal
  • + +
  • Bug 107359 - [Regression] [bisected] [OpenGL CTS] [SKL,BDW] KHR-GL46.texture_barrier*-texels, GTF-GL46.gtf21.GL2FixedTests.buffer_corners.buffer_corners, and GTF-GL46.gtf21.GL2FixedTests.stencil_plane_corners.stencil_plane_corners fail with some configuration
  • + +
  • Bug 107460 - radv: OpControlBarrier does not always work correctly (bisected)
  • + +
  • Bug 107477 - [DXVK] Setting high shader quality in GTA V results in LLVM error
  • + +
  • Bug 107483 - DispatchSanity_test.GL31_CORE regression
  • + +
  • Bug 107487 - [intel] [tools] intel gpu tools don't honor -D tools=[]
  • + +
  • Bug 107488 - gl.h:2090: error: redefinition of typedef ‘GLeglImageOES’
  • + +
  • Bug 107510 - [GEN8+] up to 10% perf drop on several 3D benchmarks
  • + +
  • Bug 107511 - KHR/khrplatform.h not always installed when needed
  • + +
  • Bug 107524 - Broken packDouble2x32 at llvmpipe
  • + +
  • Bug 107544 - intel/decoder: out of bounds group_iter
  • + +
  • Bug 107547 - shader crashing glsl_compiler (uniform block assigned to vec2, then component substraced by 1)
  • + +
  • Bug 107550 - "0[2]" as function parameter hits assert
  • + +
  • Bug 107563 - [RADV] Broken rendering in Unity demos
  • + +
  • Bug 107565 - TypeError: __init__() got an unexpected keyword argument 'future_imports'
  • + +
  • Bug 107579 - [SNB] The graphic corruption when we reuse the GS compiled and used for TFB when statebuffer contain magic trash in the unused space
  • + +
  • Bug 107601 - Rise of the Tomb Raider Segmentation Fault when the game starts
  • + +
  • Bug 107610 - Dolphin emulator mis-renders shadow overlay in Super Mario Sunshine
  • + +
  • Bug 107626 - [SNB] The graphical corruption and GPU hang occur sometimes on the piglit test "arb_texture_multisample-large-float-texture" with parameter --fp16
  • + +
  • Bug 107658 - [Regression] [bisected] [OpenGLES CTS] KHR-GLES3.packed_pixels.*rectangle.r*8_snorm
  • + +
  • Bug 107734 - [GLSL] glsl-fface-invariant, glsl-fcoord-invariant and glsl-pcoord-invariant should fail
  • + +
  • Bug 107745 - [bisected] [bdw bsw] piglit.­spec.­arb_fragment_shader_interlock.­arb_fragment_shader_interlock-image-load-store failure
  • + +
  • Bug 107760 - GPU Hang when Playing DiRT 3 Complete Edition using Steam Play with DXVK
  • + +
  • Bug 107765 - [regression] Batman Arkham City crashes with DXVK under wine
  • + +
  • Bug 107772 - Mesa preprocessor matches if(def)s & endifs incorrectly
  • + +
  • Bug 107779 - Access violation with some games
  • + +
  • Bug 107786 - [DXVK] MSAA reflections are broken in GTA V
  • + +
  • Bug 107806 - glsl_get_natural_size_align_bytes() ABORT with GfxBench Vulkan AztecRuins
  • + +
  • Bug 107810 - The 'va_end' call is missed after 'va_copy' in 'util_vsnprintf' function under windows
  • + +
  • Bug 107832 - Gallium picking A16L16 formats when emulating INTENSITY16 conflicts with mesa
  • + +
  • Bug 107843 - 32bit Mesa build failes with meson.
  • + +
  • Bug 107856 - i965 incorrectly calculates the number of layers for texture views (assert)
  • + +
  • Bug 107857 - GPU hang - GS_EMIT without shader outputs
  • + +
  • Bug 107865 - swr fail to build with llvm-libs 6.0.1
  • + +
  • Bug 107869 - u_thread.h:87:4: error: use of undeclared identifier 'cpu_set_t'
  • + +
  • Bug 107870 - Undefined symbols for architecture x86_64: "_util_cpu_caps"
  • + +
  • Bug 107879 - crash happens when link program
  • + +
  • Bug 107891 - [wine, regression, bisected] RAGE, Wolfenstein The New Order hangs in menu
  • + +
  • Bug 107923 - build_id.c:126: multiple definition of `build_id_length'
  • + +
  • Bug 107926 - [anv] Rise of the Tomb Raider always misrendering, segfault and gpu hang.
  • + +
  • Bug 107941 - GPU hang and system crash with Dota 2 using Vulkan
  • + +
  • Bug 107971 - SPV_GOOGLE_hlsl_functionality1 / SPV_GOOGLE_decorate_string
  • + +
  • Bug 108012 - Compiler crashes on access of non-existent member incremental operations
  • + +
  • Bug 108024 - [Debian Stretch]Fail to build because "xcb_randr_lease_t"
  • + +
  • Bug 108082 - warning: unknown warning option '-Wno-format-truncation' [-Wunknown-warning-option]
  • + +
  • Bug 108109 - [GLSL] no-overloads.vert fails
  • + +
  • Bug 108112 - [vulkancts] some of the coherent memory tests fail.
  • + +
  • Bug 108113 - [vulkancts] r32g32b32 transfer operations not implemented
  • + +
  • Bug 108115 - [vulkancts] dEQP-VK.subgroups.vote.graphics.subgroupallequal.* fails
  • + +
  • Bug 108164 - [radv] VM faults since 5d6a560a2986c9ab421b3c7904d29bb7bc35e36f
  • + +
  • Bug 108245 - RADV/Vega: Low mip levels of large BCn textures get corrupted by vkCmdCopyBufferToImage
  • + +
  • Bug 108272 - [polaris10] opencl-mesa: Anything using OpenCL segfaults, XFX Radeon RX 580
  • + +
  • Bug 108311 - Query buffer object support is broken on r600.
  • + +
  • Bug 108319 - [GLK BXT BSW] Assertion in piglit.spec.arb_gpu_shader_fp64.execution.built-in-functions.vs-sign-sat-neg-abs
  • + +
  • Bug 108491 - Commit baa38c14 causes output issues on my VEGA with RADV
  • + +
  • Bug 108524 - [RADV] GPU lockup on event synchronization
  • + +
  • Bug 108530 - (mesa-18.3) [Tracker] Mesa 18.3 Release Tracker
  • + +
  • Bug 108532 - make check nir_copy_prop_vars_test.store_store_load_different_components regression
  • + +
  • Bug 108560 - Mesa 32 is built without sse
  • + +
  • Bug 108595 - ir3_compiler valgrind build error
  • + +
  • Bug 108617 - [deqp] Mesa fails conformance for egl_ext_device
  • + +
  • Bug 108630 - [G965] piglit.spec.!opengl 1_2.tex3d-maxsize spins forever
  • + +
  • Bug 108635 - Mesa master commit 68dc591af16ebb36814e4c187e4998948103c99c causes XWayland to segfault
  • + +
  • Bug 108713 - Gallium: use after free with transform feedback
  • + +
  • Bug 108829 - [meson] libglapi exports internal API
  • + +
  • Bug 108894 - [anv] vkCmdCopyBuffer() and vkCmdCopyQueryPoolResults() write-after-write hazard
  • + +
  • Bug 108909 - Vkd3d test failure test_resolve_non_issued_query_data()
  • + +
  • Bug 108914 - blocky shadow artifacts in The Forest with DXVK, RADV_DEBUG=nohiz fixes this
  • Changes

    diff --git a/docs/relnotes/18.3.1.html b/docs/relnotes/18.3.1.html new file mode 100644 index 00000000000..8acbfb7a5f2 --- /dev/null +++ b/docs/relnotes/18.3.1.html @@ -0,0 +1,63 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 18.3.1 Release Notes / December 11, 2018

    + +

    +Mesa 18.3.1 is a bug fix release which fixes bugs found since the 18.3.0 release. +

    +

    +Mesa 18.3.0 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +256d0c3d88e380c1b8e3fc5c6ac34001e3b7c30458b8b852407ec68b8ccd9fda  mesa-18.3.1.tar.gz
    +5b1f827d28684a25f6657289f8b7d47ac56395988c7ac23e0ec9a62b644bdc63  mesa-18.3.1.tar.xz
    +
    + + +

    New features

    +

    None

    + + +

    Bug fixes

    +

    None

    + + +

    Changes

    + +

    Emil Velikov (2):

    +
      +
    • docs: add sha256 checksums for 18.3.0
    • +
    • Update version to 18.3.1
    • +
    + +

    Jason Ekstrand (1):

    +
      +
    • anv,radv: Disable VK_EXT_pci_bus_info
    • +
    + + +
    + + diff --git a/docs/relnotes/18.3.2.html b/docs/relnotes/18.3.2.html new file mode 100644 index 00000000000..594b42cdf4e --- /dev/null +++ b/docs/relnotes/18.3.2.html @@ -0,0 +1,265 @@ + + + + + Mesa Release Notes + + + + +
    +

    The Mesa 3D Graphics Library

    +
    + + +
    + +

    Mesa 18.3.2 Release Notes / January 17, 2019

    + +

    +Mesa 18.3.2 is a bug fix release which fixes bugs found since the 18.3.1 release. +

    +

    +Mesa 18.3.2 implements the OpenGL 4.5 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.5. OpenGL +4.5 is only available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. +

    + + +

    SHA256 checksums

    +
    +1cde4fafd40cd1ad4ee3a13b364b7a0175a08b7afdd127fb46f918c1e1dfd4b0  mesa-18.3.2.tar.gz
    +f7ce7181c07b6d8e0132da879af1729523a6c8aa87f79a9d59dfd064024cfb35  mesa-18.3.2.tar.xz
    +
    + + +

    New features

    +

    None

    + + +

    Bug fixes

    + +
      + +
    • Bug 106595 - [RADV] Rendering distortions only when MSAA is enabled
    • + +
    • Bug 107728 - Wrong background in Sascha Willem's Multisampling Demo
    • + +
    • Bug 108114 - [vulkancts] new VK_KHR_16bit_storage tests fail.
    • + +
    • Bug 108116 - [vulkancts] stencil partial clear tests fail.
    • + +
    • Bug 108624 - [regression][bisected] "nir: Copy propagation between blocks" regression
    • + +
    • Bug 108910 - Vkd3d test failure test_multisample_array_texture()
    • + +
    • Bug 108911 - Vkd3d test failure test_clear_render_target_view()
    • + +
    • Bug 108943 - Build fails on ppc64le with meson
    • + +
    • Bug 109072 - GPU hang in blender 2.80
    • + +
    • Bug 109081 - [bisected] [HSW] Regression in clipping.user_defined.clip_* vulkancts tests
    • + +
    • Bug 109151 - [KBL-G][vulkan] dEQP-VK.texture.explicit_lod.2d.sizes.31x55_nearest_linear_mipmap_nearest_repeat failed verification.
    • + +
    • Bug 109202 - nv50_ir.cpp:749:19: error: cannot use typeid with -fno-rtti
    • + +
    • Bug 109204 - [regression, bisected] retroarch's crt-royale shader crash radv
    • + +
    + + +

    Changes

    + +

    Alex Deucher (3):

    +
      +
    • pci_ids: add new vega10 pci ids
    • +
    • pci_ids: add new vega20 pci id
    • +
    • pci_ids: add new VegaM pci id
    • +
    + +

    Alexander von Gluck IV (1):

    +
      +
    • egl/haiku: Fix reference to disp vs dpy
    • +
    + +

    Andres Gomez (2):

    +
      +
    • glsl: correct typo in GLSL compilation error message
    • +
    • glsl/linker: specify proper direction in location aliasing error
    • +
    + +

    Axel Davy (3):

    +
      +
    • st/nine: Fix volumetexture dtor on ctor failure
    • +
    • st/nine: Bind src not dst in nine_context_box_upload
    • +
    • st/nine: Add src reference to nine_context_range_upload
    • +
    + +

    Bas Nieuwenhuizen (5):

    +
      +
    • radv: Do a cache flush if needed before reading predicates.
    • +
    • radv: Implement buffer stores with less than 4 components.
    • +
    • anv/android: Do not reject storage images.
    • +
    • radv: Fix rasterization precision bits.
    • +
    • spirv: Fix matrix parameters in function calls.
    • +
    + +

    Caio Marcelo de Oliveira Filho (3):

    +
      +
    • nir: properly clear the entry sources in copy_prop_vars
    • +
    • nir: properly find the entry to keep in copy_prop_vars
    • +
    • nir: remove dead code from copy_prop_vars
    • +
    + +

    Dave Airlie (2):

    +
      +
    • radv/xfb: fix counter buffer bounds checks.
    • +
    • virgl/vtest: fix front buffer flush with protocol version 0.
    • +
    + +

    Dylan Baker (6):

    +
      +
    • meson: Fix ppc64 little endian detection
    • +
    • meson: Add support for gnu hurd
    • +
    • meson: Add toggle for glx-direct
    • +
    • meson: Override C++ standard to gnu++11 when building with altivec on ppc64
    • +
    • meson: Error out if building nouveau and using LLVM without rtti
    • +
    • autotools: Remove tegra vdpau driver
    • +
    + +

    Emil Velikov (12):

    +
      +
    • docs: add sha256 checksums for 18.3.1
    • +
    • bin/get-pick-list.sh: rework handing of sha nominations
    • +
    • bin/get-pick-list.sh: warn when commit lists invalid sha
    • +
    • cherry-ignore: meson: libfreedreno depends upon libdrm (for fence support)
    • +
    • glx: mandate xf86vidmode only for "drm" dri platforms
    • +
    • meson: don't require glx/egl/gbm with gallium drivers
    • +
    • pipe-loader: meson: reference correct library
    • +
    • TODO: glx: meson: build dri based glx tests, only with -Dglx=dri
    • +
    • glx: meson: drop includes from a link-only library
    • +
    • glx: meson: wire up the dispatch-index-check test
    • +
    • glx/test: meson: assorted include fixes
    • +
    • Update version to 18.3.2
    • +
    + +

    Eric Anholt (6):

    +
      +
    • v3d: Fix a leak of the transfer helper on screen destroy.
    • +
    • vc4: Fix a leak of the transfer helper on screen destroy.
    • +
    • v3d: Fix a leak of the disassembled instruction string during debug dumps.
    • +
    • v3d: Make sure that a thrsw doesn't split a multop from its umul24.
    • +
    • v3d: Add missing flagging of SYNCB as a TSY op.
    • +
    • gallium/ttn: Fix setup of outputs_written.
    • +
    + +

    Erik Faye-Lund (2):

    +
      +
    • virgl: wrap vertex element state in a struct
    • +
    • virgl: work around bad assumptions in virglrenderer
    • +
    + +

    Francisco Jerez (5):

    • intel/fs: Handle source modifiers in lower_integer_multiplication().
    • intel/fs: Implement quad swizzles on ICL+.
    • intel/fs: Fix bug in lower_simd_width while splitting an instruction which was already split.
    • intel/eu/gen7: Fix brw_MOV() with DF destination and strided source.
    • intel/fs: Respect CHV/BXT regioning restrictions in copy propagation pass.

    Ian Romanick (2):

    • i965/vec4/dce: Don't narrow the write mask if the flags are used
    • Revert "nir/lower_indirect: Bail early if modes == 0"

    Jan Vesely (1):

    • clover: Fix build after clang r348827

    Jason Ekstrand (6):

    • nir/constant_folding: Fix source bit size logic
    • intel/blorp: Be more conservative about copying clear colors
    • spirv: Handle any bit size in vector_insert/extract
    • anv/apply_pipeline_layout: Set the cursor in lower_res_reindex_intrinsic
    • spirv: Sign-extend array indices
    • intel/peephole_ffma: Fix swizzle propagation

    Karol Herbst (1):

    • nv50/ir: fix use-after-free in ConstantFolding::visit

    Kirill Burtsev (1):

    • loader: free error state, when checking the drawable type

    Lionel Landwerlin (5):

    • anv: don't do partial resolve on layer > 0
    • i965: include draw_params/derived_draw_params for VF cache workaround
    • i965: add CS stall on VF invalidation workaround
    • anv: explictly specify format for blorp ccs/mcs op
    • anv: flush fast clear colors into compressed surfaces

    Marek Olšák (1):

    • st/mesa: don't leak pipe_surface if pipe_context is not current

    Mario Kleiner (1):

    • radeonsi: Fix use of 1- or 2- component GL_DOUBLE vbo's.

    Nicolai Hähnle (1):

    • meson: link LLVM 'native' component when LLVM is available

    Rhys Perry (3):

    • radv: don't set surf_index for stencil-only images
    • ac/nir,radv,radeonsi/nir: use correct indices for interpolation intrinsics
    • ac: split 16-bit ssbo loads that may not be dword aligned

    Rob Clark (2):

    • freedreno/drm: fix memory leak
    • mesa/st/nir: fix missing nir_compact_varyings

    Samuel Pitoiset (1):

    • radv: switch on EOP when primitive restart is enabled with triangle strips

    Timothy Arceri (2):

    • tgsi/scan: fix loop exit point in tgsi_scan_tess_ctrl()
    • tgsi/scan: correctly walk instructions in tgsi_scan_tess_ctrl()

    Vinson Lee (2):

    • meson: Fix typo.
    • meson: Fix libsensors detection.
    + + + +
    + + diff --git a/docs/submittingpatches.html b/docs/submittingpatches.html index e5350bdb2cf..d7ea0a310db 100644 --- a/docs/submittingpatches.html +++ b/docs/submittingpatches.html @@ -251,6 +251,9 @@

    Nominating a commit for a stable branch

    nomination request.

    The current patch status can be observed in the staging branch.

    The stable tag

    diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h index 6f9c2c8b8cf..48060ac8de6 100644 --- a/include/GL/internal/dri_interface.h +++ b/include/GL/internal/dri_interface.h @@ -1334,6 +1334,10 @@ struct __DRIdri2ExtensionRec { #define __DRI_IMAGE_FOURCC_YVU422 0x36315659 #define __DRI_IMAGE_FOURCC_YVU444 0x34325659 +#define __DRI_IMAGE_FOURCC_P010 0x30313050 +#define __DRI_IMAGE_FOURCC_P012 0x32313050 +#define __DRI_IMAGE_FOURCC_P016 0x36313050 + /** * Queryable on images created by createImageFromNames. * diff --git a/include/pci_ids/radeonsi_pci_ids.h b/include/pci_ids/radeonsi_pci_ids.h index 35ea3559b02..75ac7761bb4 100644 --- a/include/pci_ids/radeonsi_pci_ids.h +++ b/include/pci_ids/radeonsi_pci_ids.h @@ -219,6 +219,7 @@ CHIPSET(0x699F, POLARIS12) CHIPSET(0x694C, VEGAM) CHIPSET(0x694E, VEGAM) +CHIPSET(0x694F, VEGAM) CHIPSET(0x6860, VEGA10) CHIPSET(0x6861, VEGA10) @@ -227,8 +228,14 @@ CHIPSET(0x6863, VEGA10) CHIPSET(0x6864, VEGA10) CHIPSET(0x6867, VEGA10) CHIPSET(0x6868, VEGA10) -CHIPSET(0x687F, VEGA10) +CHIPSET(0x6869, VEGA10) +CHIPSET(0x686A, VEGA10) +CHIPSET(0x686B, VEGA10) CHIPSET(0x686C, VEGA10) +CHIPSET(0x686D, VEGA10) +CHIPSET(0x686E, VEGA10) +CHIPSET(0x686F, VEGA10) +CHIPSET(0x687F, VEGA10) CHIPSET(0x69A0, VEGA12) CHIPSET(0x69A1, VEGA12) @@ -240,6 +247,7 @@ CHIPSET(0x66A0, VEGA20) CHIPSET(0x66A1, VEGA20) CHIPSET(0x66A2, VEGA20) CHIPSET(0x66A3, VEGA20) +CHIPSET(0x66A4, VEGA20) CHIPSET(0x66A7, VEGA20) CHIPSET(0x66AF, VEGA20) diff --git a/meson.build b/meson.build index 18667988bac..5a20e1ea30d 100644 --- a/meson.build +++ b/meson.build @@ -54,6 +54,7 @@ with_valgrind = get_option('valgrind') with_libunwind = get_option('libunwind') with_asm = get_option('asm') with_glx_read_only_text = get_option('glx-read-only-text') +with_glx_direct = get_option('glx-direct') with_osmesa = get_option('osmesa') with_swr_arches = get_option('swr-arches') with_tools = get_option('tools') @@ -223,8 +224,6 @@ elif system_has_kms_drm 
else # FIXME: haiku doesn't use dri, and xlib doesn't use dri, probably should # assert here that one of those cases has been met. - # FIXME: GNU (hurd) ends up here as well, but meson doesn't officially - # support Hurd at time of writing (2017/11) # FIXME: illumos ends up here as well with_dri_platform = 'none' endif @@ -370,9 +369,6 @@ if with_glvnd endif endif -# TODO: toggle for this -with_glx_direct = true - if with_vulkan_icd_dir == '' with_vulkan_icd_dir = join_paths(get_option('datadir'), 'vulkan/icd.d') endif @@ -388,9 +384,9 @@ endif if with_any_vk and (with_platform_x11 and not with_dri3) error('Vulkan drivers require dri3 for X11 support') endif -if with_dri or with_gallium - if with_glx == 'disabled' and not with_egl and not with_platform_haiku - error('building dri or gallium drivers require at least one window system') +if with_dri + if with_glx == 'disabled' and not with_egl and not with_gbm + error('building dri drivers require at least one windowing system') endif endif @@ -620,7 +616,7 @@ if with_gallium_st_nine error('The nine state tracker requires gallium softpipe/llvmpipe.') elif not (with_gallium_radeonsi or with_gallium_nouveau or with_gallium_r600 or with_gallium_r300 or with_gallium_svga or with_gallium_i915) - error('The nine state tracker requires at least on non-swrast gallium driver.') + error('The nine state tracker requires at least one non-swrast gallium driver.') endif if not with_dri3 error('Using nine with wine requires dri3') @@ -628,7 +624,12 @@ if with_gallium_st_nine endif if get_option('power8') != 'false' - if host_machine.cpu_family() == 'ppc64le' + # on old versions of meson the cpu family would return as ppc64le on little + # endian power8, this was changed in 0.48 such that the family would always + # be ppc64 regardless of endianness, and the the machine.endian() value + # should be checked. Since we support versions < 0.48 we need to use + # startswith. 
+ if host_machine.cpu_family().startswith('ppc64') and host_machine.endian() == 'little' if cc.get_id() == 'gcc' and cc.version().version_compare('< 4.8') error('Altivec is not supported with gcc version < 4.8.') endif @@ -650,6 +651,7 @@ if get_option('power8') != 'false' endif _opencl = get_option('gallium-opencl') +clover_cpp_std = [] if _opencl != 'disabled' if not with_gallium error('OpenCL Clover implementation requires at least one gallium driver.') @@ -658,6 +660,14 @@ if _opencl != 'disabled' dep_clc = dependency('libclc') with_gallium_opencl = true with_opencl_icd = _opencl == 'icd' + + if host_machine.cpu_family().startswith('ppc') and cpp.compiles(''' + #if !defined(__VEC__) || !defined(__ALTIVEC__) + #error "AltiVec not enabled" + #endif''', + name : 'Altivec') + clover_cpp_std += ['cpp_std=gnu++11'] + endif else dep_clc = null_dep with_gallium_opencl = false @@ -781,13 +791,13 @@ if cc.compiles('int foo(void) __attribute__((__noreturn__));', endif # TODO: this is very incomplete -if ['linux', 'cygwin'].contains(host_machine.system()) +if ['linux', 'cygwin', 'gnu'].contains(host_machine.system()) pre_args += '-D_GNU_SOURCE' endif # Check for generic C arguments c_args = [] -foreach a : ['-Wall', '-Werror=implicit-function-declaration', +foreach a : ['-Werror=implicit-function-declaration', '-Werror=missing-prototypes', '-Werror=return-type', '-fno-math-errno', '-fno-trapping-math', '-Qunused-arguments'] @@ -809,7 +819,7 @@ endif # Check for generic C++ arguments cpp_args = [] -foreach a : ['-Wall', '-Werror=return-type', +foreach a : ['-Werror=return-type', '-fno-math-errno', '-fno-trapping-math', '-Qunused-arguments'] if cpp.has_argument(a) @@ -905,8 +915,9 @@ if not cc.links('''#include int main() { return __sync_add_and_fetch(&v, (uint64_t)1); }''', + dependencies : dep_atomic, name : 'GCC 64bit atomics') - pre_args += '-DMISSING_64_BIT_ATOMICS' + pre_args += '-DMISSING_64BIT_ATOMICS' endif # TODO: shared/static? Is this even worth doing? 
@@ -939,7 +950,7 @@ endif with_asm_arch = '' if with_asm if host_machine.cpu_family() == 'x86' - if system_has_kms_drm + if system_has_kms_drm or host_machine.system() == 'gnu' with_asm_arch = 'x86' pre_args += ['-DUSE_X86_ASM', '-DUSE_MMX_ASM', '-DUSE_3DNOW_ASM', '-DUSE_SSE_ASM'] @@ -968,7 +979,7 @@ if with_asm with_asm_arch = 'sparc' pre_args += ['-DUSE_SPARC_ASM'] endif - elif host_machine.cpu_family() == 'ppc64le' + elif host_machine.cpu_family().startswith('ppc64') and host_machine.endian() == 'little' if system_has_kms_drm with_asm_arch = 'ppc64le' pre_args += ['-DUSE_PPC64LE_ASM'] @@ -1162,7 +1173,7 @@ endif llvm_modules = ['bitwriter', 'engine', 'mcdisassembler', 'mcjit'] llvm_optional_modules = [] if with_amd_vk or with_gallium_radeonsi or with_gallium_r600 - llvm_modules += ['amdgpu', 'bitreader', 'ipo'] + llvm_modules += ['amdgpu', 'native', 'bitreader', 'ipo'] if with_gallium_r600 llvm_modules += 'asmparser' endif @@ -1223,6 +1234,9 @@ if with_llvm # programs, so we need to build all C++ code in mesa without rtti as well to # ensure that linking works. if dep_llvm.get_configtool_variable('has-rtti') == 'NO' + if with_gallium_nouveau + error('The Nouveau driver requires rtti. 
You either need to turn off nouveau or use an LLVM built with LLVM_ENABLE_RTTI.') + endif cpp_args += '-fno-rtti' endif elif with_amd_vk or with_gallium_radeonsi or with_gallium_swr @@ -1317,13 +1331,6 @@ if with_platform_wayland 'linux-dmabuf', 'linux-dmabuf-unstable-v1.xml' ) pre_args += ['-DHAVE_WAYLAND_PLATFORM', '-DWL_HIDE_DEPRECATED'] -else - prog_wl_scanner = [] - wl_scanner_arg = '' - dep_wl_protocols = null_dep - dep_wayland_client = null_dep - dep_wayland_server = null_dep - wayland_dmabuf_xml = '' endif dep_x11 = null_dep @@ -1356,7 +1363,6 @@ if with_platform_x11 dep_xdamage = dependency('xdamage', version : '>= 1.1') dep_xfixes = dependency('xfixes') dep_xcb_glx = dependency('xcb-glx', version : '>= 1.8.1') - dep_xxf86vm = dependency('xxf86vm', required : false) endif if (with_any_vk or with_glx == 'dri' or (with_gallium_vdpau or with_gallium_xvmc or with_gallium_va or @@ -1383,6 +1389,7 @@ if with_platform_x11 if with_glx == 'dri' if with_dri_platform == 'drm' dep_dri2proto = dependency('dri2proto', version : '>= 2.8') + dep_xxf86vm = dependency('xxf86vm') endif dep_glproto = dependency('glproto', version : '>= 1.4.14') endif @@ -1403,7 +1410,7 @@ endif _sensors = get_option('lmsensors') if _sensors != 'false' - dep_lmsensors = cc.find_library('libsensors', required : _sensors == 'true') + dep_lmsensors = cc.find_library('sensors', required : _sensors == 'true') if dep_lmsensors.found() pre_args += '-DHAVE_LIBSENSORS=1' endif @@ -1433,14 +1440,12 @@ elif with_glx == 'dri' 'xcb-glx >= 1.8.1'] if with_dri_platform == 'drm' gl_priv_reqs += 'xcb-dri2 >= 1.8' + gl_priv_reqs += 'xxf86vm' endif endif if dep_libdrm.found() gl_priv_reqs += 'libdrm >= 2.4.75' endif -if dep_xxf86vm.found() - gl_priv_reqs += 'xxf86vm' -endif gl_priv_libs = [] if dep_thread.found() diff --git a/meson_options.txt b/meson_options.txt index a1d5ab0e185..589d10bb3f3 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -318,3 +318,9 @@ option( choices : ['auto', 'true', 'false'], 
description : 'Enable VK_EXT_acquire_xlib_display.' ) +option( + 'glx-direct', + type : 'boolean', + value : true, + description : 'Enable direct rendering in GLX and EGL for DRI', +) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 1392ec0f238..8953da7f18d 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -2882,9 +2882,11 @@ LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, if (count == num_components) return value; - LLVMValueRef masks[] = { - ctx->i32_0, ctx->i32_1, - LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)}; + LLVMValueRef masks[MAX2(count, 2)]; + masks[0] = ctx->i32_0; + masks[1] = ctx->i32_1; + for (unsigned i = 2; i < count; i++) + masks[i] = LLVMConstInt(ctx->i32, i, false); if (count == 1) return LLVMBuildExtractElement(ctx->builder, value, masks[0], diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index e5fbe003f53..827cb5d85a8 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -311,9 +311,18 @@ static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx, } static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx, - LLVMValueRef src0) + LLVMValueRef src0, + unsigned bitsize) { - return LLVMBuildAnd(ctx->builder, src0, LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), ""); + LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, + LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), + ""); + result = LLVMBuildBitCast(ctx->builder, result, ctx->f32, ""); + + if (bitsize == 32) + return result; + + return LLVMBuildFPExt(ctx->builder, result, ctx->f64, ""); } static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, @@ -932,7 +941,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]); break; case nir_op_b2f: - 
result = emit_b2f(&ctx->ac, src[0]); + result = emit_b2f(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size); break; case nir_op_f2b: result = emit_f2b(&ctx->ac, src[0]); @@ -1613,37 +1622,45 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - LLVMValueRef results[2]; - int load_bytes; int elem_size_bytes = instr->dest.ssa.bit_size / 8; int num_components = instr->num_components; - int num_bytes = num_components * elem_size_bytes; enum gl_access_qualifier access = nir_intrinsic_access(instr); LLVMValueRef glc = ctx->ac.i1false; if (access & (ACCESS_VOLATILE | ACCESS_COHERENT)) glc = ctx->ac.i1true; - for (int i = 0; i < num_bytes; i += load_bytes) { - load_bytes = MIN2(num_bytes - i, 16); - const char *load_name; - LLVMTypeRef data_type; - LLVMValueRef offset = get_src(ctx, instr->src[1]); - LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i, false); - LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, - get_src(ctx, instr->src[0]), false); - LLVMValueRef vindex = ctx->ac.i32_0; + LLVMValueRef offset = get_src(ctx, instr->src[1]); + LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, + get_src(ctx, instr->src[0]), false); + LLVMValueRef vindex = ctx->ac.i32_0; + + LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa); + LLVMTypeRef def_elem_type = num_components > 1 ? LLVMGetElementType(def_type) : def_type; - int idx = i ? 
1 : 0; + LLVMValueRef results[4]; + for (int i = 0; i < num_components;) { + int num_elems = num_components - i; + if (elem_size_bytes < 4) + num_elems = 1; + if (num_elems * elem_size_bytes > 16) + num_elems = 16 / elem_size_bytes; + int load_bytes = num_elems * elem_size_bytes; + + LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false); + + LLVMValueRef ret; if (load_bytes == 2) { - results[idx] = ac_build_tbuffer_load_short(&ctx->ac, - rsrc, - vindex, - offset, - ctx->ac.i32_0, - immoffset, - glc); + ret = ac_build_tbuffer_load_short(&ctx->ac, + rsrc, + vindex, + offset, + ctx->ac.i32_0, + immoffset, + glc); } else { + const char *load_name; + LLVMTypeRef data_type; switch (load_bytes) { case 16: case 12: @@ -1669,33 +1686,23 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, glc, ctx->ac.i1false, }; - results[idx] = ac_build_intrinsic(&ctx->ac, load_name, data_type, params, 5, 0); - unsigned num_elems = ac_get_type_size(data_type) / elem_size_bytes; - LLVMTypeRef resTy = LLVMVectorType(LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size), num_elems); - results[idx] = LLVMBuildBitCast(ctx->ac.builder, results[idx], resTy, ""); + ret = ac_build_intrinsic(&ctx->ac, load_name, data_type, params, 5, 0); } - } - assume(results[0]); - LLVMValueRef ret = results[0]; - if (num_bytes > 16 || num_components == 3) { - LLVMValueRef masks[] = { - LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false), - LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), - }; + LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret))); + ret = LLVMBuildBitCast(ctx->ac.builder, ret, byte_vec, ""); + ret = ac_trim_vector(&ctx->ac, ret, load_bytes); - if (num_bytes > 16 && num_components == 3) { - /* we end up with a v2i64 and i64 but shuffle fails on that */ - results[1] = ac_build_expand(&ctx->ac, results[1], 1, 2); - } + LLVMTypeRef ret_type = LLVMVectorType(def_elem_type, 
num_elems); + ret = LLVMBuildBitCast(ctx->ac.builder, ret, ret_type, ""); - LLVMValueRef swizzle = LLVMConstVector(masks, num_components); - ret = LLVMBuildShuffleVector(ctx->ac.builder, results[0], - results[num_bytes > 16 ? 1 : 0], swizzle, ""); + for (unsigned j = 0; j < num_elems; j++) { + results[i + j] = LLVMBuildExtractElement(ctx->ac.builder, ret, LLVMConstInt(ctx->ac.i32, j, false), ""); + } + i += num_elems; } - return LLVMBuildBitCast(ctx->ac.builder, ret, - get_def_type(ctx, &instr->dest.ssa), ""); + return ac_build_gather_values(&ctx->ac, results, num_components); } static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, @@ -2371,17 +2378,27 @@ static void visit_image_store(struct ac_nir_context *ctx, glc = ctx->ac.i1true; if (dim == GLSL_SAMPLER_DIM_BUF) { + char name[48]; + const char *types[] = { "f32", "v2f32", "v4f32" }; LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true); + LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); + unsigned src_channels = ac_get_llvm_num_components(src); - params[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); /* data */ + if (src_channels == 3) + src = ac_build_expand(&ctx->ac, src, 3, 4); + + params[0] = src; /* data */ params[1] = rsrc; params[2] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, ""); /* vindex */ params[3] = ctx->ac.i32_0; /* voffset */ + snprintf(name, sizeof(name), "%s.%s", + "llvm.amdgcn.buffer.store.format", + types[CLAMP(src_channels, 1, 3) - 1]); + params[4] = glc; /* glc */ params[5] = ctx->ac.i1false; /* slc */ - ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32", ctx->ac.voidt, - params, 6, 0); + ac_build_intrinsic(&ctx->ac, name, ctx->ac.voidt, params, 6, 0); } else { struct ac_image_args args = {}; args.opcode = ac_image_store; @@ -2793,7 +2810,7 @@ static LLVMValueRef visit_interp(struct ac_nir_context *ctx, LLVMValueRef src0 = NULL; nir_variable *var = 
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - int input_index = var->data.location - VARYING_SLOT_VAR0; + int input_index = ctx->abi->fs_input_attr_indices[var->data.location - VARYING_SLOT_VAR0]; switch (instr->intrinsic) { case nir_intrinsic_interp_deref_at_centroid: location = INTERP_CENTROID; diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h index 6b9a91c92a9..ee18e6c1923 100644 --- a/src/amd/common/ac_shader_abi.h +++ b/src/amd/common/ac_shader_abi.h @@ -77,6 +77,9 @@ struct ac_shader_abi { */ LLVMValueRef *inputs; + /* Varying -> attribute number mapping. Also NIR-only */ + unsigned fs_input_attr_indices[MAX_VARYING]; + void (*emit_outputs)(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); diff --git a/src/amd/vulkan/Android.mk b/src/amd/vulkan/Android.mk index 51b03561fa7..9574bf54e5a 100644 --- a/src/amd/vulkan/Android.mk +++ b/src/amd/vulkan/Android.mk @@ -74,7 +74,8 @@ LOCAL_C_INCLUDES := \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_vulkan_util,,)/util LOCAL_WHOLE_STATIC_LIBRARIES := \ - libmesa_vulkan_util + libmesa_vulkan_util \ + libmesa_git_sha1 LOCAL_GENERATED_SOURCES += $(intermediates)/radv_entrypoints.c LOCAL_GENERATED_SOURCES += $(intermediates)/radv_entrypoints.h diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build index 0f1261d4809..cc2aa7fd17a 100644 --- a/src/amd/vulkan/meson.build +++ b/src/amd/vulkan/meson.build @@ -140,7 +140,7 @@ libvulkan_radeon = shared_library( ], dependencies : [ dep_llvm, dep_libdrm_amdgpu, dep_thread, dep_elf, dep_dl, dep_m, - dep_valgrind, + dep_valgrind, radv_deps, idep_nir, ], c_args : [c_vis_args, no_override_init_args, radv_flags], diff --git a/src/amd/vulkan/radv_android.c b/src/amd/vulkan/radv_android.c index f5d70825dd2..1a4425f26a5 100644 --- a/src/amd/vulkan/radv_android.c +++ b/src/amd/vulkan/radv_android.c @@ -110,17 +110,6 @@ radv_image_from_gralloc(VkDevice device_h, struct radv_bo *bo 
= NULL; VkResult result; - result = radv_image_create(device_h, - &(struct radv_image_create_info) { - .vk_info = base_info, - .scanout = true, - .no_metadata_planes = true}, - alloc, - &image_h); - - if (result != VK_SUCCESS) - return result; - if (gralloc_info->handle->numFds != 1) { return vk_errorf(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR, "VkNativeBufferANDROID::handle::numFds is %d, " @@ -133,23 +122,14 @@ radv_image_from_gralloc(VkDevice device_h, */ int dma_buf = gralloc_info->handle->data[0]; - image = radv_image_from_handle(image_h); - VkDeviceMemory memory_h; - const VkMemoryDedicatedAllocateInfoKHR ded_alloc = { - .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR, - .pNext = NULL, - .buffer = VK_NULL_HANDLE, - .image = image_h - }; - const VkImportMemoryFdInfoKHR import_info = { .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, - .pNext = &ded_alloc, .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR, .fd = dup(dma_buf), }; + /* Find the first VRAM memory type, or GART for PRIME images. */ int memory_type_index = -1; for (int i = 0; i < device->physical_device->memory_properties.memoryTypeCount; ++i) { @@ -168,14 +148,49 @@ radv_image_from_gralloc(VkDevice device_h, &(VkMemoryAllocateInfo) { .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, .pNext = &import_info, - .allocationSize = image->size, + /* Max buffer size, unused for imports */ + .allocationSize = 0x7FFFFFFF, .memoryTypeIndex = memory_type_index, }, alloc, &memory_h); + if (result != VK_SUCCESS) + return result; + + struct radeon_bo_metadata md; + device->ws->buffer_get_metadata(radv_device_memory_from_handle(memory_h)->bo, &md); + + bool is_scanout; + if (device->physical_device->rad_info.chip_class >= GFX9) { + /* Copied from radeonsi, but is hacky so should be cleaned up. 
*/ + is_scanout = md.u.gfx9.swizzle_mode == 0 || md.u.gfx9.swizzle_mode % 4 == 2; + } else { + is_scanout = md.u.legacy.scanout; + } + + VkImageCreateInfo updated_base_info = *base_info; + + VkExternalMemoryImageCreateInfo external_memory_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, + .pNext = updated_base_info.pNext, + .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, + }; + + updated_base_info.pNext = &external_memory_info; + + result = radv_image_create(device_h, + &(struct radv_image_create_info) { + .vk_info = &updated_base_info, + .scanout = is_scanout, + .no_metadata_planes = true}, + alloc, + &image_h); + if (result != VK_SUCCESS) goto fail_create_image; + image = radv_image_from_handle(image_h); + radv_BindImageMemory(device_h, image_h, memory_h, 0); image->owned_memory = memory_h; @@ -185,9 +200,7 @@ radv_image_from_gralloc(VkDevice device_h, return VK_SUCCESS; fail_create_image: -fail_size: - radv_DestroyImage(device_h, image_h, alloc); - + radv_FreeMemory(device_h, memory_h, alloc); return result; } diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index c43e12f6d62..4ebb01c6810 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -1068,7 +1068,7 @@ static void radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, struct radv_image *image, VkImageLayout layout, - bool requires_cond_write) + bool requires_cond_exec) { uint32_t db_z_info = ds->db_z_info; uint32_t db_z_info_reg; @@ -1092,38 +1092,21 @@ radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, } /* When we don't know the last fast clear value we need to emit a - * conditional packet, otherwise we can update DB_Z_INFO directly. + * conditional packet that will eventually skip the following + * SET_CONTEXT_REG packet. 
*/ - if (requires_cond_write) { - radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_WRITE, 7, 0)); - - const uint32_t write_space = 0 << 8; /* register */ - const uint32_t poll_space = 1 << 4; /* memory */ - const uint32_t function = 3 << 0; /* equal to the reference */ - const uint32_t options = write_space | poll_space | function; - radeon_emit(cmd_buffer->cs, options); - - /* poll address - location of the depth clear value */ + if (requires_cond_exec) { uint64_t va = radv_buffer_get_va(image->bo); - va += image->offset + image->clear_value_offset; - - /* In presence of stencil format, we have to adjust the base - * address because the first value is the stencil clear value. - */ - if (vk_format_is_stencil(image->vk_format)) - va += 4; + va += image->offset + image->tc_compat_zrange_offset; + radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0)); radeon_emit(cmd_buffer->cs, va); radeon_emit(cmd_buffer->cs, va >> 32); - - radeon_emit(cmd_buffer->cs, fui(0.0f)); /* reference value */ - radeon_emit(cmd_buffer->cs, (uint32_t)-1); /* comparison mask */ - radeon_emit(cmd_buffer->cs, db_z_info_reg >> 2); /* write address low */ - radeon_emit(cmd_buffer->cs, 0u); /* write address high */ - radeon_emit(cmd_buffer->cs, db_z_info); - } else { - radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info); + radeon_emit(cmd_buffer->cs, 0); + radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */ } + + radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info); } static void @@ -1270,6 +1253,45 @@ radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, radeon_emit(cs, fui(ds_clear_value.depth)); } +/** + * Update the TC-compat metadata value for this image. 
+ */ +static void +radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + uint32_t value) +{ + struct radeon_cmdbuf *cs = cmd_buffer->cs; + uint64_t va = radv_buffer_get_va(image->bo); + va += image->offset + image->tc_compat_zrange_offset; + + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) | + S_370_WR_CONFIRM(1) | + S_370_ENGINE_SEL(V_370_PFP)); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, value); +} + +static void +radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, + struct radv_image *image, + VkClearDepthStencilValue ds_clear_value) +{ + struct radeon_cmdbuf *cs = cmd_buffer->cs; + uint64_t va = radv_buffer_get_va(image->bo); + va += image->offset + image->tc_compat_zrange_offset; + uint32_t cond_val; + + /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last + * depth clear value is 0.0f. + */ + cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0; + + radv_set_tc_compat_zrange_metadata(cmd_buffer, image, cond_val); +} + /** * Update the clear depth/stencil values for this image. */ @@ -1283,6 +1305,12 @@ radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, radv_set_ds_clear_metadata(cmd_buffer, image, ds_clear_value, aspects); + if (radv_image_is_tc_compat_htile(image) && + (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) { + radv_update_tc_compat_zrange_metadata(cmd_buffer, image, + ds_clear_value); + } + radv_update_bound_fast_clear_ds(cmd_buffer, image, ds_clear_value, aspects); } @@ -1950,6 +1978,8 @@ radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer) va = radv_buffer_get_va(buffer->bo) + buffer->offset; + va += sb[i].offset; + /* Set the descriptor. 
* * On VI, the format must be non-INVALID, otherwise @@ -3518,8 +3548,13 @@ static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL; - /* Index & Vertex buffer don't change context regs, and pipeline is handled later. */ - used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_PIPELINE); + /* Index, vertex and streamout buffers don't change context regs, and + * pipeline is handled later. + */ + used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | + RADV_CMD_DIRTY_VERTEX_BUFFER | + RADV_CMD_DIRTY_STREAMOUT_BUFFER | + RADV_CMD_DIRTY_PIPELINE); /* Assume all state changes except these two can imply context rolls. */ if (cmd_buffer->state.dirty & used_states) @@ -4185,6 +4220,15 @@ static void radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; radv_set_ds_clear_metadata(cmd_buffer, image, value, aspects); + + if (radv_image_is_tc_compat_htile(image)) { + /* Initialize the TC-compat metada value to 0 because by + * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only + * need have to conditionally update its value when performing + * a fast depth clear. + */ + radv_set_tc_compat_zrange_metadata(cmd_buffer, image, 0); + } } static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, @@ -4613,6 +4657,8 @@ void radv_CmdBeginConditionalRenderingEXT( draw_visible = false; } + si_emit_cache_flush(cmd_buffer); + /* Enable predication for this command buffer. 
*/ si_emit_set_predication_state(cmd_buffer, draw_visible, va); cmd_buffer->state.predicating = true; @@ -4741,28 +4787,30 @@ void radv_CmdBeginTransformFeedbackEXT( struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings; struct radv_streamout_state *so = &cmd_buffer->state.streamout; struct radeon_cmdbuf *cs = cmd_buffer->cs; + uint32_t i; radv_flush_vgt_streamout(cmd_buffer); assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); - for (uint32_t i = firstCounterBuffer; i < counterBufferCount; i++) { - if (!(so->enabled_mask & (1 << i))) - continue; + for_each_bit(i, so->enabled_mask) { + int32_t counter_buffer_idx = i - firstCounterBuffer; + if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) + counter_buffer_idx = -1; /* SI binds streamout buffers as shader resources. * VGT only counts primitives and tells the shader through * SGPRs what to do. */ radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2); - radeon_emit(cs, (sb[i].offset + sb[i].size) >> 2); /* BUFFER_SIZE (in DW) */ + radeon_emit(cs, sb[i].size >> 2); /* BUFFER_SIZE (in DW) */ radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */ - if (pCounterBuffers && pCounterBuffers[i]) { + if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) { /* The array of counter buffers is optional. 
*/ - RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[i]); + RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); uint64_t va = radv_buffer_get_va(buffer->bo); - va += buffer->offset + pCounterBufferOffsets[i]; + va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx]; /* Append */ radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); @@ -4783,7 +4831,7 @@ void radv_CmdBeginTransformFeedbackEXT( STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ radeon_emit(cs, 0); /* unused */ radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, sb[i].offset >> 2); /* buffer offset in DW */ + radeon_emit(cs, 0); /* unused */ radeon_emit(cs, 0); /* unused */ } } @@ -4801,20 +4849,22 @@ void radv_CmdEndTransformFeedbackEXT( RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); struct radv_streamout_state *so = &cmd_buffer->state.streamout; struct radeon_cmdbuf *cs = cmd_buffer->cs; + uint32_t i; radv_flush_vgt_streamout(cmd_buffer); assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS); - for (uint32_t i = firstCounterBuffer; i < counterBufferCount; i++) { - if (!(so->enabled_mask & (1 << i))) - continue; + for_each_bit(i, so->enabled_mask) { + int32_t counter_buffer_idx = i - firstCounterBuffer; + if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount) + counter_buffer_idx = -1; - if (pCounterBuffers && pCounterBuffers[i]) { + if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) { /* The array of counters buffer is optional. 
*/ - RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[i]); + RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]); uint64_t va = radv_buffer_get_va(buffer->bo); - va += buffer->offset + pCounterBufferOffsets[i]; + va += buffer->offset + pCounterBufferOffsets[counter_buffer_idx]; radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index d68111c25bf..ac6cff23d58 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -936,9 +936,9 @@ void radv_GetPhysicalDeviceProperties( 2048, 2048 }, - .subPixelPrecisionBits = 4 /* FIXME */, - .subTexelPrecisionBits = 4 /* FIXME */, - .mipmapPrecisionBits = 4 /* FIXME */, + .subPixelPrecisionBits = 8, + .subTexelPrecisionBits = 8, + .mipmapPrecisionBits = 8, .maxDrawIndexedIndexValue = UINT32_MAX, .maxDrawIndirectCount = UINT32_MAX, .maxSamplerLodBias = 16, @@ -1054,16 +1054,14 @@ void radv_GetPhysicalDeviceProperties2( (VkPhysicalDeviceSubgroupProperties*)ext; properties->subgroupSize = 64; properties->supportedStages = VK_SHADER_STAGE_ALL; - /* TODO: Enable VK_SUBGROUP_FEATURE_VOTE_BIT when wwm - * is fixed in LLVM. 
- */ properties->supportedOperations = - VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT | - VK_SUBGROUP_FEATURE_QUAD_BIT; + VK_SUBGROUP_FEATURE_QUAD_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT; if (pdevice->rad_info.chip_class >= VI) { properties->supportedOperations |= + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | VK_SUBGROUP_FEATURE_SHUFFLE_BIT | VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT; } diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index 6bdf988d117..4a28f8bf41c 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -105,7 +105,7 @@ def __init__(self, name, ext_version, enable): Extension('VK_EXT_external_memory_dma_buf', 1, True), Extension('VK_EXT_external_memory_host', 1, 'device->rad_info.has_userptr'), Extension('VK_EXT_global_priority', 1, 'device->rad_info.has_ctx_priority'), - Extension('VK_EXT_pci_bus_info', 1, True), + Extension('VK_EXT_pci_bus_info', 1, False), Extension('VK_EXT_sampler_filter_minmax', 1, 'device->rad_info.chip_class >= CIK'), Extension('VK_EXT_shader_viewport_index_layer', 1, True), Extension('VK_EXT_shader_stencil_export', 1, True), diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c index 64346aa340f..daabc489afb 100644 --- a/src/amd/vulkan/radv_image.c +++ b/src/amd/vulkan/radv_image.c @@ -691,7 +691,7 @@ radv_query_opaque_metadata(struct radv_device *device, si_make_texture_descriptor(device, image, false, (VkImageViewType)image->type, image->vk_format, &fixedmapping, 0, image->info.levels - 1, 0, - image->info.array_size, + image->info.array_size - 1, image->info.width, image->info.height, image->info.depth, desc, NULL); @@ -870,6 +870,14 @@ radv_image_alloc_htile(struct radv_image *image) /* + 8 for storing the clear values */ image->clear_value_offset = image->htile_offset + image->surface.htile_size; image->size = image->clear_value_offset + 8; + if (radv_image_is_tc_compat_htile(image)) { + /* 
Metadata for the TC-compatible HTILE hardware bug which + * have to be fixed by updating ZRANGE_PRECISION when doing + * fast depth clears to 0.0f. + */ + image->tc_compat_zrange_offset = image->clear_value_offset + 8; + image->size = image->clear_value_offset + 16; + } image->alignment = align64(image->alignment, image->surface.htile_alignment); } @@ -977,7 +985,7 @@ radv_image_create(VkDevice _device, image->shareable = vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR) != NULL; - if (!vk_format_is_depth(pCreateInfo->format) && !create_info->scanout && !image->shareable) { + if (!vk_format_is_depth_or_stencil(pCreateInfo->format) && !create_info->scanout && !image->shareable) { image->info.surf_index = &device->image_mrt_offset_counter; } @@ -1014,8 +1022,8 @@ radv_image_create(VkDevice _device, /* Otherwise, try to enable HTILE for depth surfaces. */ if (radv_image_can_enable_htile(image) && !(device->instance->debug_flags & RADV_DEBUG_NO_HIZ)) { - radv_image_alloc_htile(image); image->tc_compatible_htile = image->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE; + radv_image_alloc_htile(image); } else { image->surface.htile_size = 0; } @@ -1175,8 +1183,6 @@ radv_image_view_init(struct radv_image_view *iview, if (device->physical_device->rad_info.chip_class >= GFX9 && vk_format_is_compressed(image->vk_format) && !vk_format_is_compressed(iview->vk_format)) { - unsigned rounded_img_w = util_next_power_of_two(iview->extent.width); - unsigned rounded_img_h = util_next_power_of_two(iview->extent.height); unsigned lvl_width = radv_minify(image->info.width , range->baseMipLevel); unsigned lvl_height = radv_minify(image->info.height, range->baseMipLevel); @@ -1186,8 +1192,8 @@ radv_image_view_init(struct radv_image_view *iview, lvl_width <<= range->baseMipLevel; lvl_height <<= range->baseMipLevel; - iview->extent.width = CLAMP(lvl_width, iview->extent.width, rounded_img_w); - iview->extent.height = CLAMP(lvl_height, iview->extent.height, 
rounded_img_h); + iview->extent.width = CLAMP(lvl_width, iview->extent.width, iview->image->surface.u.gfx9.surf_pitch); + iview->extent.height = CLAMP(lvl_height, iview->extent.height, iview->image->surface.u.gfx9.surf_height); } } diff --git a/src/amd/vulkan/radv_meta_bufimage.c b/src/amd/vulkan/radv_meta_bufimage.c index 6f074a70b4c..e9d680437e4 100644 --- a/src/amd/vulkan/radv_meta_bufimage.c +++ b/src/amd/vulkan/radv_meta_bufimage.c @@ -2061,7 +2061,7 @@ radv_meta_image_to_image_cs(struct radv_cmd_buffer *cmd_buffer, itoi_bind_descriptors(cmd_buffer, &src_view, &dst_view); if (device->physical_device->rad_info.chip_class >= GFX9 && - src->image->type == VK_IMAGE_TYPE_3D) + (src->image->type == VK_IMAGE_TYPE_3D || dst->image->type == VK_IMAGE_TYPE_3D)) pipeline = cmd_buffer->device->meta_state.itoi.pipeline_3d; radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index f56eb01dc52..8c21c423511 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2242,6 +2242,8 @@ handle_fs_inputs(struct radv_shader_context *ctx, if (LLVMIsUndef(interp_param)) ctx->shader_info->fs.flat_shaded_mask |= 1u << index; + if (i >= VARYING_SLOT_VAR0) + ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index; ++index; } else if (i == VARYING_SLOT_CLIP_DIST0) { int length = ctx->shader_info->info.ps.num_input_clips_culls; diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index bced19573c1..cc025f55ea3 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3396,8 +3396,7 @@ radv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline, (pipeline->graphics.prim_restart_enable && (device->physical_device->rad_info.family < CHIP_POLARIS10 || (prim != V_008958_DI_PT_POINTLIST && - prim != V_008958_DI_PT_LINESTRIP && - prim != V_008958_DI_PT_TRISTRIP)))) + 
prim != V_008958_DI_PT_LINESTRIP)))) ia_multi_vgt_param.wd_switch_on_eop = true; } diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 7e9e82e3158..585702a88b2 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -595,6 +595,7 @@ struct radv_meta_state { VkPipelineLayout p_layout; VkPipeline occlusion_query_pipeline; VkPipeline pipeline_statistics_query_pipeline; + VkPipeline tfb_query_pipeline; } query; }; @@ -1497,6 +1498,14 @@ struct radv_image { uint64_t clear_value_offset; uint64_t dcc_pred_offset; + /* + * Metadata for the TC-compat zrange workaround. If the 32-bit value + * stored at this offset is UINT_MAX, the driver will emit + * DB_Z_INFO.ZRANGE_PRECISION=0, otherwise it will skip the + * SET_CONTEXT_REG packet. + */ + uint64_t tc_compat_zrange_offset; + /* For VK_ANDROID_native_buffer, the WSI image owns the memory, */ VkDeviceMemory owned_memory; }; diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c index 57ea22fb847..cdff336f8a3 100644 --- a/src/amd/vulkan/radv_query.c +++ b/src/amd/vulkan/radv_query.c @@ -512,11 +512,233 @@ build_pipeline_statistics_query_shader(struct radv_device *device) { return b.shader; } +static nir_shader * +build_tfb_query_shader(struct radv_device *device) +{ + /* the shader this builds is roughly + * + * uint32_t src_stride = 32; + * + * location(binding = 0) buffer dst_buf; + * location(binding = 1) buffer src_buf; + * + * void main() { + * uint64_t result[2] = {}; + * bool available = false; + * uint64_t src_offset = src_stride * global_id.x; + * uint64_t dst_offset = dst_stride * global_id.x; + * uint64_t *src_data = src_buf[src_offset]; + * uint32_t avail = (src_data[0] >> 32) & + * (src_data[1] >> 32) & + * (src_data[2] >> 32) & + * (src_data[3] >> 32); + * if (avail & 0x80000000) { + * result[0] = src_data[3] - src_data[1]; + * result[1] = src_data[2] - src_data[0]; + * available = true; + * } + * uint32_t result_size = flags & 
VK_QUERY_RESULT_64_BIT ? 16 : 8; + * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) { + * if (flags & VK_QUERY_RESULT_64_BIT) { + * dst_buf[dst_offset] = result; + * } else { + * dst_buf[dst_offset] = (uint32_t)result; + * } + * } + * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { + * dst_buf[dst_offset + result_size] = available; + * } + * } + */ + nir_builder b; + nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL); + b.shader->info.name = ralloc_strdup(b.shader, "tfb_query"); + b.shader->info.cs.local_size[0] = 64; + b.shader->info.cs.local_size[1] = 1; + b.shader->info.cs.local_size[2] = 1; + + /* Create and initialize local variables. */ + nir_variable *result = + nir_local_variable_create(b.impl, + glsl_vector_type(GLSL_TYPE_UINT64, 2), + "result"); + nir_variable *available = + nir_local_variable_create(b.impl, glsl_int_type(), "available"); + + nir_store_var(&b, result, + nir_vec2(&b, nir_imm_int64(&b, 0), + nir_imm_int64(&b, 0)), 0x3); + nir_store_var(&b, available, nir_imm_int(&b, 0), 0x1); + + nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags"); + + /* Load resources. */ + nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader, + nir_intrinsic_vulkan_resource_index); + dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); + nir_intrinsic_set_desc_set(dst_buf, 0); + nir_intrinsic_set_binding(dst_buf, 0); + nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, 1, 32, NULL); + nir_builder_instr_insert(&b, &dst_buf->instr); + + nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader, + nir_intrinsic_vulkan_resource_index); + src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); + nir_intrinsic_set_desc_set(src_buf, 0); + nir_intrinsic_set_binding(src_buf, 1); + nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, 1, 32, NULL); + nir_builder_instr_insert(&b, &src_buf->instr); + + /* Compute global ID. 
*/ + nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0); + nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0); + nir_ssa_def *block_size = nir_imm_ivec4(&b, + b.shader->info.cs.local_size[0], + b.shader->info.cs.local_size[1], + b.shader->info.cs.local_size[2], 0); + nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + global_id = nir_channel(&b, global_id, 0); // We only care about x here. + + /* Compute src/dst strides. */ + nir_ssa_def *input_stride = nir_imm_int(&b, 32); + nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); + nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride"); + nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id); + + /* Load data from the query pool. */ + nir_intrinsic_instr *load1 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo); + load1->src[0] = nir_src_for_ssa(&src_buf->dest.ssa); + load1->src[1] = nir_src_for_ssa(input_base); + nir_ssa_dest_init(&load1->instr, &load1->dest, 4, 32, NULL); + load1->num_components = 4; + nir_builder_instr_insert(&b, &load1->instr); + + nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo); + load2->src[0] = nir_src_for_ssa(&src_buf->dest.ssa); + load2->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base, nir_imm_int(&b, 16))); + nir_ssa_dest_init(&load2->instr, &load2->dest, 4, 32, NULL); + load2->num_components = 4; + nir_builder_instr_insert(&b, &load2->instr); + + /* Check if result is available. */ + nir_ssa_def *avails[2]; + avails[0] = nir_iand(&b, nir_channel(&b, &load1->dest.ssa, 1), + nir_channel(&b, &load1->dest.ssa, 3)); + avails[1] = nir_iand(&b, nir_channel(&b, &load2->dest.ssa, 1), + nir_channel(&b, &load2->dest.ssa, 3)); + nir_ssa_def *result_is_available = + nir_iand(&b, nir_iand(&b, avails[0], avails[1]), + nir_imm_int(&b, 0x80000000)); + + /* Only compute result if available. 
*/ + nir_if *available_if = nir_if_create(b.shader); + available_if->condition = nir_src_for_ssa(result_is_available); + nir_cf_node_insert(b.cursor, &available_if->cf_node); + + b.cursor = nir_after_cf_list(&available_if->then_list); + + /* Pack values. */ + nir_ssa_def *packed64[4]; + packed64[0] = nir_pack_64_2x32(&b, nir_vec2(&b, + nir_channel(&b, &load1->dest.ssa, 0), + nir_channel(&b, &load1->dest.ssa, 1))); + packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b, + nir_channel(&b, &load1->dest.ssa, 2), + nir_channel(&b, &load1->dest.ssa, 3))); + packed64[2] = nir_pack_64_2x32(&b, nir_vec2(&b, + nir_channel(&b, &load2->dest.ssa, 0), + nir_channel(&b, &load2->dest.ssa, 1))); + packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b, + nir_channel(&b, &load2->dest.ssa, 2), + nir_channel(&b, &load2->dest.ssa, 3))); + + /* Compute result. */ + nir_ssa_def *num_primitive_written = + nir_isub(&b, packed64[3], packed64[1]); + nir_ssa_def *primitive_storage_needed = + nir_isub(&b, packed64[2], packed64[0]); + + nir_store_var(&b, result, + nir_vec2(&b, num_primitive_written, + primitive_storage_needed), 0x3); + nir_store_var(&b, available, nir_imm_int(&b, 1), 0x1); + + b.cursor = nir_after_cf_node(&available_if->cf_node); + + /* Determine if result is 64 or 32 bit. */ + nir_ssa_def *result_is_64bit = + nir_iand(&b, flags, nir_imm_int(&b, VK_QUERY_RESULT_64_BIT)); + nir_ssa_def *result_size = + nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16), + nir_imm_int(&b, 8)); + + /* Store the result if complete or partial results have been requested. */ + nir_if *store_if = nir_if_create(b.shader); + store_if->condition = + nir_src_for_ssa(nir_ior(&b, nir_iand(&b, flags, + nir_imm_int(&b, VK_QUERY_RESULT_PARTIAL_BIT)), + nir_load_var(&b, available))); + nir_cf_node_insert(b.cursor, &store_if->cf_node); + + b.cursor = nir_after_cf_list(&store_if->then_list); + + /* Store result. 
*/ + nir_if *store_64bit_if = nir_if_create(b.shader); + store_64bit_if->condition = nir_src_for_ssa(result_is_64bit); + nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node); + + b.cursor = nir_after_cf_list(&store_64bit_if->then_list); + + nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); + store->src[0] = nir_src_for_ssa(nir_load_var(&b, result)); + store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); + store->src[2] = nir_src_for_ssa(output_base); + nir_intrinsic_set_write_mask(store, 0x3); + store->num_components = 2; + nir_builder_instr_insert(&b, &store->instr); + + b.cursor = nir_after_cf_list(&store_64bit_if->else_list); + + store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); + store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result))); + store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); + store->src[2] = nir_src_for_ssa(output_base); + nir_intrinsic_set_write_mask(store, 0x3); + store->num_components = 2; + nir_builder_instr_insert(&b, &store->instr); + + b.cursor = nir_after_cf_node(&store_64bit_if->cf_node); + + b.cursor = nir_after_cf_node(&store_if->cf_node); + + /* Store the availability bit if requested. 
*/ + nir_if *availability_if = nir_if_create(b.shader); + availability_if->condition = + nir_src_for_ssa(nir_iand(&b, flags, + nir_imm_int(&b, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT))); + nir_cf_node_insert(b.cursor, &availability_if->cf_node); + + b.cursor = nir_after_cf_list(&availability_if->then_list); + + store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo); + store->src[0] = nir_src_for_ssa(nir_load_var(&b, available)); + store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa); + store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base)); + nir_intrinsic_set_write_mask(store, 0x1); + store->num_components = 1; + nir_builder_instr_insert(&b, &store->instr); + + b.cursor = nir_after_cf_node(&availability_if->cf_node); + + return b.shader; +} + static VkResult radv_device_init_meta_query_state_internal(struct radv_device *device) { VkResult result; struct radv_shader_module occlusion_cs = { .nir = NULL }; struct radv_shader_module pipeline_statistics_cs = { .nir = NULL }; + struct radv_shader_module tfb_cs = { .nir = NULL }; mtx_lock(&device->meta_state.mtx); if (device->meta_state.query.pipeline_statistics_query_pipeline) { @@ -525,6 +747,7 @@ static VkResult radv_device_init_meta_query_state_internal(struct radv_device *d } occlusion_cs.nir = build_occlusion_query_shader(device); pipeline_statistics_cs.nir = build_pipeline_statistics_query_shader(device); + tfb_cs.nir = build_tfb_query_shader(device); VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = { .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, @@ -611,12 +834,34 @@ static VkResult radv_device_init_meta_query_state_internal(struct radv_device *d radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, &pipeline_statistics_vk_pipeline_info, NULL, &device->meta_state.query.pipeline_statistics_query_pipeline); + if (result != VK_SUCCESS) + goto fail; + VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = { + .sType = 
VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = radv_shader_module_to_handle(&tfb_cs), + .pName = "main", + .pSpecializationInfo = NULL, + }; + + VkComputePipelineCreateInfo tfb_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = tfb_pipeline_shader_stage, + .flags = 0, + .layout = device->meta_state.query.p_layout, + }; + + result = radv_CreateComputePipelines(radv_device_to_handle(device), + radv_pipeline_cache_to_handle(&device->meta_state.cache), + 1, &tfb_pipeline_info, NULL, + &device->meta_state.query.tfb_query_pipeline); fail: if (result != VK_SUCCESS) radv_device_finish_meta_query_state(device); ralloc_free(occlusion_cs.nir); ralloc_free(pipeline_statistics_cs.nir); + ralloc_free(tfb_cs.nir); mtx_unlock(&device->meta_state.mtx); return result; } @@ -631,6 +876,11 @@ VkResult radv_device_init_meta_query_state(struct radv_device *device, bool on_d void radv_device_finish_meta_query_state(struct radv_device *device) { + if (device->meta_state.query.tfb_query_pipeline) + radv_DestroyPipeline(radv_device_to_handle(device), + device->meta_state.query.tfb_query_pipeline, + &device->meta_state.alloc); + if (device->meta_state.query.pipeline_statistics_query_pipeline) radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.query.pipeline_statistics_query_pipeline, @@ -663,6 +913,7 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer, { struct radv_device *device = cmd_buffer->device; struct radv_meta_saved_state saved_state; + bool old_predicating; if (!*pipeline) { VkResult ret = radv_device_init_meta_query_state_internal(device); @@ -677,6 +928,12 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer, RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS); + /* VK_EXT_conditional_rendering says that copy commands should not be + * affected by conditional rendering. 
+ */ + old_predicating = cmd_buffer->state.predicating; + cmd_buffer->state.predicating = false; + struct radv_buffer dst_buffer = { .bo = dst_bo, .offset = dst_offset, @@ -758,6 +1015,8 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2 | RADV_CMD_FLAG_INV_VMEM_L1 | RADV_CMD_FLAG_CS_PARTIAL_FLUSH; + /* Restore conditional rendering. */ + cmd_buffer->state.predicating = old_predicating; radv_meta_restore(&saved_state, cmd_buffer); } @@ -1082,10 +1341,13 @@ void radv_CmdCopyQueryPoolResults( if (flags & VK_QUERY_RESULT_WAIT_BIT) { + /* Wait on the high 32 bits of the timestamp in + * case the low part is 0xffffffff. + */ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, false)); radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL | WAIT_REG_MEM_MEM_SPACE(1)); - radeon_emit(cs, local_src_va); - radeon_emit(cs, local_src_va >> 32); + radeon_emit(cs, local_src_va + 4); + radeon_emit(cs, (local_src_va + 4) >> 32); radeon_emit(cs, TIMESTAMP_NOT_READY >> 32); radeon_emit(cs, 0xffffffff); radeon_emit(cs, 4); @@ -1115,6 +1377,33 @@ void radv_CmdCopyQueryPoolResults( assert(cs->cdw <= cdw_max); } break; + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + if (flags & VK_QUERY_RESULT_WAIT_BIT) { + for(unsigned i = 0; i < queryCount; i++) { + unsigned query = firstQuery + i; + uint64_t src_va = va + query * pool->stride; + + /* Wait on the upper word of all results. 
*/ + for (unsigned j = 0; j < 4; j++, src_va += 8) { + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_GREATER_OR_EQUAL | + WAIT_REG_MEM_MEM_SPACE(1)); + radeon_emit(cs, (src_va + 4)); + radeon_emit(cs, (src_va + 4) >> 32); + radeon_emit(cs, 0x80000000); /* reference value */ + radeon_emit(cs, 0xffffffff); /* mask */ + radeon_emit(cs, 4); /* poll interval */ + } + } + } + + radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline, + pool->bo, dst_buffer->bo, + firstQuery * pool->stride, + dst_buffer->offset + dstOffset, + pool->stride, stride, + queryCount, flags, 0, 0); + break; default: unreachable("trying to get results of unhandled query type"); } @@ -1161,6 +1450,22 @@ static unsigned event_type_for_stream(unsigned stream) } } +static void emit_query_flush(struct radv_cmd_buffer *cmd_buffer, + struct radv_query_pool *pool) +{ + if (cmd_buffer->pending_reset_query) { + if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) { + /* Only need to flush caches if the query pool size is + * large enough to be resetted using the compute shader + * path. Small pools don't need any cache flushes + * because we use a CP dma clear. + */ + si_emit_cache_flush(cmd_buffer); + cmd_buffer->pending_reset_query = false; + } + } +} + static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer, uint64_t va, VkQueryType query_type, @@ -1307,17 +1612,7 @@ void radv_CmdBeginQueryIndexedEXT( radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo); - if (cmd_buffer->pending_reset_query) { - if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) { - /* Only need to flush caches if the query pool size is - * large enough to be resetted using the compute shader - * path. Small pools don't need any cache flushes - * because we use a CP dma clear. 
- */ - si_emit_cache_flush(cmd_buffer); - cmd_buffer->pending_reset_query = false; - } - } + emit_query_flush(cmd_buffer, pool); va += pool->stride * query; @@ -1394,6 +1689,8 @@ void radv_CmdWriteTimestamp( radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo); + emit_query_flush(cmd_buffer, pool); + int num_queries = 1; if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask); diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h index 7977d46229e..e9d541ab150 100644 --- a/src/amd/vulkan/radv_radeon_winsys.h +++ b/src/amd/vulkan/radv_radeon_winsys.h @@ -223,6 +223,8 @@ struct radeon_winsys { void (*buffer_set_metadata)(struct radeon_winsys_bo *bo, struct radeon_bo_metadata *md); + void (*buffer_get_metadata)(struct radeon_winsys_bo *bo, + struct radeon_bo_metadata *md); void (*buffer_virtual_bind)(struct radeon_winsys_bo *parent, uint64_t offset, uint64_t size, diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c index 25764d93f6a..ec126bfc7cb 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c @@ -304,8 +304,12 @@ radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws, return NULL; } + unsigned virt_alignment = alignment; + if (size >= ws->info.pte_fragment_size) + virt_alignment = MAX2(virt_alignment, ws->info.pte_fragment_size); + r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, - size, alignment, 0, &va, &va_handle, + size, virt_alignment, 0, &va, &va_handle, (flags & RADEON_FLAG_32BIT ? 
AMDGPU_VA_RANGE_32_BIT : 0) | AMDGPU_VA_RANGE_HIGH); if (r) @@ -536,6 +540,21 @@ radv_amdgpu_winsys_get_fd(struct radeon_winsys *_ws, return true; } +static unsigned eg_tile_split(unsigned tile_split) +{ + switch (tile_split) { + case 0: tile_split = 64; break; + case 1: tile_split = 128; break; + case 2: tile_split = 256; break; + case 3: tile_split = 512; break; + default: + case 4: tile_split = 1024; break; + case 5: tile_split = 2048; break; + case 6: tile_split = 4096; break; + } + return tile_split; +} + static unsigned radv_eg_tile_split_rev(unsigned eg_tile_split) { switch (eg_tile_split) { @@ -589,6 +608,43 @@ radv_amdgpu_winsys_bo_set_metadata(struct radeon_winsys_bo *_bo, amdgpu_bo_set_metadata(bo->bo, &metadata); } +static void +radv_amdgpu_winsys_bo_get_metadata(struct radeon_winsys_bo *_bo, + struct radeon_bo_metadata *md) +{ + struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); + struct amdgpu_bo_info info = {0}; + + int r = amdgpu_bo_query_info(bo->bo, &info); + if (r) + return; + + uint64_t tiling_flags = info.metadata.tiling_info; + + if (bo->ws->info.chip_class >= GFX9) { + md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE); + } else { + md->u.legacy.microtile = RADEON_LAYOUT_LINEAR; + md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR; + + if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ + md->u.legacy.macrotile = RADEON_LAYOUT_TILED; + else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */ + md->u.legacy.microtile = RADEON_LAYOUT_TILED; + + md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG); + md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); + md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); + md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); + md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); + md->u.legacy.num_banks = 2 << 
AMDGPU_TILING_GET(tiling_flags, NUM_BANKS); + md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ + } + + md->size_metadata = info.metadata.size_metadata; + memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata)); +} + void radv_amdgpu_bo_init_functions(struct radv_amdgpu_winsys *ws) { ws->base.buffer_create = radv_amdgpu_winsys_bo_create; @@ -599,5 +655,6 @@ void radv_amdgpu_bo_init_functions(struct radv_amdgpu_winsys *ws) ws->base.buffer_from_fd = radv_amdgpu_winsys_bo_from_fd; ws->base.buffer_get_fd = radv_amdgpu_winsys_get_fd; ws->base.buffer_set_metadata = radv_amdgpu_winsys_bo_set_metadata; + ws->base.buffer_get_metadata = radv_amdgpu_winsys_bo_get_metadata; ws->base.buffer_virtual_bind = radv_amdgpu_winsys_bo_virtual_bind; } diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c index 4f3b621fd29..54483195952 100644 --- a/src/broadcom/compiler/qpu_schedule.c +++ b/src/broadcom/compiler/qpu_schedule.c @@ -392,6 +392,7 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) for (int i = 0; i < ARRAY_SIZE(state->last_r); i++) add_write_dep(state, &state->last_r[i], n); add_write_dep(state, &state->last_sf, n); + add_write_dep(state, &state->last_rtop, n); /* Scoreboard-locking operations have to stay after the last * thread switch. 
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c index b5a7b841ef6..4baadce294c 100644 --- a/src/broadcom/compiler/vir_to_qpu.c +++ b/src/broadcom/compiler/vir_to_qpu.c @@ -364,6 +364,7 @@ v3d_dump_qpu(struct v3d_compile *c) for (int i = 0; i < c->qpu_inst_count; i++) { const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]); fprintf(stderr, "0x%016"PRIx64" %s\n", c->qpu_insts[i], str); + ralloc_free((void *)str); } fprintf(stderr, "\n"); } diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c index 0846cc86174..147017a6594 100644 --- a/src/broadcom/qpu/qpu_instr.c +++ b/src/broadcom/qpu/qpu_instr.c @@ -551,6 +551,7 @@ bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) { return (waddr == V3D_QPU_WADDR_SYNC || + waddr == V3D_QPU_WADDR_SYNCB || waddr == V3D_QPU_WADDR_SYNCU); } diff --git a/src/compiler/Android.glsl.mk b/src/compiler/Android.glsl.mk index 0aabafa2673..37b3cb80251 100644 --- a/src/compiler/Android.glsl.mk +++ b/src/compiler/Android.glsl.mk @@ -48,7 +48,7 @@ LOCAL_STATIC_LIBRARIES := \ libmesa_nir LOCAL_MODULE := libmesa_glsl - +LOCAL_CFLAGS += -Wno-error include $(LOCAL_PATH)/Android.glsl.gen.mk include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/src/compiler/Android.nir.mk b/src/compiler/Android.nir.mk index 75a247a245d..59da5dbdc1c 100644 --- a/src/compiler/Android.nir.mk +++ b/src/compiler/Android.nir.mk @@ -41,6 +41,9 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary +LOCAL_CFLAGS := \ + -Wno-missing-braces + LOCAL_STATIC_LIBRARIES := libmesa_compiler LOCAL_MODULE := libmesa_nir diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp index 084b7021a9f..f4bd8c17db3 100644 --- a/src/compiler/glsl/ast_to_hir.cpp +++ b/src/compiler/glsl/ast_to_hir.cpp @@ -892,7 +892,8 @@ validate_assignment(struct _mesa_glsl_parse_state *state, } if (unsized_array) { if (is_initializer) { - return rhs; + if 
(rhs->type->get_scalar_type() == lhs->type->get_scalar_type()) + return rhs; } else { _mesa_glsl_error(&loc, state, "implicitly sized arrays cannot be assigned"); @@ -7422,7 +7423,7 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, if (member_align == 0 || member_align & (member_align - 1)) { _mesa_glsl_error(&loc, state, "align layout qualifier " - "in not a power of 2"); + "is not a power of 2"); } else { fields[i].offset = glsl_align(offset, member_align); next_offset = glsl_align(fields[i].offset + size, align); diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp index 5650365d1d5..b6018806865 100644 --- a/src/compiler/glsl/builtin_functions.cpp +++ b/src/compiler/glsl/builtin_functions.cpp @@ -525,12 +525,6 @@ supports_nv_fragment_shader_interlock(const _mesa_glsl_parse_state *state) return state->NV_fragment_shader_interlock_enable; } -static bool -supports_intel_fragment_shader_ordering(const _mesa_glsl_parse_state *state) -{ - return state->INTEL_fragment_shader_ordering_enable; -} - static bool shader_clock(const _mesa_glsl_parse_state *state) { @@ -1311,11 +1305,6 @@ builtin_builder::create_intrinsics() supports_arb_fragment_shader_interlock, ir_intrinsic_end_invocation_interlock), NULL); - add_function("__intrinsic_begin_fragment_shader_ordering", - _invocation_interlock_intrinsic( - supports_intel_fragment_shader_ordering, - ir_intrinsic_begin_fragment_shader_ordering), NULL); - add_function("__intrinsic_shader_clock", _shader_clock_intrinsic(shader_clock, glsl_type::uvec2_type), @@ -3430,12 +3419,6 @@ builtin_builder::create_builtins() supports_nv_fragment_shader_interlock), NULL); - add_function("beginFragmentShaderOrderingINTEL", - _invocation_interlock( - "__intrinsic_begin_fragment_shader_ordering", - supports_intel_fragment_shader_ordering), - NULL); - add_function("anyInvocationARB", _vote("__intrinsic_vote_any", vote), NULL); diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y 
b/src/compiler/glsl/glcpp/glcpp-parse.y index 1c095cb66f9..c951d9526ac 100644 --- a/src/compiler/glsl/glcpp/glcpp-parse.y +++ b/src/compiler/glsl/glcpp/glcpp-parse.y @@ -224,10 +224,12 @@ expanded_line: glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $2.undefined_macro); _glcpp_parser_skip_stack_change_if (parser, & @1, "elif", $2.value); } -| LINE_EXPANDED integer_constant NEWLINE { +| LINE_EXPANDED expression NEWLINE { + if (parser->is_gles && $2.undefined_macro) + glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $2.undefined_macro); parser->has_new_line_number = 1; - parser->new_line_number = $2; - _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX "\n", $2); + parser->new_line_number = $2.value; + _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX "\n", $2.value); } | LINE_EXPANDED integer_constant integer_constant NEWLINE { parser->has_new_line_number = 1; @@ -238,6 +240,17 @@ expanded_line: "#line %" PRIiMAX " %" PRIiMAX "\n", $2, $3); } +| LINE_EXPANDED '(' expression ')' '(' expression ')' NEWLINE { + if (parser->is_gles && $3.undefined_macro) + glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $3.undefined_macro); + if (parser->is_gles && $6.undefined_macro) + glcpp_error(& @1, parser, "undefined macro %s in expression (illegal in GLES)", $6.undefined_macro); + parser->has_new_line_number = 1; + parser->new_line_number = $3.value; + parser->has_new_source_number = 1; + parser->new_source_number = $6.value; + _mesa_string_buffer_printf(parser->output, "#line %" PRIiMAX " %" PRIiMAX "\n", $3.value, $6.value); + } ; define: diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp index 1bdd7c4bf17..efd1a013dbd 100644 --- a/src/compiler/glsl/glsl_parser_extras.cpp +++ b/src/compiler/glsl/glsl_parser_extras.cpp @@ -727,7 +727,6 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = { 
EXT_AEP(EXT_texture_buffer), EXT_AEP(EXT_texture_cube_map_array), EXT(INTEL_conservative_rasterization), - EXT(INTEL_fragment_shader_ordering), EXT(INTEL_shader_atomic_float_minmax), EXT(MESA_shader_integer_functions), EXT(NV_fragment_shader_interlock), diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h index 966d848509c..69aa6cf9cf3 100644 --- a/src/compiler/glsl/glsl_parser_extras.h +++ b/src/compiler/glsl/glsl_parser_extras.h @@ -812,8 +812,6 @@ struct _mesa_glsl_parse_state { bool EXT_texture_cube_map_array_warn; bool INTEL_conservative_rasterization_enable; bool INTEL_conservative_rasterization_warn; - bool INTEL_fragment_shader_ordering_enable; - bool INTEL_fragment_shader_ordering_warn; bool INTEL_shader_atomic_float_minmax_enable; bool INTEL_shader_atomic_float_minmax_warn; bool MESA_shader_integer_functions_enable; diff --git a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp index 0479f8fcfe4..0956d2f6303 100644 --- a/src/compiler/glsl/glsl_to_nir.cpp +++ b/src/compiler/glsl/glsl_to_nir.cpp @@ -742,9 +742,6 @@ nir_visitor::visit(ir_call *ir) case ir_intrinsic_end_invocation_interlock: op = nir_intrinsic_end_invocation_interlock; break; - case ir_intrinsic_begin_fragment_shader_ordering: - op = nir_intrinsic_begin_fragment_shader_ordering; - break; case ir_intrinsic_group_memory_barrier: op = nir_intrinsic_group_memory_barrier; break; @@ -983,9 +980,6 @@ nir_visitor::visit(ir_call *ir) case nir_intrinsic_end_invocation_interlock: nir_builder_instr_insert(&b, &instr->instr); break; - case nir_intrinsic_begin_fragment_shader_ordering: - nir_builder_instr_insert(&b, &instr->instr); - break; case nir_intrinsic_store_ssbo: { exec_node *param = ir->actual_parameters.get_head(); ir_rvalue *block = ((ir_instruction *)param)->as_rvalue(); diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h index f478b29a6b5..d05d1998a50 100644 --- a/src/compiler/glsl/ir.h +++ b/src/compiler/glsl/ir.h @@ -1122,7 
+1122,6 @@ enum ir_intrinsic_id { ir_intrinsic_memory_barrier_shared, ir_intrinsic_begin_invocation_interlock, ir_intrinsic_end_invocation_interlock, - ir_intrinsic_begin_fragment_shader_ordering, ir_intrinsic_vote_all, ir_intrinsic_vote_any, diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp index 52e493cb599..3969c0120b3 100644 --- a/src/compiler/glsl/link_varyings.cpp +++ b/src/compiler/glsl/link_varyings.cpp @@ -481,9 +481,10 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], /* Component aliasing is not alloed */ if (comp >= component && comp < last_comp) { linker_error(prog, - "%s shader has multiple outputs explicitly " + "%s shader has multiple %sputs explicitly " "assigned to location %d and component %d\n", _mesa_shader_stage_to_string(stage), + var->data.mode == ir_var_shader_in ? "in" : "out", location, comp); return false; } else { @@ -502,10 +503,12 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], if (info->interpolation != interpolation) { linker_error(prog, - "%s shader has multiple outputs at explicit " + "%s shader has multiple %sputs at explicit " "location %u with different interpolation " "settings\n", - _mesa_shader_stage_to_string(stage), location); + _mesa_shader_stage_to_string(stage), + var->data.mode == ir_var_shader_in ? + "in" : "out", location); return false; } @@ -513,9 +516,11 @@ check_location_aliasing(struct explicit_location_info explicit_locations[][4], info->sample != sample || info->patch != patch) { linker_error(prog, - "%s shader has multiple outputs at explicit " + "%s shader has multiple %sputs at explicit " "location %u with different aux storage\n", - _mesa_shader_stage_to_string(stage), location); + _mesa_shader_stage_to_string(stage), + var->data.mode == ir_var_shader_in ? 
+ "in" : "out", location); return false; } } diff --git a/src/compiler/glsl/serialize.cpp b/src/compiler/glsl/serialize.cpp index 267700e7e78..26d8ec4b75b 100644 --- a/src/compiler/glsl/serialize.cpp +++ b/src/compiler/glsl/serialize.cpp @@ -360,13 +360,20 @@ read_xfb(struct blob_reader *metadata, struct gl_shader_program *shProg) if (xfb_stage == ~0u) return; + if (shProg->TransformFeedback.VaryingNames) { + for (unsigned i = 0; i < shProg->TransformFeedback.NumVarying; ++i) + free(shProg->TransformFeedback.VaryingNames[i]); + } + /* Data set by glTransformFeedbackVaryings. */ shProg->TransformFeedback.BufferMode = blob_read_uint32(metadata); blob_copy_bytes(metadata, &shProg->TransformFeedback.BufferStride, sizeof(shProg->TransformFeedback.BufferStride)); shProg->TransformFeedback.NumVarying = blob_read_uint32(metadata); + shProg->TransformFeedback.VaryingNames = (char **) - malloc(shProg->TransformFeedback.NumVarying * sizeof(GLchar *)); + realloc(shProg->TransformFeedback.VaryingNames, + shProg->TransformFeedback.NumVarying * sizeof(GLchar *)); /* Note, malloc used with VaryingNames. */ for (unsigned i = 0; i < shProg->TransformFeedback.NumVarying; i++) shProg->TransformFeedback.VaryingNames[i] = diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index ec3049ca06d..910f9c336f8 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -199,7 +199,6 @@ def barrier(name): barrier("memory_barrier_shared") barrier("begin_invocation_interlock") barrier("end_invocation_interlock") -barrier("begin_fragment_shader_ordering") # A conditional discard, with a single boolean source. 
intrinsic("discard_if", src_comp=[1]) diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c index de6f2481def..3845ed66b49 100644 --- a/src/compiler/nir/nir_linking_helpers.c +++ b/src/compiler/nir/nir_linking_helpers.c @@ -195,9 +195,12 @@ nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer) } static uint8_t -get_interp_type(nir_variable *var, bool default_to_smooth_interp) +get_interp_type(nir_variable *var, const struct glsl_type *type, + bool default_to_smooth_interp) { - if (var->data.interpolation != INTERP_MODE_NONE) + if (glsl_type_is_integer(type)) + return INTERP_MODE_FLAT; + else if (var->data.interpolation != INTERP_MODE_NONE) return var->data.interpolation; else if (default_to_smooth_interp) return INTERP_MODE_SMOOTH; @@ -252,7 +255,7 @@ get_slot_component_masks_and_interp_types(struct exec_list *var_list, unsigned comps_slot2 = 0; for (unsigned i = 0; i < slots; i++) { interp_type[location + i] = - get_interp_type(var, default_to_smooth_interp); + get_interp_type(var, type, default_to_smooth_interp); interp_loc[location + i] = get_interp_loc(var); if (dual_slot) { @@ -424,7 +427,7 @@ compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps, continue; bool found_new_offset = false; - uint8_t interp = get_interp_type(var, default_to_smooth_interp); + uint8_t interp = get_interp_type(var, type, default_to_smooth_interp); for (; cursor[interp] < 32; cursor[interp]++) { uint8_t cursor_used_comps = comps[cursor[interp]]; diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c index 0be3aba9456..7ef032cd164 100644 --- a/src/compiler/nir/nir_lower_alu_to_scalar.c +++ b/src/compiler/nir/nir_lower_alu_to_scalar.c @@ -194,6 +194,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) } case nir_op_unpack_64_2x32: + case nir_op_unpack_32_2x16: return false; LOWER_REDUCTION(nir_op_fdot, nir_op_fmul, nir_op_fadd); diff --git 
a/src/compiler/nir/nir_lower_indirect_derefs.c b/src/compiler/nir/nir_lower_indirect_derefs.c index 897a0620872..40b90e6a313 100644 --- a/src/compiler/nir/nir_lower_indirect_derefs.c +++ b/src/compiler/nir/nir_lower_indirect_derefs.c @@ -205,9 +205,6 @@ nir_lower_indirect_derefs(nir_shader *shader, nir_variable_mode modes) { bool progress = false; - if (modes == 0) - return false; - nir_foreach_function(function, shader) { if (function->impl) progress = lower_indirects_impl(function->impl, modes) || progress; diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c index 5929a60aee8..be91a2a8fd6 100644 --- a/src/compiler/nir/nir_opt_constant_folding.c +++ b/src/compiler/nir/nir_opt_constant_folding.c @@ -64,9 +64,8 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx) return false; if (bit_size == 0 && - !nir_alu_type_get_type_size(nir_op_infos[instr->op].input_sizes[i])) { + !nir_alu_type_get_type_size(nir_op_infos[instr->op].input_types[i])) bit_size = instr->src[i].src.ssa->bit_size; - } nir_instr *src_instr = instr->src[i].src.ssa->parent_instr; diff --git a/src/compiler/nir/nir_opt_copy_prop_vars.c b/src/compiler/nir/nir_opt_copy_prop_vars.c index 7a21ad56c79..594c4ddd0c2 100644 --- a/src/compiler/nir/nir_opt_copy_prop_vars.c +++ b/src/compiler/nir/nir_opt_copy_prop_vars.c @@ -265,7 +265,7 @@ lookup_entry_and_kill_aliases(struct util_dynarray *copies, { /* TODO: Take into account the write_mask. 
*/ - struct copy_entry *entry = NULL; + nir_deref_instr *dst_match = NULL; util_dynarray_foreach_reverse(copies, struct copy_entry, iter) { if (!iter->src.is_ssa) { /* If this write aliases the source of some entry, get rid of it */ @@ -278,13 +278,26 @@ lookup_entry_and_kill_aliases(struct util_dynarray *copies, nir_deref_compare_result comp = nir_compare_derefs(iter->dst, deref); if (comp & nir_derefs_equal_bit) { - assert(entry == NULL); - entry = iter; + /* Removing entries invalidate previous iter pointers, so we'll + * collect the matching entry later. Just make sure it is unique. + */ + assert(!dst_match); + dst_match = iter->dst; } else if (comp & nir_derefs_may_alias_bit) { copy_entry_remove(copies, iter); } } + struct copy_entry *entry = NULL; + if (dst_match) { + util_dynarray_foreach(copies, struct copy_entry, iter) { + if (iter->dst == dst_match) { + entry = iter; + break; + } + } + assert(entry); + } return entry; } @@ -337,6 +350,9 @@ store_to_entry(struct copy_prop_var_state *state, struct copy_entry *entry, const struct value *value, unsigned write_mask) { if (value->is_ssa) { + /* Clear src if it was being used as non-SSA. */ + if (!entry->src.is_ssa) + memset(entry->src.ssa, 0, sizeof(entry->src.ssa)); entry->src.is_ssa = true; /* Only overwrite the written components */ for (unsigned i = 0; i < 4; i++) { @@ -705,9 +721,9 @@ copy_prop_vars_block(struct copy_prop_var_state *state, lookup_entry_for_deref(copies, src, nir_derefs_a_contains_b_bit); struct value value; if (try_load_from_entry(state, src_entry, b, intrin, src, &value)) { + /* If load works, intrin (the copy_deref) is removed. */ if (value.is_ssa) { nir_store_deref(b, dst, value.ssa[0], 0xf); - intrin = nir_instr_as_intrinsic(nir_builder_last_instr(b)); } else { /* If this would be a no-op self-copy, don't bother. 
*/ if (nir_compare_derefs(value.deref, dst) & nir_derefs_equal_bit) diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c index 1fe95e53766..8a971c43f24 100644 --- a/src/compiler/nir/nir_opt_if.c +++ b/src/compiler/nir/nir_opt_if.c @@ -391,6 +391,34 @@ evaluate_if_condition(nir_if *nif, nir_cursor cursor, bool *value) } } +static nir_ssa_def * +clone_alu_and_replace_src_defs(nir_builder *b, const nir_alu_instr *alu, + nir_ssa_def **src_defs) +{ + nir_alu_instr *nalu = nir_alu_instr_create(b->shader, alu->op); + nalu->exact = alu->exact; + + nir_ssa_dest_init(&nalu->instr, &nalu->dest.dest, + alu->dest.dest.ssa.num_components, + alu->dest.dest.ssa.bit_size, alu->dest.dest.ssa.name); + + nalu->dest.saturate = alu->dest.saturate; + nalu->dest.write_mask = alu->dest.write_mask; + + for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { + assert(alu->src[i].src.is_ssa); + nalu->src[i].src = nir_src_for_ssa(src_defs[i]); + nalu->src[i].negate = alu->src[i].negate; + nalu->src[i].abs = alu->src[i].abs; + memcpy(nalu->src[i].swizzle, alu->src[i].swizzle, + sizeof(nalu->src[i].swizzle)); + } + + nir_builder_instr_insert(b, &nalu->instr); + + return &nalu->dest.dest.ssa;; +} + /* * This propagates if condition evaluation down the chain of some alu * instructions. 
For example by checking the use of some of the following alu @@ -448,7 +476,7 @@ propagate_condition_eval(nir_builder *b, nir_if *nif, nir_src *use_src, if (!evaluate_if_condition(nif, b->cursor, &bool_value)) return false; - nir_ssa_def *def[2] = {0}; + nir_ssa_def *def[4] = {0}; for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { if (alu->src[i].src.ssa == use_src->ssa) { def[i] = nir_imm_bool(b, bool_value); @@ -456,7 +484,8 @@ propagate_condition_eval(nir_builder *b, nir_if *nif, nir_src *use_src, def[i] = alu->src[i].src.ssa; } } - nir_ssa_def *nalu = nir_build_alu(b, alu->op, def[0], def[1], NULL, NULL); + + nir_ssa_def *nalu = clone_alu_and_replace_src_defs(b, alu, def); /* Rewrite use to use new alu instruction */ nir_src new_src = nir_src_for_ssa(nalu); @@ -472,14 +501,21 @@ propagate_condition_eval(nir_builder *b, nir_if *nif, nir_src *use_src, static bool can_propagate_through_alu(nir_src *src) { - if (src->parent_instr->type == nir_instr_type_alu && - (nir_instr_as_alu(src->parent_instr)->op == nir_op_ior || - nir_instr_as_alu(src->parent_instr)->op == nir_op_iand || - nir_instr_as_alu(src->parent_instr)->op == nir_op_inot || - nir_instr_as_alu(src->parent_instr)->op == nir_op_b2i)) - return true; + if (src->parent_instr->type != nir_instr_type_alu) + return false; - return false; + nir_alu_instr *alu = nir_instr_as_alu(src->parent_instr); + switch (alu->op) { + case nir_op_ior: + case nir_op_iand: + case nir_op_inot: + case nir_op_b2i: + return true; + case nir_op_bcsel: + return src == &alu->src[0].src; + default: + return false; + } } static bool diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp index d24f0941519..3cd61f66056 100644 --- a/src/compiler/nir_types.cpp +++ b/src/compiler/nir_types.cpp @@ -301,6 +301,11 @@ glsl_type_is_boolean(const struct glsl_type *type) { return type->is_boolean(); } +bool +glsl_type_is_integer(const struct glsl_type *type) +{ + return type->is_integer(); +} const glsl_type * 
glsl_void_type(void) diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h index 77454fa9fab..70d593b96ab 100644 --- a/src/compiler/nir_types.h +++ b/src/compiler/nir_types.h @@ -142,6 +142,7 @@ bool glsl_type_is_image(const struct glsl_type *type); bool glsl_type_is_dual_slot(const struct glsl_type *type); bool glsl_type_is_numeric(const struct glsl_type *type); bool glsl_type_is_boolean(const struct glsl_type *type); +bool glsl_type_is_integer(const struct glsl_type *type); bool glsl_sampler_type_is_shadow(const struct glsl_type *type); bool glsl_sampler_type_is_array(const struct glsl_type *type); bool glsl_contains_atomic(const struct glsl_type *type); diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c index 96ff09c3659..16d9c92046e 100644 --- a/src/compiler/spirv/spirv_to_nir.c +++ b/src/compiler/spirv/spirv_to_nir.c @@ -1811,6 +1811,26 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode, src[j] = src_val->constant->values[0]; } + /* fix up fixed size sources */ + switch (op) { + case nir_op_ishl: + case nir_op_ishr: + case nir_op_ushr: { + if (bit_size == 32) + break; + for (unsigned i = 0; i < num_components; ++i) { + switch (bit_size) { + case 64: src[1].u32[i] = src[1].u64[i]; break; + case 16: src[1].u32[i] = src[1].u16[i]; break; + case 8: src[1].u32[i] = src[1].u8[i]; break; + } + } + break; + } + default: + break; + } + val->constant->values[0] = nir_eval_const_opcode(op, num_components, bit_size, src); break; @@ -2874,13 +2894,19 @@ vtn_vector_insert(struct vtn_builder *b, nir_ssa_def *src, nir_ssa_def *insert, return &vec->dest.dest.ssa; } +static nir_ssa_def * +nir_ieq_imm(nir_builder *b, nir_ssa_def *x, uint64_t i) +{ + return nir_ieq(b, x, nir_imm_intN_t(b, i, x->bit_size)); +} + nir_ssa_def * vtn_vector_extract_dynamic(struct vtn_builder *b, nir_ssa_def *src, nir_ssa_def *index) { nir_ssa_def *dest = vtn_vector_extract(b, src, 0); for (unsigned i = 1; i < src->num_components; i++) - dest = 
nir_bcsel(&b->nb, nir_ieq(&b->nb, index, nir_imm_int(&b->nb, i)), + dest = nir_bcsel(&b->nb, nir_ieq_imm(&b->nb, index, i), vtn_vector_extract(b, src, i), dest); return dest; @@ -2892,7 +2918,7 @@ vtn_vector_insert_dynamic(struct vtn_builder *b, nir_ssa_def *src, { nir_ssa_def *dest = vtn_vector_insert(b, src, insert, 0); for (unsigned i = 1; i < src->num_components; i++) - dest = nir_bcsel(&b->nb, nir_ieq(&b->nb, index, nir_imm_int(&b->nb, i)), + dest = nir_bcsel(&b->nb, nir_ieq_imm(&b->nb, index, i), vtn_vector_insert(b, src, insert, i), dest); return dest; diff --git a/src/compiler/spirv/vtn_alu.c b/src/compiler/spirv/vtn_alu.c index 6860e7dc090..a23f8c29b5c 100644 --- a/src/compiler/spirv/vtn_alu.c +++ b/src/compiler/spirv/vtn_alu.c @@ -696,6 +696,17 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode, src[1] = tmp; } + switch (op) { + case nir_op_ishl: + case nir_op_ishr: + case nir_op_ushr: + if (src[1]->bit_size != 32) + src[1] = nir_u2u32(&b->nb, src[1]); + break; + default: + break; + } + val->ssa->def = nir_build_alu(&b->nb, op, src[0], src[1], src[2], src[3]); break; } /* default */ diff --git a/src/compiler/spirv/vtn_cfg.c b/src/compiler/spirv/vtn_cfg.c index 726f717e8d5..6406f4911df 100644 --- a/src/compiler/spirv/vtn_cfg.c +++ b/src/compiler/spirv/vtn_cfg.c @@ -47,6 +47,7 @@ vtn_type_count_function_params(struct vtn_type *type) { switch (type->base_type) { case vtn_base_type_array: + case vtn_base_type_matrix: return type->length * vtn_type_count_function_params(type->array_element); case vtn_base_type_struct: { @@ -76,6 +77,7 @@ vtn_type_add_to_function_params(struct vtn_type *type, switch (type->base_type) { case vtn_base_type_array: + case vtn_base_type_matrix: for (unsigned i = 0; i < type->length; i++) vtn_type_add_to_function_params(type->array_element, func, param_idx); break; @@ -123,6 +125,7 @@ vtn_ssa_value_add_to_call_params(struct vtn_builder *b, { switch (type->base_type) { case vtn_base_type_array: + case vtn_base_type_matrix: for 
(unsigned i = 0; i < type->length; i++) { vtn_ssa_value_add_to_call_params(b, value->elems[i], type->array_element, @@ -152,6 +155,7 @@ vtn_ssa_value_load_function_param(struct vtn_builder *b, { switch (type->base_type) { case vtn_base_type_array: + case vtn_base_type_matrix: for (unsigned i = 0; i < type->length; i++) { vtn_ssa_value_load_function_param(b, value->elems[i], type->array_element, param_idx); diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c index 06a49e48e3f..0d8100384d6 100644 --- a/src/compiler/spirv/vtn_glsl450.c +++ b/src/compiler/spirv/vtn_glsl450.c @@ -807,10 +807,9 @@ handle_glsl450_interpolation(struct vtn_builder *b, enum GLSLstd450 opcode, if (vec_array_deref) { assert(vec_deref); - nir_const_value *const_index = nir_src_as_const_value(vec_deref->arr.index); - if (const_index) { + if (nir_src_is_const(vec_deref->arr.index)) { val->ssa->def = vtn_vector_extract(b, &intrin->dest.ssa, - const_index->u32[0]); + nir_src_as_uint(vec_deref->arr.index)); } else { val->ssa->def = vtn_vector_extract_dynamic(b, &intrin->dest.ssa, vec_deref->arr.index.ssa); diff --git a/src/compiler/spirv/vtn_variables.c b/src/compiler/spirv/vtn_variables.c index c5cf345d02a..0eb9f263436 100644 --- a/src/compiler/spirv/vtn_variables.c +++ b/src/compiler/spirv/vtn_variables.c @@ -132,12 +132,12 @@ vtn_access_link_as_ssa(struct vtn_builder *b, struct vtn_access_link link, } else if (stride == 1) { nir_ssa_def *ssa = vtn_ssa_value(b, link.id)->def; if (ssa->bit_size != 32) - ssa = nir_u2u32(&b->nb, ssa); + ssa = nir_i2i32(&b->nb, ssa); return ssa; } else { nir_ssa_def *src0 = vtn_ssa_value(b, link.id)->def; if (src0->bit_size != 32) - src0 = nir_u2u32(&b->nb, src0); + src0 = nir_i2i32(&b->nb, src0); return nir_imul(&b->nb, src0, nir_imm_int(&b->nb, stride)); } } @@ -512,9 +512,9 @@ vtn_local_load(struct vtn_builder *b, nir_deref_instr *src) if (src_tail != src) { val->type = src->type; - nir_const_value *const_index = 
nir_src_as_const_value(src->arr.index); - if (const_index) - val->def = vtn_vector_extract(b, val->def, const_index->u32[0]); + if (nir_src_is_const(src->arr.index)) + val->def = vtn_vector_extract(b, val->def, + nir_src_as_uint(src->arr.index)); else val->def = vtn_vector_extract_dynamic(b, val->def, src->arr.index.ssa); } @@ -532,10 +532,9 @@ vtn_local_store(struct vtn_builder *b, struct vtn_ssa_value *src, struct vtn_ssa_value *val = vtn_create_ssa_value(b, dest_tail->type); _vtn_local_load_store(b, true, dest_tail, val); - nir_const_value *const_index = nir_src_as_const_value(dest->arr.index); - if (const_index) + if (nir_src_is_const(dest->arr.index)) val->def = vtn_vector_insert(b, val->def, src->def, - const_index->u32[0]); + nir_src_as_uint(dest->arr.index)); else val->def = vtn_vector_insert_dynamic(b, val->def, src->def, dest->arr.index.ssa); diff --git a/src/egl/Android.mk b/src/egl/Android.mk index 42b391e6d86..bbc7df2aff8 100644 --- a/src/egl/Android.mk +++ b/src/egl/Android.mk @@ -45,7 +45,10 @@ LOCAL_CFLAGS := \ LOCAL_C_INCLUDES := \ $(MESA_TOP)/include/drm-uapi \ $(MESA_TOP)/src/egl/main \ - $(MESA_TOP)/src/egl/drivers/dri2 + $(MESA_TOP)/src/egl/drivers/dri2 \ + frameworks/native/libs/nativebase/include \ + frameworks/native/libs/nativewindow/include \ + frameworks/native/libs/arect/include LOCAL_STATIC_LIBRARIES := \ libmesa_util \ @@ -64,6 +67,10 @@ ifeq ($(BOARD_USES_DRM_GRALLOC),true) LOCAL_SHARED_LIBRARIES += libgralloc_drm endif +ifeq ($(strip $(BOARD_USES_GRALLOC1)),true) +LOCAL_CFLAGS += -DHAVE_GRALLOC1 +endif + ifeq ($(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7),) LOCAL_SHARED_LIBRARIES += libnativewindow endif @@ -79,8 +86,12 @@ ifneq ($(MESA_BUILD_GALLIUM),) LOCAL_REQUIRED_MODULES += gallium_dri endif +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_HEADER_LIBRARIES += libnativebase_headers +endif + LOCAL_MODULE := libGLES_mesa LOCAL_MODULE_RELATIVE_PATH := egl - +LOCAL_CFLAGS += -Wno-error include 
$(MESA_COMMON_MK) include $(BUILD_SHARED_LIBRARY) diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c index 87e1a704c6e..81d4ea456b3 100644 --- a/src/egl/drivers/dri2/egl_dri2.c +++ b/src/egl/drivers/dri2/egl_dri2.c @@ -65,6 +65,38 @@ #include "util/u_vector.h" #include "mapi/glapi/glapi.h" +/* The kernel header drm_fourcc.h defines the DRM formats below. We duplicate + * some of the definitions here so that building Mesa won't bleeding-edge + * kernel headers. + */ +#ifndef DRM_FORMAT_R8 +#define DRM_FORMAT_R8 fourcc_code('R', '8', ' ', ' ') /* [7:0] R */ +#endif + +#ifndef DRM_FORMAT_RG88 +#define DRM_FORMAT_RG88 fourcc_code('R', 'G', '8', '8') /* [15:0] R:G 8:8 little endian */ +#endif + +#ifndef DRM_FORMAT_GR88 +#define DRM_FORMAT_GR88 fourcc_code('G', 'R', '8', '8') /* [15:0] G:R 8:8 little endian */ +#endif + +#ifndef DRM_FORMAT_R16 +#define DRM_FORMAT_R16 fourcc_code('R', '1', '6', ' ') /* [15:0] R 16 little endian */ +#endif + +#ifndef DRM_FORMAT_GR1616 +#define DRM_FORMAT_GR1616 fourcc_code('G', 'R', '3', '2') /* [31:0] R:G 16:16 little endian */ +#endif + +#ifndef DRM_FORMAT_P010 +#define DRM_FORMAT_P010 fourcc_code('P', '0', '1', '0') /* 2x2 subsampled Cb:Cr plane 10 bits per channel */ +#endif + +#ifndef DRM_FORMAT_MOD_INVALID +#define DRM_FORMAT_MOD_INVALID ((1ULL<<56) - 1) +#endif + #define NUM_ATTRIBS 12 static void @@ -673,7 +705,7 @@ dri2_setup_screen(_EGLDisplay *disp) dri2_renderer_query_integer(dri2_dpy, __DRI2_RENDERER_HAS_CONTEXT_PRIORITY); - disp->Extensions.EXT_pixel_format_float = EGL_TRUE; + disp->Extensions.EXT_pixel_format_float = EGL_FALSE; if (dri2_renderer_query_integer(dri2_dpy, __DRI2_RENDERER_HAS_FRAMEBUFFER_SRGB)) @@ -2284,6 +2316,7 @@ dri2_num_fourcc_format_planes(EGLint format) case DRM_FORMAT_NV21: case DRM_FORMAT_NV16: case DRM_FORMAT_NV61: + case DRM_FORMAT_P010: return 2; case DRM_FORMAT_YUV410: @@ -2309,7 +2342,7 @@ dri2_check_dma_buf_format(const _EGLImageAttribs *attrs) { unsigned plane_n = 
dri2_num_fourcc_format_planes(attrs->DMABufFourCC.Value); if (plane_n == 0) { - _eglError(EGL_BAD_ATTRIBUTE, "invalid format"); + _eglError(EGL_BAD_MATCH, "unknown drm fourcc format"); return 0; } diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h index 4abe1ba1952..3e5a567472c 100644 --- a/src/egl/drivers/dri2/egl_dri2.h +++ b/src/egl/drivers/dri2/egl_dri2.h @@ -69,6 +69,10 @@ struct zwp_linux_dmabuf_v1; #include #endif /* HAVE_ANDROID_PLATFORM */ +#ifdef HAVE_GRALLOC1 +#include +#endif + #include "eglconfig.h" #include "eglcontext.h" #include "egldevice.h" @@ -237,7 +241,14 @@ struct dri2_egl_display #endif #ifdef HAVE_ANDROID_PLATFORM - const gralloc_module_t *gralloc; + const hw_module_t *gralloc; + uint16_t gralloc_version; +#ifdef HAVE_GRALLOC1 + gralloc1_device_t *gralloc1_dvc; + GRALLOC1_PFN_LOCK_FLEX pfn_lockflex; + GRALLOC1_PFN_GET_FORMAT pfn_getFormat; + GRALLOC1_PFN_UNLOCK pfn_unlock; +#endif #endif bool is_render_node; diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c index 1e93ab4d4d2..0c79fe9b5e0 100644 --- a/src/egl/drivers/dri2/platform_android.c +++ b/src/egl/drivers/dri2/platform_android.c @@ -49,6 +49,8 @@ #define ALIGN(val, align) (((val) + (align) - 1) & ~((align) - 1)) +#define GRALLOC_DRM_GET_FORMAT 1 + struct droid_yuv_format { /* Lookup keys */ int native; /* HAL_PIXEL_FORMAT_ */ @@ -59,14 +61,26 @@ struct droid_yuv_format { int fourcc; /* __DRI_IMAGE_FOURCC_ */ }; +/* This enumeration can be deleted if Android defined it in + * system/core/include/system/graphics.h + */ +enum { + HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL = 0x100, + HAL_PIXEL_FORMAT_NV12 = 0x10F, + HAL_PIXEL_FORMAT_P010_INTEL = 0x110 +}; + /* The following table is used to look up a DRI image FourCC based * on native format and information contained in android_ycbcr struct. 
*/ static const struct droid_yuv_format droid_yuv_formats[] = { /* Native format, YCrCb, Chroma step, DRI image FourCC */ { HAL_PIXEL_FORMAT_YCbCr_420_888, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, + { HAL_PIXEL_FORMAT_P010_INTEL, 0, 4, __DRI_IMAGE_FOURCC_P010 }, { HAL_PIXEL_FORMAT_YCbCr_420_888, 0, 1, __DRI_IMAGE_FOURCC_YUV420 }, { HAL_PIXEL_FORMAT_YCbCr_420_888, 1, 1, __DRI_IMAGE_FOURCC_YVU420 }, { HAL_PIXEL_FORMAT_YV12, 1, 1, __DRI_IMAGE_FOURCC_YVU420 }, + { HAL_PIXEL_FORMAT_NV12, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, + { HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, /* HACK: See droid_create_image_from_prime_fd() and * https://issuetracker.google.com/32077885. */ { HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED, 0, 2, __DRI_IMAGE_FOURCC_NV12 }, @@ -248,6 +262,51 @@ droid_window_dequeue_buffer(struct dri2_egl_surface *dri2_surf) return EGL_TRUE; } +static int +droid_resolve_format(struct dri2_egl_display *dri2_dpy, + struct ANativeWindowBuffer *buf) +{ + int format = -1; + int ret; + + if (buf->format != HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED) + return buf->format; +#ifdef HAVE_GRALLOC1 + if(dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) { + + if (!dri2_dpy->pfn_getFormat) { + _eglLog(_EGL_WARNING, "Gralloc does not support getFormat"); + return -1; + } + ret = dri2_dpy->pfn_getFormat(dri2_dpy->gralloc1_dvc, buf->handle, + &format); + if (ret) { + _eglLog(_EGL_WARNING, "gralloc->getFormat failed: %d", ret); + return -1; + } + } else { +#else + const gralloc_module_t *gralloc0; + gralloc0 = dri2_dpy->gralloc; + + if (!gralloc0->perform) { + _eglLog(_EGL_WARNING, "gralloc->perform not supported"); + return -1; + } + ret = gralloc0->perform(dri2_dpy->gralloc, + GRALLOC_DRM_GET_FORMAT, + buf->handle, &format); + if (ret){ + _eglLog(_EGL_WARNING, "gralloc->perform failed with error: %d", ret); + return -1; + } +#endif +#ifdef HAVE_GRALLOC1 + } +#endif + return format; +} + static EGLBoolean droid_window_enqueue_buffer(_EGLDisplay *disp, struct 
dri2_egl_surface *dri2_surf) { @@ -462,7 +521,7 @@ droid_swap_interval(_EGLDriver *drv, _EGLDisplay *dpy, struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf); struct ANativeWindow *window = dri2_surf->window; - if (window->setSwapInterval(window, interval)) + if (window && window->setSwapInterval(window, interval)) return EGL_FALSE; surf->SwapInterval = interval; @@ -663,11 +722,18 @@ droid_query_buffer_age(_EGLDriver *drv, { struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surface); + /* To avoid blocking other EGL calls, release the display mutex before + * we enter droid_window_dequeue_buffer() and re-acquire the mutex upon + * return. + */ + mtx_unlock(&disp->Mutex); if (update_buffers(dri2_surf) < 0) { _eglError(EGL_BAD_ALLOC, "droid_query_buffer_age"); + mtx_lock(&disp->Mutex); return -1; } + mtx_lock(&disp->Mutex); return dri2_surf->back ? dri2_surf->back->age : 0; } @@ -730,6 +796,31 @@ droid_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw) return EGL_TRUE; } +static int get_ycbcr_from_flexlayout(struct android_flex_layout *outFlexLayout, struct android_ycbcr *ycbcr) +{ + + for( int i = 0; i < outFlexLayout->num_planes; i++) { + switch(outFlexLayout->planes[i].component){ + case FLEX_COMPONENT_Y: + ycbcr->y = outFlexLayout->planes[i].top_left; + ycbcr->ystride = outFlexLayout->planes[i].v_increment; + break; + case FLEX_COMPONENT_Cb: + ycbcr->cb = outFlexLayout->planes[i].top_left; + ycbcr->cstride = outFlexLayout->planes[i].v_increment; + break; + case FLEX_COMPONENT_Cr: + ycbcr->cr = outFlexLayout->planes[i].top_left; + ycbcr->chroma_step = outFlexLayout->planes[i].h_increment; + break; + default: + _eglLog(_EGL_WARNING,"unknown component 0x%x", __func__, outFlexLayout->planes[i].component); + break; + } + } + return 0; +} + #if ANDROID_API_LEVEL >= 23 static EGLBoolean droid_set_damage_region(_EGLDriver *drv, @@ -773,30 +864,70 @@ droid_create_image_from_prime_fd_yuv(_EGLDisplay *disp, _EGLContext *ctx, { struct 
dri2_egl_display *dri2_dpy = dri2_egl_display(disp); struct android_ycbcr ycbcr; +#ifdef HAVE_GRALLOC1 + struct android_flex_layout outFlexLayout; + gralloc1_rect_t accessRegion; +#endif size_t offsets[3]; size_t pitches[3]; int is_ycrcb; int fourcc; int ret; - if (!dri2_dpy->gralloc->lock_ycbcr) { - _eglLog(_EGL_WARNING, "Gralloc does not support lock_ycbcr"); + int format = droid_resolve_format(dri2_dpy, buf); + if (format < 0) { + _eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR"); return NULL; } memset(&ycbcr, 0, sizeof(ycbcr)); - ret = dri2_dpy->gralloc->lock_ycbcr(dri2_dpy->gralloc, buf->handle, - 0, 0, 0, 0, 0, &ycbcr); - if (ret) { - /* HACK: See droid_create_image_from_prime_fd() and - * https://issuetracker.google.com/32077885.*/ - if (buf->format == HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED) - return NULL; - - _eglLog(_EGL_WARNING, "gralloc->lock_ycbcr failed: %d", ret); - return NULL; - } - dri2_dpy->gralloc->unlock(dri2_dpy->gralloc, buf->handle); +#ifdef HAVE_GRALLOC1 + if(dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) { + if (!dri2_dpy->pfn_lockflex) { + _eglLog(_EGL_WARNING, "Gralloc does not support lockflex"); + return NULL; + } + + ret = dri2_dpy->pfn_lockflex(dri2_dpy->gralloc1_dvc, buf->handle, + 0, 0, &accessRegion, &outFlexLayout, -1); + if (ret) { + _eglLog(_EGL_WARNING, "gralloc->lockflex failed: %d", ret); + return NULL; + } + ret = get_ycbcr_from_flexlayout(&outFlexLayout, &ycbcr); + if (ret) { + _eglLog(_EGL_WARNING, "gralloc->lockflex failed: %d", ret); + return NULL; + } + int outReleaseFence = 0; + dri2_dpy->pfn_unlock(dri2_dpy->gralloc1_dvc, buf->handle, &outReleaseFence); + } else { +#endif + const gralloc_module_t *gralloc0; + gralloc0 = dri2_dpy->gralloc; + + if (!gralloc0->lock_ycbcr) { + _eglLog(_EGL_WARNING, "Gralloc does not support lock_ycbcr"); + return NULL; + } + + ret = gralloc0->lock_ycbcr(gralloc0, buf->handle, + 0, 0, 0, 0, 0, &ycbcr); + + if (ret) { + /* HACK: See droid_create_image_from_prime_fd() 
and + * https://issuetracker.google.com/32077885.*/ + if (buf->format == HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED) + return NULL; + + _eglLog(_EGL_WARNING, "gralloc->lock_ycbcr failed: %d", ret); + return NULL; + } + + gralloc0->unlock(dri2_dpy->gralloc, buf->handle); +#ifdef HAVE_GRALLOC1 + } +#endif /* When lock_ycbcr's usage argument contains no SW_READ/WRITE flags * it will return the .y/.cb/.cr pointers based on a NULL pointer, @@ -821,14 +952,15 @@ droid_create_image_from_prime_fd_yuv(_EGLDisplay *disp, _EGLContext *ctx, /* .chroma_step is the byte distance between the same chroma channel * values of subsequent pixels, assumed to be the same for Cb and Cr. */ - fourcc = get_fourcc_yuv(buf->format, is_ycrcb, ycbcr.chroma_step); + fourcc = get_fourcc_yuv(format, is_ycrcb, ycbcr.chroma_step); if (fourcc == -1) { _eglLog(_EGL_WARNING, "unsupported YUV format, native = %x, is_ycrcb = %d, chroma_step = %d", - buf->format, is_ycrcb, ycbcr.chroma_step); + format, is_ycrcb, ycbcr.chroma_step); return NULL; } - if (ycbcr.chroma_step == 2) { + /* FIXME? we should not rely on chroma_step */ + if (ycbcr.chroma_step == 2 || ycbcr.chroma_step == 4) { /* Semi-planar Y + CbCr or Y + CrCb format. 
*/ const EGLint attr_list_2plane[] = { EGL_WIDTH, buf->width, @@ -870,9 +1002,16 @@ static _EGLImage * droid_create_image_from_prime_fd(_EGLDisplay *disp, _EGLContext *ctx, struct ANativeWindowBuffer *buf, int fd) { + struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp); unsigned int pitch; - if (is_yuv(buf->format)) { + int format = droid_resolve_format(dri2_dpy, buf); + if (format < 0) { + _eglLog(_EGL_WARNING, "Could not resolve buffer format"); + return NULL; + } + + if (is_yuv(format)) { _EGLImage *image; image = droid_create_image_from_prime_fd_yuv(disp, ctx, buf, fd); @@ -887,13 +1026,13 @@ droid_create_image_from_prime_fd(_EGLDisplay *disp, _EGLContext *ctx, return image; } - const int fourcc = get_fourcc(buf->format); + const int fourcc = get_fourcc(format); if (fourcc == -1) { _eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR"); return NULL; } - pitch = buf->stride * get_format_bpp(buf->format); + pitch = buf->stride * get_format_bpp(format); if (pitch == 0) { _eglError(EGL_BAD_PARAMETER, "eglCreateEGLImageKHR"); return NULL; @@ -1529,6 +1668,7 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp) _EGLDevice *dev; struct dri2_egl_display *dri2_dpy; const char *err; + hw_device_t *device; int ret; /* Not supported yet */ @@ -1548,6 +1688,27 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp) err = "DRI2: failed to get gralloc module"; goto cleanup; } + dri2_dpy->gralloc_version = dri2_dpy->gralloc->module_api_version; +#ifdef HAVE_GRALLOC1 + if (dri2_dpy->gralloc_version == HARDWARE_MODULE_API_VERSION(1, 0)) { + ret = dri2_dpy->gralloc->methods->open(dri2_dpy->gralloc, GRALLOC_HARDWARE_MODULE_ID, &device); + if (ret) { + err = "Failed to open hw_device device"; + goto cleanup; + } else { + dri2_dpy->gralloc1_dvc = (gralloc1_device_t *)device; + + dri2_dpy->pfn_lockflex = (GRALLOC1_PFN_LOCK_FLEX)\ + dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_LOCK_FLEX); + + dri2_dpy->pfn_getFormat = 
(GRALLOC1_PFN_GET_FORMAT)\ + dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_GET_FORMAT); + + dri2_dpy->pfn_unlock = (GRALLOC1_PFN_UNLOCK)\ + dri2_dpy->gralloc1_dvc->getFunction(dri2_dpy->gralloc1_dvc, GRALLOC1_FUNCTION_UNLOCK); + } + } +#endif disp->DriverData = (void *) dri2_dpy; diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c index eb9f5e2b1e2..817e9b1988a 100644 --- a/src/egl/drivers/dri2/platform_wayland.c +++ b/src/egl/drivers/dri2/platform_wayland.c @@ -1127,13 +1127,22 @@ drm_handle_device(void *data, struct wl_drm *drm, const char *device) if (dri2_dpy->fd == -1) { _eglLog(_EGL_WARNING, "wayland-egl: could not open %s (%s)", dri2_dpy->device_name, strerror(errno)); + free(dri2_dpy->device_name); + dri2_dpy->device_name = NULL; return; } if (drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER) { dri2_dpy->authenticated = true; } else { - drmGetMagic(dri2_dpy->fd, &magic); + if (drmGetMagic(dri2_dpy->fd, &magic)) { + close(dri2_dpy->fd); + dri2_dpy->fd = -1; + free(dri2_dpy->device_name); + dri2_dpy->device_name = NULL; + _eglLog(_EGL_WARNING, "wayland-egl: drmGetMagic failed"); + return; + } wl_drm_authenticate(dri2_dpy->wl_drm, magic); } } @@ -1661,8 +1670,8 @@ swrast_update_buffers(struct dri2_egl_surface *dri2_surf) if (dri2_surf->back) return 0; - if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width || - dri2_surf->base.Height != dri2_surf->wl_win->attached_height) { + if (dri2_surf->base.Width != dri2_surf->wl_win->width || + dri2_surf->base.Height != dri2_surf->wl_win->height) { dri2_wl_release_buffers(dri2_surf); diff --git a/src/egl/drivers/haiku/egl_haiku.cpp b/src/egl/drivers/haiku/egl_haiku.cpp index a9c5cf8d29b..d4b046c79b4 100644 --- a/src/egl/drivers/haiku/egl_haiku.cpp +++ b/src/egl/drivers/haiku/egl_haiku.cpp @@ -29,6 +29,7 @@ #include "eglconfig.h" #include "eglcontext.h" +#include "egldevice.h" #include "egldisplay.h" #include "egldriver.h" #include 
"eglcurrent.h" @@ -215,7 +216,7 @@ init_haiku(_EGLDriver *drv, _EGLDisplay *dpy) _eglError(EGL_NOT_INITIALIZED, "DRI2: failed to find EGLDevice"); return EGL_FALSE; } - disp->Device = dev; + dpy->Device = dev; TRACE("Add configs\n"); if (!haiku_add_configs_for_visuals(dpy)) diff --git a/src/egl/generate/eglFunctionList.py b/src/egl/generate/eglFunctionList.py index fb5b3c30bdf..2cd35557bc4 100644 --- a/src/egl/generate/eglFunctionList.py +++ b/src/egl/generate/eglFunctionList.py @@ -196,8 +196,18 @@ def _eglFunc(name, method, static=None, public=False, inheader=None, prefix="dis # EGL_ANDROID_native_fence_sync _eglFunc("eglDupNativeFenceFDANDROID", "display"), + # EGL_ANDROID_blob_cache + _eglFunc("eglSetBlobCacheFuncsANDROID", "display"), + # EGL_EXT_image_dma_buf_import_modifiers _eglFunc("eglQueryDmaBufFormatsEXT", "display"), _eglFunc("eglQueryDmaBufModifiersEXT", "display"), + + # EGL_EXT_device_base + _eglFunc("eglQueryDeviceAttribEXT", "device"), + _eglFunc("eglQueryDeviceStringEXT", "device"), + _eglFunc("eglQueryDevicesEXT", "none"), + _eglFunc("eglQueryDisplayAttribEXT", "display"), + ) diff --git a/src/egl/main/eglcurrent.c b/src/egl/main/eglcurrent.c index 7af3011b757..545697e5662 100644 --- a/src/egl/main/eglcurrent.c +++ b/src/egl/main/eglcurrent.c @@ -137,13 +137,37 @@ _eglDestroyThreadInfo(_EGLThreadInfo *t) } +/** + * Delete/free a _EGLThreadInfo object. + */ +static void +_eglDestroyThreadInfoCallback(_EGLThreadInfo *t) +{ + /* If this callback is called on thread termination then try to also give a + * chance to cleanup to the client drivers. If called for module termination + * then just release the thread information as calling eglReleaseThread + * would result in a deadlock. + */ + if (_egl_TSDInitialized) { + /* The callback handler has replaced the TLS entry, which is passed in as + * 't', with NULL. Restore it here so that the release thread finds it in + * the TLS entry. 
+ */ + _eglSetTSD(t); + eglReleaseThread(); + } else { + _eglDestroyThreadInfo(t); + } +} + + /** * Make sure TSD is initialized and return current value. */ static inline _EGLThreadInfo * _eglCheckedGetTSD(void) { - if (_eglInitTSD(&_eglDestroyThreadInfo) != EGL_TRUE) { + if (_eglInitTSD(&_eglDestroyThreadInfoCallback) != EGL_TRUE) { _eglLog(_EGL_FATAL, "failed to initialize \"current\" system"); return NULL; } diff --git a/src/egl/main/egldispatchstubs.c b/src/egl/main/egldispatchstubs.c index bfc3195c779..96708aeb0dc 100644 --- a/src/egl/main/egldispatchstubs.c +++ b/src/egl/main/egldispatchstubs.c @@ -59,6 +59,11 @@ static __eglMustCastToProperFunctionPointerType FetchVendorFunc(__EGLvendorInfo } if (func == NULL) { if (errorCode != EGL_SUCCESS) { + // Since we have no vendor, the follow-up eglGetError() call will + // end up using the GLVND error code. Set it here. + if (vendor == NULL) { + exports->setEGLError(errorCode); + } _eglError(errorCode, __EGL_DISPATCH_FUNC_NAMES[index]); } return NULL; diff --git a/src/gallium/auxiliary/Android.mk b/src/gallium/auxiliary/Android.mk index acd243b8346..7618c6fcd93 100644 --- a/src/gallium/auxiliary/Android.mk +++ b/src/gallium/auxiliary/Android.mk @@ -36,7 +36,8 @@ LOCAL_SRC_FILES := \ util/u_debug_stack_android.cpp LOCAL_C_INCLUDES := \ - $(GALLIUM_TOP)/auxiliary/util + $(GALLIUM_TOP)/auxiliary/util \ + $(MESA_TOP)/src/util ifeq ($(MESA_ENABLE_LLVM),true) LOCAL_SRC_FILES += \ diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c index 0ad274b535a..4fa36cc7de4 100644 --- a/src/gallium/auxiliary/nir/tgsi_to_nir.c +++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c @@ -375,7 +375,7 @@ ttn_emit_declaration(struct ttn_compile *c) c->outputs[idx] = var; for (int i = 0; i < array_size; i++) - b->shader->info.outputs_written |= 1 << (var->data.location + i); + b->shader->info.outputs_written |= 1ull << (var->data.location + i); } break; case TGSI_FILE_CONSTANT: diff --git 
a/src/gallium/auxiliary/pipe-loader/pipe_loader.h b/src/gallium/auxiliary/pipe-loader/pipe_loader.h index 05be94cae31..9b264145347 100644 --- a/src/gallium/auxiliary/pipe-loader/pipe_loader.h +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.h @@ -142,7 +142,7 @@ pipe_loader_release(struct pipe_loader_device **devs, int ndev); */ bool pipe_loader_sw_probe_dri(struct pipe_loader_device **devs, - struct drisw_loader_funcs *drisw_lf); + const struct drisw_loader_funcs *drisw_lf); /** * Initialize a kms backed sw device given an fd. diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c index d387ce90d32..587b6f8567b 100644 --- a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c +++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c @@ -132,7 +132,7 @@ pipe_loader_sw_probe_teardown_common(struct pipe_loader_sw_device *sdev) #ifdef HAVE_PIPE_LOADER_DRI bool -pipe_loader_sw_probe_dri(struct pipe_loader_device **devs, struct drisw_loader_funcs *drisw_lf) +pipe_loader_sw_probe_dri(struct pipe_loader_device **devs, const struct drisw_loader_funcs *drisw_lf) { struct pipe_loader_sw_device *sdev = CALLOC_STRUCT(pipe_loader_sw_device); int i; diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index e13500a7f7b..75c2e08632e 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -1004,11 +1004,12 @@ get_block_tessfactor_writemask(const struct tgsi_shader_info *info, struct tgsi_full_instruction *inst; unsigned writemask = 0; - do { - tgsi_parse_token(parse); - assert(parse->FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION); - inst = &parse->FullToken.FullInstruction; - check_no_subroutines(inst); + tgsi_parse_token(parse); + assert(parse->FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION); + inst = &parse->FullToken.FullInstruction; + check_no_subroutines(inst); + + while (inst->Instruction.Opcode != end_opcode) { /* 
Recursively process nested blocks. */ switch (inst->Instruction.Opcode) { @@ -1016,20 +1017,26 @@ get_block_tessfactor_writemask(const struct tgsi_shader_info *info, case TGSI_OPCODE_UIF: writemask |= get_block_tessfactor_writemask(info, parse, TGSI_OPCODE_ENDIF); - continue; + break; case TGSI_OPCODE_BGNLOOP: writemask |= get_block_tessfactor_writemask(info, parse, TGSI_OPCODE_ENDLOOP); - continue; + break; case TGSI_OPCODE_BARRIER: unreachable("nested BARRIER is illegal"); - continue; + break; + + default: + writemask |= get_inst_tessfactor_writemask(info, inst); } - writemask |= get_inst_tessfactor_writemask(info, inst); - } while (inst->Instruction.Opcode != end_opcode); + tgsi_parse_token(parse); + assert(parse->FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION); + inst = &parse->FullToken.FullInstruction; + check_no_subroutines(inst); + } return writemask; } @@ -1043,18 +1050,20 @@ get_if_block_tessfactor_writemask(const struct tgsi_shader_info *info, struct tgsi_full_instruction *inst; unsigned then_tessfactor_writemask = 0; unsigned else_tessfactor_writemask = 0; + unsigned writemask; bool is_then = true; - do { - tgsi_parse_token(parse); - assert(parse->FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION); - inst = &parse->FullToken.FullInstruction; - check_no_subroutines(inst); + tgsi_parse_token(parse); + assert(parse->FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION); + inst = &parse->FullToken.FullInstruction; + check_no_subroutines(inst); + + while (inst->Instruction.Opcode != TGSI_OPCODE_ENDIF) { switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ELSE: is_then = false; - continue; + break; /* Recursively process nested blocks. */ case TGSI_OPCODE_IF: @@ -1063,28 +1072,33 @@ get_if_block_tessfactor_writemask(const struct tgsi_shader_info *info, is_then ? 
&then_tessfactor_writemask : &else_tessfactor_writemask, cond_block_tf_writemask); - continue; + break; case TGSI_OPCODE_BGNLOOP: *cond_block_tf_writemask |= get_block_tessfactor_writemask(info, parse, TGSI_OPCODE_ENDLOOP); - continue; + break; case TGSI_OPCODE_BARRIER: unreachable("nested BARRIER is illegal"); - continue; - } - - /* Process an instruction in the current block. */ - unsigned writemask = get_inst_tessfactor_writemask(info, inst); + break; + default: + /* Process an instruction in the current block. */ + writemask = get_inst_tessfactor_writemask(info, inst); - if (writemask) { - if (is_then) - then_tessfactor_writemask |= writemask; - else - else_tessfactor_writemask |= writemask; + if (writemask) { + if (is_then) + then_tessfactor_writemask |= writemask; + else + else_tessfactor_writemask |= writemask; + } } - } while (inst->Instruction.Opcode != TGSI_OPCODE_ENDIF); + + tgsi_parse_token(parse); + assert(parse->FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION); + inst = &parse->FullToken.FullInstruction; + check_no_subroutines(inst); + } if (then_tessfactor_writemask || else_tessfactor_writemask) { /* If both statements write the same tess factor channels, @@ -1147,7 +1161,7 @@ tgsi_scan_tess_ctrl(const struct tgsi_token *tokens, case TGSI_OPCODE_BGNLOOP: cond_block_tf_writemask |= - get_block_tessfactor_writemask(info, &parse, TGSI_OPCODE_ENDIF); + get_block_tessfactor_writemask(info, &parse, TGSI_OPCODE_ENDLOOP); continue; case TGSI_OPCODE_BARRIER: diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h index b06fb111709..fa1e920b509 100644 --- a/src/gallium/auxiliary/util/u_inlines.h +++ b/src/gallium/auxiliary/util/u_inlines.h @@ -154,6 +154,25 @@ pipe_resource_reference(struct pipe_resource **dst, struct pipe_resource *src) *dst = src; } +/** + * Same as pipe_surface_release, but used when pipe_context doesn't exist + * anymore. 
+ */ +static inline void +pipe_surface_release_no_context(struct pipe_surface **ptr) +{ + struct pipe_surface *surf = *ptr; + + if (pipe_reference_described(&surf->reference, NULL, + (debug_reference_descriptor) + debug_describe_surface)) { + /* trivially destroy pipe_surface */ + pipe_resource_reference(&surf->texture, NULL); + free(surf); + } + *ptr = NULL; +} + /** * Set *dst to \p src with proper reference counting. * diff --git a/src/gallium/drivers/freedreno/drm/msm_ringbuffer.c b/src/gallium/drivers/freedreno/drm/msm_ringbuffer.c index f1e96740231..9736aebd7f6 100644 --- a/src/gallium/drivers/freedreno/drm/msm_ringbuffer.c +++ b/src/gallium/drivers/freedreno/drm/msm_ringbuffer.c @@ -97,6 +97,7 @@ static void cmd_free(struct msm_cmd *cmd) { fd_bo_del(cmd->ring_bo); + free(cmd->relocs); free(cmd); } @@ -655,6 +656,7 @@ msm_ringbuffer_destroy(struct fd_ringbuffer *ring) _mesa_set_destroy(msm_ring->u.ring_set, unref_rings); + free(msm_ring->u.reloc_bos); free(msm_ring); } else { struct fd_submit *submit = msm_ring->u.submit; @@ -663,6 +665,7 @@ msm_ringbuffer_destroy(struct fd_ringbuffer *ring) cmd_free(msm_ring->u.cmds[i]); } + free(msm_ring->u.cmds); slab_free_st(&to_msm_submit(submit)->ring_pool, msm_ring); } } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 8767e5efb99..ca0192a9cc0 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -370,7 +370,8 @@ class ConstantFolding : public Pass void expr(Instruction *, ImmediateValue&, ImmediateValue&); void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&); - void opnd(Instruction *, ImmediateValue&, int s); + /* true if i was deleted */ + bool opnd(Instruction *i, ImmediateValue&, int s); void opnd3(Instruction *, ImmediateValue&); void unary(Instruction *, const ImmediateValue&); @@ -414,18 +415,21 @@ 
ConstantFolding::visit(BasicBlock *bb) if (i->srcExists(2) && i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1) && - i->src(2).getImmediate(src2)) + i->src(2).getImmediate(src2)) { expr(i, src0, src1, src2); - else + } else if (i->srcExists(1) && - i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1)) + i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1)) { expr(i, src0, src1); - else - if (i->srcExists(0) && i->src(0).getImmediate(src0)) - opnd(i, src0, 0); - else - if (i->srcExists(1) && i->src(1).getImmediate(src1)) - opnd(i, src1, 1); + } else + if (i->srcExists(0) && i->src(0).getImmediate(src0)) { + if (opnd(i, src0, 0)) + continue; + } else + if (i->srcExists(1) && i->src(1).getImmediate(src1)) { + if (opnd(i, src1, 1)) + continue; + } if (i->srcExists(2) && i->src(2).getImmediate(src2)) opnd3(i, src2); } @@ -1010,12 +1014,13 @@ ConstantFolding::createMul(DataType ty, Value *def, Value *a, int64_t b, Value * return false; } -void +bool ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) { const int t = !s; const operation op = i->op; Instruction *newi = i; + bool deleted = false; switch (i->op) { case OP_SPLIT: { @@ -1035,6 +1040,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) val >>= bitsize; } delete_Instruction(prog, i); + deleted = true; break; } case OP_MUL: @@ -1049,6 +1055,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0), TYPE_S32, i->getSrc(t), bld.mkImm(0)); delete_Instruction(prog, i); + deleted = true; } else if (imm0.isInteger(0) || imm0.isInteger(1)) { // The high bits can't be set in this case (either mul by 0 or // unsigned by 1) @@ -1099,8 +1106,10 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) if (!isFloatType(i->dType) && !i->src(t).mod) { bld.setPosition(i, false); int64_t b = typeSizeof(i->dType) == 8 ? 
imm0.reg.data.s64 : imm0.reg.data.s32; - if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, NULL)) + if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, NULL)) { delete_Instruction(prog, i); + deleted = true; + } } else if (i->postFactor && i->sType == TYPE_F32) { /* Can't emit a postfactor with an immediate, have to fold it in */ @@ -1136,8 +1145,10 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) if (!isFloatType(i->dType) && !i->subOp && !i->src(t).mod && !i->src(2).mod) { bld.setPosition(i, false); int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32; - if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, i->getSrc(2))) + if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, i->getSrc(2))) { delete_Instruction(prog, i); + deleted = true; + } } break; case OP_SUB: @@ -1207,6 +1218,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s)); delete_Instruction(prog, i); + deleted = true; } else if (imm0.reg.data.s32 == -1) { i->op = OP_NEG; @@ -1239,6 +1251,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB); delete_Instruction(prog, i); + deleted = true; } break; @@ -1270,6 +1283,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) newi = bld.mkOp2(OP_UNION, TYPE_S32, i->getDef(0), v1, v2); delete_Instruction(prog, i); + deleted = true; } } else if (s == 1) { // In this case, we still want the optimized lowering that we get @@ -1286,6 +1300,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) newi->src(1).mod = Modifier(NV50_IR_MOD_NEG); delete_Instruction(prog, i); + deleted = true; } break; @@ -1298,7 +1313,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t)); CondCode cc, ccZ; if (imm0.reg.data.u32 != 0 || !si) - return; + return false; cc = 
si->setCond; ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U); // We do everything assuming var (cmp) 0, reverse the condition if 0 is @@ -1324,7 +1339,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) case CC_GT: break; // bool > 0 -- bool case CC_NE: break; // bool != 0 -- bool default: - return; + return false; } // Update the condition of this SET to be identical to the origin set, @@ -1359,13 +1374,13 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) } else if (src->asCmp()) { CmpInstruction *cmp = src->asCmp(); if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1) - return; + return false; if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32)) - return; + return false; if (imm0.reg.data.f32 != 1.0) - return; + return false; if (cmp->dType != TYPE_U32) - return; + return false; cmp->dType = TYPE_F32; if (i->src(t).mod != Modifier(0)) { @@ -1432,13 +1447,13 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) case OP_MUL: int muls; if (isFloatType(si->dType)) - return; + return false; if (si->src(1).getImmediate(imm1)) muls = 1; else if (si->src(0).getImmediate(imm1)) muls = 0; else - return; + return false; bld.setPosition(i, false); i->op = OP_MUL; @@ -1449,15 +1464,15 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) case OP_ADD: int adds; if (isFloatType(si->dType)) - return; + return false; if (si->op != OP_SUB && si->src(0).getImmediate(imm1)) adds = 0; else if (si->src(1).getImmediate(imm1)) adds = 1; else - return; + return false; if (si->src(!adds).mod != Modifier(0)) - return; + return false; // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z)) // This is more operations, but if one of x, y is an immediate, then @@ -1472,7 +1487,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) bld.mkImm(imm0.reg.data.u32))); break; default: - return; + return false; } } break; @@ -1497,7 +1512,7 @@ ConstantFolding::opnd(Instruction *i, 
ImmediateValue &imm0, int s) case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break; case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break; default: - return; + return false; } if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0) res = 31 - res; @@ -1523,11 +1538,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) // TODO: handle 64-bit values properly if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8) - return; + return false; // TODO: handle single byte/word extractions if (i->subOp) - return; + return false; bld.setPosition(i, true); /* make sure bld is init'ed */ @@ -1564,7 +1579,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) CLAMP(imm0.reg.data.u16, umin, umax) : \ imm0.reg.data.u16; \ break; \ - default: return; \ + default: return false; \ } \ i->setSrc(0, bld.mkImm(res.data.dst)); \ break @@ -1591,7 +1606,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break; case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break; default: - return; + return false; } i->setSrc(0, bld.mkImm(res.data.f32)); break; @@ -1612,12 +1627,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break; case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break; default: - return; + return false; } i->setSrc(0, bld.mkImm(res.data.f64)); break; default: - return; + return false; } #undef CASE @@ -1628,7 +1643,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) break; } default: - return; + return false; } // This can get left behind some of the optimizations which simplify @@ -1643,6 +1658,7 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) if (newi->op != op) foldCount++; + return deleted; } // ============================================================================= diff --git 
a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index fb4a259ce16..e1b2e20810a 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -600,25 +600,23 @@ static inline void nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s, unsigned nr, void **hwcso) { + unsigned highest_found = 0; unsigned i; assert(nr <= PIPE_MAX_SAMPLERS); for (i = 0; i < nr; ++i) { struct nv50_tsc_entry *old = nv50->samplers[s][i]; + if (hwcso[i]) + highest_found = i; + nv50->samplers[s][i] = nv50_tsc_entry(hwcso[i]); if (old) nv50_screen_tsc_unlock(nv50->screen, old); } assert(nv50->num_samplers[s] <= PIPE_MAX_SAMPLERS); - for (; i < nv50->num_samplers[s]; ++i) { - if (nv50->samplers[s][i]) { - nv50_screen_tsc_unlock(nv50->screen, nv50->samplers[s][i]); - nv50->samplers[s][i] = NULL; - } - } - - nv50->num_samplers[s] = nr; + if (nr >= nv50->num_samplers[s]) + nv50->num_samplers[s] = highest_found + 1; nv50->dirty_3d |= NV50_NEW_3D_SAMPLERS; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index f2393cb27b5..9653de86fe9 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -464,11 +464,15 @@ nvc0_stage_sampler_states_bind(struct nvc0_context *nvc0, unsigned s, unsigned nr, void **hwcso) { + unsigned highest_found = 0; unsigned i; for (i = 0; i < nr; ++i) { struct nv50_tsc_entry *old = nvc0->samplers[s][i]; + if (hwcso[i]) + highest_found = i; + if (hwcso[i] == old) continue; nvc0->samplers_dirty[s] |= 1 << i; @@ -477,14 +481,8 @@ nvc0_stage_sampler_states_bind(struct nvc0_context *nvc0, if (old) nvc0_screen_tsc_unlock(nvc0->screen, old); } - for (; i < nvc0->num_samplers[s]; ++i) { - if (nvc0->samplers[s][i]) { - nvc0_screen_tsc_unlock(nvc0->screen, nvc0->samplers[s][i]); - nvc0->samplers[s][i] = NULL; - } - } - - nvc0->num_samplers[s] = nr; + if (nr >= 
nvc0->num_samplers[s]) + nvc0->num_samplers[s] = highest_found + 1; } static void diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 2680396c3d6..41e83af1db1 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -105,6 +105,12 @@ static void r600_destroy_context(struct pipe_context *context) } util_unreference_framebuffer_state(&rctx->framebuffer.state); + if (rctx->gs_rings.gsvs_ring.buffer) + pipe_resource_reference(&rctx->gs_rings.gsvs_ring.buffer, NULL); + + if (rctx->gs_rings.esgs_ring.buffer) + pipe_resource_reference(&rctx->gs_rings.esgs_ring.buffer, NULL); + for (sh = 0; sh < PIPE_SHADER_TYPES; ++sh) for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; ++i) rctx->b.b.set_constant_buffer(context, sh, i, NULL); diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c index ccabab9cdb0..92f243b5c9a 100644 --- a/src/gallium/drivers/r600/r600_query.c +++ b/src/gallium/drivers/r600/r600_query.c @@ -1636,7 +1636,7 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, } if (query->buffer.previous) { - u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16, + u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 256, &tmp_buffer_offset, &tmp_buffer); if (!tmp_buffer) return; diff --git a/src/gallium/drivers/r600/sb/sb_bc_builder.cpp b/src/gallium/drivers/r600/sb/sb_bc_builder.cpp index 5681fdc4425..b7d87eac9f4 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_builder.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_builder.cpp @@ -567,7 +567,7 @@ int bc_builder::build_fetch_gds(fetch_node *n) { const fetch_op_info *fop = bc.op_ptr; unsigned gds_op = (ctx.fetch_opcode(bc.op) >> 8) & 0x3f; unsigned mem_op = 4; - assert(fop->flags && FF_GDS); + assert(fop->flags & FF_GDS); if (bc.op == FETCH_OP_TF_WRITE) { mem_op = 5; diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index 
b440230d227..91f38329d59 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -580,10 +580,12 @@ static int si_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_SUPPORTED: return (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC && (si_vce_is_fw_version_supported(sscreen) || - sscreen->info.family == CHIP_RAVEN)) || + sscreen->info.family == CHIP_RAVEN || + sscreen->info.family == CHIP_RAVEN2)) || (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN && (sscreen->info.family == CHIP_RAVEN || - si_radeon_uvd_enc_supported(sscreen))); + sscreen->info.family == CHIP_RAVEN2 || + si_radeon_uvd_enc_supported(sscreen))); case PIPE_VIDEO_CAP_NPOT_TEXTURES: return 1; case PIPE_VIDEO_CAP_MAX_WIDTH: @@ -631,7 +633,8 @@ static int si_get_video_param(struct pipe_screen *screen, return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; return false; case PIPE_VIDEO_FORMAT_JPEG: - if (sscreen->info.family == CHIP_RAVEN) + if (sscreen->info.family == CHIP_RAVEN || + sscreen->info.family == CHIP_RAVEN2) return true; if (sscreen->info.family < CHIP_CARRIZO || sscreen->info.family >= CHIP_VEGA10) return false; diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index 9b09c74d48a..7a2c7afdbfd 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -793,17 +793,10 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, emit_sample_streamout(cs, va + 32 * stream, stream); break; case PIPE_QUERY_TIME_ELAPSED: - /* Write the timestamp from the CP not waiting for - * outstanding draws (top-of-pipe). 
- */ - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_COUNT_SEL | - COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) | - COPY_DATA_DST_SEL(COPY_DATA_DST_MEM)); - radeon_emit(cs, 0); - radeon_emit(cs, 0); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0, + EOP_DST_SEL_MEM, EOP_INT_SEL_NONE, + EOP_DATA_SEL_TIMESTAMP, NULL, va, + 0, query->b.type); break; case PIPE_QUERY_PIPELINE_STATISTICS: radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 19522cc97b1..f1d5ad31365 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -561,6 +561,14 @@ void si_llvm_load_input_vs( /* Do multiple loads for special formats. */ switch (fix_fetch) { + case SI_FIX_FETCH_RG_64_FLOAT: + num_fetches = 1; /* 1 2-dword or 4-dword load */ + fetch_stride = 0; + if (util_last_bit(info->input_usage_mask[input_index]) >= 2) + num_channels = 4; /* 2 doubles in 4 dwords */ + else + num_channels = 2; /* 1 double in 2 dwords */ + break; case SI_FIX_FETCH_RGB_64_FLOAT: num_fetches = 3; /* 3 2-dword loads */ fetch_stride = 8; diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 87ca0161b45..cd38145daec 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -989,6 +989,9 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) LLVMValueRef data[4]; unsigned loc = variable->data.location; + if (loc >= VARYING_SLOT_VAR0 && nir->info.stage == MESA_SHADER_FRAGMENT) + ctx->abi.fs_input_attr_indices[loc - VARYING_SLOT_VAR0] = input_idx / 4; + for (unsigned i = 0; i < attrib_count; i++) { /* Packed components share the same location so skip * them if we have already processed the location. 
diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c index b741bcadec8..e6d97fe6727 100644 --- a/src/gallium/drivers/radeonsi/si_state_msaa.c +++ b/src/gallium/drivers/radeonsi/si_state_msaa.c @@ -101,6 +101,10 @@ static const uint64_t centroid_priority_4x = 0x3210321032103210ull; static const uint32_t sample_locs_8x[] = { FILL_SREG(-3,-5, 5, 1, -1, 3, 7,-7), FILL_SREG(-7,-1, 3, 7, -5, 5, 1,-3), + /* The following are unused by hardware, but we emit them to IBs + * instead of multiple SET_CONTEXT_REG packets. */ + 0, + 0, }; static const uint64_t centroid_priority_8x = 0x3546012735460127ull; diff --git a/src/gallium/drivers/radeonsi/si_uvd.c b/src/gallium/drivers/radeonsi/si_uvd.c index 1a9d8f8d9fa..8c9553acbf3 100644 --- a/src/gallium/drivers/radeonsi/si_uvd.c +++ b/src/gallium/drivers/radeonsi/si_uvd.c @@ -146,7 +146,8 @@ struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context, const struct pipe_video_codec *templ) { struct si_context *ctx = (struct si_context *)context; - bool vcn = (ctx->family == CHIP_RAVEN) ? 
true : false; + bool vcn = ctx->family == CHIP_RAVEN || + ctx->family == CHIP_RAVEN2; if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) { if (vcn) { diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c index 1d59dbfc12a..e8f0e291dc3 100644 --- a/src/gallium/drivers/v3d/v3d_screen.c +++ b/src/gallium/drivers/v3d/v3d_screen.c @@ -32,6 +32,7 @@ #include "util/u_format.h" #include "util/u_hash_table.h" #include "util/u_screen.h" +#include "util/u_transfer_helper.h" #include "util/ralloc.h" #include @@ -74,6 +75,7 @@ v3d_screen_destroy(struct pipe_screen *pscreen) v3d_simulator_destroy(screen); v3d_compiler_free(screen->compiler); + u_transfer_helper_destroy(pscreen->transfer_helper); close(screen->fd); ralloc_free(pscreen); diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index 94784bbdc0a..41e6ec5c1cb 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -572,7 +572,15 @@ vc4_resource_create_with_modifiers(struct pipe_screen *pscreen, goto fail; } - if (screen->ro && tmpl->bind & PIPE_BIND_SCANOUT) { + /* Set up the "scanout resource" (the dmabuf export of our buffer to + * the KMS handle) if the buffer might ever have + * resource_get_handle(WINSYS_HANDLE_TYPE_KMS) called on it. + * create_with_modifiers() doesn't give us usage flags, so we have to + * assume that all calls with modifiers are scanout-possible. 
+ */ + if (screen->ro && + ((tmpl->bind & PIPE_BIND_SCANOUT) || + !(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID))) { rsc->scanout = renderonly_scanout_for_resource(prsc, screen->ro, NULL); if (!rsc->scanout) diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index 14ee6cf09e5..e7f7c82c271 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -33,6 +33,7 @@ #include "util/u_format.h" #include "util/u_hash_table.h" #include "util/u_screen.h" +#include "util/u_transfer_helper.h" #include "util/ralloc.h" #include @@ -110,6 +111,8 @@ vc4_screen_destroy(struct pipe_screen *pscreen) vc4_simulator_destroy(screen); #endif + u_transfer_helper_destroy(pscreen->transfer_helper); + close(screen->fd); ralloc_free(pscreen); } diff --git a/src/gallium/drivers/virgl/virgl_buffer.c b/src/gallium/drivers/virgl/virgl_buffer.c index 88a22b56f9a..f72c93f4995 100644 --- a/src/gallium/drivers/virgl/virgl_buffer.c +++ b/src/gallium/drivers/virgl/virgl_buffer.c @@ -106,7 +106,6 @@ static void virgl_buffer_transfer_unmap(struct pipe_context *ctx, if (trans->base.usage & PIPE_TRANSFER_WRITE) { if (!(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) { struct virgl_screen *vs = virgl_screen(ctx->screen); - vbuf->base.clean = FALSE; vctx->num_transfers++; vs->vws->transfer_put(vs->vws, vbuf->base.hw_res, &transfer->box, trans->base.stride, trans->base.layer_stride, trans->offset, transfer->level); diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c index 4511bf3b2fb..61fb3f0636a 100644 --- a/src/gallium/drivers/virgl/virgl_context.c +++ b/src/gallium/drivers/virgl/virgl_context.c @@ -47,6 +47,12 @@ #include "virgl_resource.h" #include "virgl_screen.h" +struct virgl_vertex_elements_state { + uint32_t handle; + uint8_t binding_map[PIPE_MAX_ATTRIBS]; + uint8_t num_bindings; +}; + static uint32_t next_handle; uint32_t virgl_object_assign_handle(void) { @@ -385,29 
+391,54 @@ static void *virgl_create_vertex_elements_state(struct pipe_context *ctx, unsigned num_elements, const struct pipe_vertex_element *elements) { + struct pipe_vertex_element new_elements[PIPE_MAX_ATTRIBS]; struct virgl_context *vctx = virgl_context(ctx); - uint32_t handle = virgl_object_assign_handle(); - virgl_encoder_create_vertex_elements(vctx, handle, - num_elements, elements); - return (void*)(unsigned long)handle; + struct virgl_vertex_elements_state *state = + CALLOC_STRUCT(virgl_vertex_elements_state); + + for (int i = 0; i < num_elements; ++i) { + if (elements[i].instance_divisor) { + /* Virglrenderer doesn't deal with instance_divisor correctly if + * there isn't a 1:1 relationship between elements and bindings. + * So let's make sure there is, by duplicating bindings. + */ + for (int j = 0; j < num_elements; ++j) { + new_elements[j] = elements[j]; + new_elements[j].vertex_buffer_index = j; + state->binding_map[j] = elements[j].vertex_buffer_index; + } + elements = new_elements; + state->num_bindings = num_elements; + break; + } + } + state->handle = virgl_object_assign_handle(); + virgl_encoder_create_vertex_elements(vctx, state->handle, + num_elements, elements); + return state; } static void virgl_delete_vertex_elements_state(struct pipe_context *ctx, void *ve) { struct virgl_context *vctx = virgl_context(ctx); - uint32_t handle = (unsigned long)ve; - - virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_VERTEX_ELEMENTS); + struct virgl_vertex_elements_state *state = + (struct virgl_vertex_elements_state *)ve; + virgl_encode_delete_object(vctx, state->handle, VIRGL_OBJECT_VERTEX_ELEMENTS); + FREE(state); } static void virgl_bind_vertex_elements_state(struct pipe_context *ctx, void *ve) { struct virgl_context *vctx = virgl_context(ctx); - uint32_t handle = (unsigned long)ve; - virgl_encode_bind_object(vctx, handle, VIRGL_OBJECT_VERTEX_ELEMENTS); + struct virgl_vertex_elements_state *state = + (struct virgl_vertex_elements_state *)ve; + 
vctx->vertex_elements = state; + virgl_encode_bind_object(vctx, state ? state->handle : 0, + VIRGL_OBJECT_VERTEX_ELEMENTS); + vctx->vertex_array_dirty = TRUE; } static void virgl_set_vertex_buffers(struct pipe_context *ctx, @@ -429,7 +460,17 @@ static void virgl_hw_set_vertex_buffers(struct pipe_context *ctx) struct virgl_context *vctx = virgl_context(ctx); if (vctx->vertex_array_dirty) { - virgl_encoder_set_vertex_buffers(vctx, vctx->num_vertex_buffers, vctx->vertex_buffer); + struct virgl_vertex_elements_state *ve = vctx->vertex_elements; + + if (ve->num_bindings) { + struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS]; + for (int i = 0; i < ve->num_bindings; ++i) + vertex_buffers[i] = vctx->vertex_buffer[ve->binding_map[i]]; + + virgl_encoder_set_vertex_buffers(vctx, ve->num_bindings, vertex_buffers); + } else + virgl_encoder_set_vertex_buffers(vctx, vctx->num_vertex_buffers, vctx->vertex_buffer); + virgl_attach_res_vertex_buffers(vctx); } } diff --git a/src/gallium/drivers/virgl/virgl_context.h b/src/gallium/drivers/virgl/virgl_context.h index 20988baa3c7..09cf0db2ae4 100644 --- a/src/gallium/drivers/virgl/virgl_context.h +++ b/src/gallium/drivers/virgl/virgl_context.h @@ -32,6 +32,7 @@ struct pipe_screen; struct tgsi_token; struct u_upload_mgr; struct virgl_cmd_buf; +struct virgl_vertex_elements_state; struct virgl_sampler_view { struct pipe_sampler_view base; @@ -53,6 +54,7 @@ struct virgl_context { struct virgl_cmd_buf *cbuf; struct virgl_textures_info samplers[PIPE_SHADER_TYPES]; + struct virgl_vertex_elements_state *vertex_elements; struct pipe_framebuffer_state framebuffer; diff --git a/src/gallium/drivers/virgl/virgl_encode.c b/src/gallium/drivers/virgl/virgl_encode.c index e86d0711a57..ee2764d74ea 100644 --- a/src/gallium/drivers/virgl/virgl_encode.c +++ b/src/gallium/drivers/virgl/virgl_encode.c @@ -61,6 +61,12 @@ static void virgl_encoder_write_res(struct virgl_context *ctx, } } +static void virgl_dirty_res(struct virgl_resource *res) +{ + if 
(res) + res->clean = FALSE; +} + int virgl_encode_bind_object(struct virgl_context *ctx, uint32_t handle, uint32_t object) { @@ -615,6 +621,7 @@ int virgl_encode_sampler_view(struct virgl_context *ctx, if (res->u.b.target == PIPE_BUFFER) { virgl_encoder_write_dword(ctx->cbuf, state->u.buf.offset / elem_size); virgl_encoder_write_dword(ctx->cbuf, (state->u.buf.offset + state->u.buf.size) / elem_size - 1); + virgl_dirty_res(res); } else { virgl_encoder_write_dword(ctx->cbuf, state->u.tex.first_layer | state->u.tex.last_layer << 16); virgl_encoder_write_dword(ctx->cbuf, state->u.tex.first_level | state->u.tex.last_level << 8); @@ -949,6 +956,7 @@ int virgl_encode_set_shader_buffers(struct virgl_context *ctx, virgl_encoder_write_dword(ctx->cbuf, buffers[i].buffer_offset); virgl_encoder_write_dword(ctx->cbuf, buffers[i].buffer_size); virgl_encoder_write_res(ctx, res); + virgl_dirty_res(res); } else { virgl_encoder_write_dword(ctx->cbuf, 0); virgl_encoder_write_dword(ctx->cbuf, 0); @@ -972,6 +980,7 @@ int virgl_encode_set_hw_atomic_buffers(struct virgl_context *ctx, virgl_encoder_write_dword(ctx->cbuf, buffers[i].buffer_offset); virgl_encoder_write_dword(ctx->cbuf, buffers[i].buffer_size); virgl_encoder_write_res(ctx, res); + virgl_dirty_res(res); } else { virgl_encoder_write_dword(ctx->cbuf, 0); virgl_encoder_write_dword(ctx->cbuf, 0); @@ -999,6 +1008,7 @@ int virgl_encode_set_shader_images(struct virgl_context *ctx, virgl_encoder_write_dword(ctx->cbuf, images[i].u.buf.offset); virgl_encoder_write_dword(ctx->cbuf, images[i].u.buf.size); virgl_encoder_write_res(ctx, res); + virgl_dirty_res(res); } else { virgl_encoder_write_dword(ctx->cbuf, 0); virgl_encoder_write_dword(ctx->cbuf, 0); diff --git a/src/gallium/drivers/virgl/virgl_resource.c b/src/gallium/drivers/virgl/virgl_resource.c index db5e7dd61af..9174ec5cbbd 100644 --- a/src/gallium/drivers/virgl/virgl_resource.c +++ b/src/gallium/drivers/virgl/virgl_resource.c @@ -95,7 +95,11 @@ static void 
virgl_buffer_subdata(struct pipe_context *pipe, usage |= PIPE_TRANSFER_DISCARD_RANGE; u_box_1d(offset, size, &box); - virgl_transfer_inline_write(pipe, resource, 0, usage, &box, data, 0, 0); + + if (size >= (VIRGL_MAX_CMDBUF_DWORDS * 4)) + u_default_buffer_subdata(pipe, resource, usage, offset, size, data); + else + virgl_transfer_inline_write(pipe, resource, 0, usage, &box, data, 0, 0); } void virgl_init_context_resource_functions(struct pipe_context *ctx) diff --git a/src/gallium/drivers/virgl/virgl_winsys.h b/src/gallium/drivers/virgl/virgl_winsys.h index 0e6cb7953f6..b44f8aaa54a 100644 --- a/src/gallium/drivers/virgl/virgl_winsys.h +++ b/src/gallium/drivers/virgl/virgl_winsys.h @@ -31,7 +31,7 @@ struct pipe_fence_handle; struct winsys_handle; struct virgl_hw_res; -#define VIRGL_MAX_CMDBUF_DWORDS (16*1024) +#define VIRGL_MAX_CMDBUF_DWORDS (64 * 1024) struct virgl_drm_caps { union virgl_caps caps; diff --git a/src/gallium/state_trackers/clover/llvm/compat.hpp b/src/gallium/state_trackers/clover/llvm/compat.hpp index 975012cbda4..b91cb95a295 100644 --- a/src/gallium/state_trackers/clover/llvm/compat.hpp +++ b/src/gallium/state_trackers/clover/llvm/compat.hpp @@ -58,9 +58,14 @@ #include #include -#include #include +#if HAVE_LLVM >= 0x0800 +#include +#else +#include +#endif + namespace clover { namespace llvm { namespace compat { diff --git a/src/gallium/state_trackers/clover/meson.build b/src/gallium/state_trackers/clover/meson.build index 1a09d8f2ca9..a6729af2fb8 100644 --- a/src/gallium/state_trackers/clover/meson.build +++ b/src/gallium/state_trackers/clover/meson.build @@ -30,6 +30,7 @@ libcltgsi = static_library( files('tgsi/compiler.cpp', 'tgsi/invocation.hpp'), include_directories : clover_incs, cpp_args : [cpp_vis_args], + override_options : clover_cpp_std, ) libclllvm = static_library( @@ -56,6 +57,7 @@ libclllvm = static_library( )), ], dependencies : [dep_llvm, dep_elf], + override_options : clover_cpp_std, ) clover_files = files( @@ -119,4 +121,5 @@ 
libclover = static_library( include_directories : clover_incs, cpp_args : [clover_cpp_args, cpp_vis_args], link_with : [libcltgsi, libclllvm], + override_options : clover_cpp_std, ) diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c index a09787bb215..d99187c6eaa 100644 --- a/src/gallium/state_trackers/dri/dri2.c +++ b/src/gallium/state_trackers/dri/dri2.c @@ -176,6 +176,12 @@ static int convert_fourcc(int format, int *dri_components_p) format = __DRI_IMAGE_FORMAT_R8; dri_components = __DRI_IMAGE_COMPONENTS_Y_UV; break; + case __DRI_IMAGE_FOURCC_P010: + case __DRI_IMAGE_FOURCC_P012: + case __DRI_IMAGE_FOURCC_P016: + format = __DRI_IMAGE_FORMAT_R16; + dri_components = __DRI_IMAGE_COMPONENTS_Y_UV; + break; default: return -1; } diff --git a/src/gallium/state_trackers/dri/drisw.c b/src/gallium/state_trackers/dri/drisw.c index 886f94dc02c..5a0d2e1354d 100644 --- a/src/gallium/state_trackers/dri/drisw.c +++ b/src/gallium/state_trackers/dri/drisw.c @@ -421,12 +421,19 @@ static const __DRIextension *drisw_screen_extensions[] = { NULL }; -static struct drisw_loader_funcs drisw_lf = { +static const struct drisw_loader_funcs drisw_lf = { .get_image = drisw_get_image, .put_image = drisw_put_image, .put_image2 = drisw_put_image2 }; +static const struct drisw_loader_funcs drisw_shm_lf = { + .get_image = drisw_get_image, + .put_image = drisw_put_image, + .put_image2 = drisw_put_image2, + .put_image_shm = drisw_put_image_shm +}; + static const __DRIconfig ** drisw_init_screen(__DRIscreen * sPriv) { @@ -434,6 +441,7 @@ drisw_init_screen(__DRIscreen * sPriv) const __DRIconfig **configs; struct dri_screen *screen; struct pipe_screen *pscreen = NULL; + const struct drisw_loader_funcs *lf = &drisw_lf; screen = CALLOC_STRUCT(dri_screen); if (!screen) @@ -448,10 +456,10 @@ drisw_init_screen(__DRIscreen * sPriv) sPriv->extensions = drisw_screen_extensions; if (loader->base.version >= 4) { if (loader->putImageShm) - drisw_lf.put_image_shm = 
drisw_put_image_shm; + lf = &drisw_shm_lf; } - if (pipe_loader_sw_probe_dri(&screen->dev, &drisw_lf)) { + if (pipe_loader_sw_probe_dri(&screen->dev, lf)) { dri_init_options(screen); pscreen = pipe_loader_create_screen(screen->dev); diff --git a/src/gallium/state_trackers/nine/buffer9.h b/src/gallium/state_trackers/nine/buffer9.h index b04a0a721bb..1803d8d6405 100644 --- a/src/gallium/state_trackers/nine/buffer9.h +++ b/src/gallium/state_trackers/nine/buffer9.h @@ -104,7 +104,9 @@ NineBuffer9_Upload( struct NineBuffer9 *This ) struct NineDevice9 *device = This->base.base.device; assert(This->base.pool == D3DPOOL_MANAGED && This->managed.dirty); - nine_context_range_upload(device, &This->managed.pending_upload, This->base.resource, + nine_context_range_upload(device, &This->managed.pending_upload, + (struct NineUnknown *)This, + This->base.resource, This->managed.dirty_box.x, This->managed.dirty_box.width, (char *)This->managed.data + This->managed.dirty_box.x); diff --git a/src/gallium/state_trackers/nine/nine_state.c b/src/gallium/state_trackers/nine/nine_state.c index 74aaf57a549..c5596a5ee94 100644 --- a/src/gallium/state_trackers/nine/nine_state.c +++ b/src/gallium/state_trackers/nine/nine_state.c @@ -2418,6 +2418,7 @@ CSMT_ITEM_NO_WAIT(nine_context_gen_mipmap, } CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_range_upload, + ARG_BIND_REF(struct NineUnknown, src_ref), ARG_BIND_RES(struct pipe_resource, res), ARG_VAL(unsigned, offset), ARG_VAL(unsigned, size), @@ -2425,11 +2426,14 @@ CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_range_upload, { struct nine_context *context = &device->context; + /* Binding src_ref avoids release before upload */ + (void)src_ref; + context->pipe->buffer_subdata(context->pipe, res, 0, offset, size, data); } CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_box_upload, - ARG_BIND_REF(struct NineUnknown, dst), + ARG_BIND_REF(struct NineUnknown, src_ref), ARG_BIND_RES(struct pipe_resource, res), ARG_VAL(unsigned, level), ARG_COPY_REF(struct 
pipe_box, dst_box), @@ -2444,8 +2448,8 @@ CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_box_upload, struct pipe_transfer *transfer = NULL; uint8_t *map; - /* We just bind dst for the bind count */ - (void)dst; + /* Binding src_ref avoids release before upload */ + (void)src_ref; map = pipe->transfer_map(pipe, res, diff --git a/src/gallium/state_trackers/nine/nine_state.h b/src/gallium/state_trackers/nine/nine_state.h index 51e5e326527..55960007bfb 100644 --- a/src/gallium/state_trackers/nine/nine_state.h +++ b/src/gallium/state_trackers/nine/nine_state.h @@ -560,6 +560,7 @@ nine_context_gen_mipmap(struct NineDevice9 *device, void nine_context_range_upload(struct NineDevice9 *device, unsigned *counter, + struct NineUnknown *src_ref, struct pipe_resource *res, unsigned offset, unsigned size, @@ -568,7 +569,7 @@ nine_context_range_upload(struct NineDevice9 *device, void nine_context_box_upload(struct NineDevice9 *device, unsigned *counter, - struct NineUnknown *dst, + struct NineUnknown *src_ref, struct pipe_resource *res, unsigned level, const struct pipe_box *dst_box, diff --git a/src/gallium/state_trackers/nine/surface9.c b/src/gallium/state_trackers/nine/surface9.c index 5fd662fa049..10518219a0a 100644 --- a/src/gallium/state_trackers/nine/surface9.c +++ b/src/gallium/state_trackers/nine/surface9.c @@ -660,7 +660,7 @@ NineSurface9_CopyMemToDefault( struct NineSurface9 *This, nine_context_box_upload(This->base.base.device, &From->pending_uploads_counter, - (struct NineUnknown *)This, + (struct NineUnknown *)From, r_dst, This->level, &dst_box, diff --git a/src/gallium/state_trackers/nine/threadpool.c b/src/gallium/state_trackers/nine/threadpool.c index cc62fd25799..19721aab2dd 100644 --- a/src/gallium/state_trackers/nine/threadpool.c +++ b/src/gallium/state_trackers/nine/threadpool.c @@ -37,6 +37,7 @@ #include "os/os_thread.h" #include "threadpool.h" +/* POSIX thread function */ static void * threadpool_worker(void *data) { @@ -76,6 +77,15 @@ threadpool_worker(void 
*data) return NULL; } +/* Windows thread function */ +static DWORD NINE_WINAPI +wthreadpool_worker(void *data) +{ + threadpool_worker(data); + + return 0; +} + struct threadpool * _mesa_threadpool_create(struct NineSwapChain9 *swapchain) { @@ -87,7 +97,9 @@ _mesa_threadpool_create(struct NineSwapChain9 *swapchain) pthread_mutex_init(&pool->m, NULL); pthread_cond_init(&pool->new_work, NULL); - pool->wthread = NineSwapChain9_CreateThread(swapchain, threadpool_worker, pool); + /* This uses WINE's CreateThread, so the thread function needs to use + * the Windows ABI */ + pool->wthread = NineSwapChain9_CreateThread(swapchain, wthreadpool_worker, pool); if (!pool->wthread) { /* using pthread as fallback */ pthread_create(&pool->pthread, NULL, threadpool_worker, pool); diff --git a/src/gallium/state_trackers/nine/volume9.c b/src/gallium/state_trackers/nine/volume9.c index ec811aeba13..840f01dae10 100644 --- a/src/gallium/state_trackers/nine/volume9.c +++ b/src/gallium/state_trackers/nine/volume9.c @@ -449,7 +449,7 @@ NineVolume9_CopyMemToDefault( struct NineVolume9 *This, nine_context_box_upload(This->base.device, &From->pending_uploads_counter, - (struct NineUnknown *)This, + (struct NineUnknown *)From, r_dst, This->level, &dst_box, diff --git a/src/gallium/state_trackers/nine/volumetexture9.c b/src/gallium/state_trackers/nine/volumetexture9.c index 5dec4844864..c7191bce688 100644 --- a/src/gallium/state_trackers/nine/volumetexture9.c +++ b/src/gallium/state_trackers/nine/volumetexture9.c @@ -141,7 +141,8 @@ NineVolumeTexture9_dtor( struct NineVolumeTexture9 *This ) if (This->volumes) { for (l = 0; l <= This->base.base.info.last_level; ++l) - NineUnknown_Destroy(&This->volumes[l]->base); + if (This->volumes[l]) + NineUnknown_Destroy(&This->volumes[l]->base); FREE(This->volumes); } diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c index 5376be28531..9646427ea5f 100644 --- a/src/gallium/state_trackers/va/surface.c +++ 
b/src/gallium/state_trackers/va/surface.c @@ -598,10 +598,8 @@ surface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface, return VA_STATUS_SUCCESS; fail: - for (i = 0; i < VL_NUM_COMPONENTS; i++) { - if (resources[i]) - pscreen->resource_destroy(pscreen, resources[i]); - } + for (i = 0; i < VL_NUM_COMPONENTS; i++) + pipe_resource_reference(&resources[i], NULL); return result; } diff --git a/src/gallium/state_trackers/xa/xa_context.c b/src/gallium/state_trackers/xa/xa_context.c index ba220877c84..67d9eac53bb 100644 --- a/src/gallium/state_trackers/xa/xa_context.c +++ b/src/gallium/state_trackers/xa/xa_context.c @@ -91,6 +91,7 @@ xa_context_destroy(struct xa_context *r) } r->pipe->destroy(r->pipe); + free(r); } XA_EXPORT int diff --git a/src/gallium/state_trackers/xvmc/Makefile.am b/src/gallium/state_trackers/xvmc/Makefile.am index 85d0b5f4953..dc278099030 100644 --- a/src/gallium/state_trackers/xvmc/Makefile.am +++ b/src/gallium/state_trackers/xvmc/Makefile.am @@ -27,6 +27,7 @@ AM_CFLAGS = \ $(GALLIUM_CFLAGS) \ $(VISIBILITY_CFLAGS) \ $(VL_CFLAGS) \ + $(X11_INCLUDES) \ $(XCB_DRI3_CFLAGS) \ $(XVMC_CFLAGS) diff --git a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build index bd05b4f9692..bc72b1110a0 100644 --- a/src/gallium/targets/d3dadapter9/meson.build +++ b/src/gallium/targets/d3dadapter9/meson.build @@ -53,7 +53,7 @@ libgallium_nine = shared_library( libswkmsdri, ], dependencies : [ - dep_selinux, dep_expat, dep_libdrm, dep_llvm, + dep_selinux, dep_expat, dep_libdrm, dep_llvm, dep_thread, driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau, driver_i915, driver_svga, ], diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk index 9c43fa1e8fd..83f439071f8 100644 --- a/src/gallium/targets/dri/Android.mk +++ b/src/gallium/targets/dri/Android.mk @@ -43,9 +43,17 @@ LOCAL_SHARED_LIBRARIES := \ libbacktrace \ libdl \ libglapi \ - libexpat \ libz +# If Android version 
>=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES := \ + libexpat +else +LOCAL_SHARED_LIBRARIES += \ + libexpat +endif + $(foreach d, $(MESA_BUILD_GALLIUM), $(eval LOCAL_CFLAGS += $(patsubst HAVE_%,-D%,$(d)))) # sort GALLIUM_LIBS to remove any duplicates diff --git a/src/gallium/targets/pipe-loader/meson.build b/src/gallium/targets/pipe-loader/meson.build index 5a44102a69d..e9454d5666a 100644 --- a/src/gallium/targets/pipe-loader/meson.build +++ b/src/gallium/targets/pipe-loader/meson.build @@ -31,7 +31,7 @@ if (with_gallium_va or with_gallium_vdpau or with_gallium_omx != 'disabled' or with_gallium_xvmc or with_dri) pipe_loader_link_with += libgalliumvl else - pipe_loader_link_with += libgalliumvl_stubs + pipe_loader_link_with += libgalliumvl_stub endif if (with_gallium_va or with_gallium_vdpau or with_gallium_omx != 'disabled' or with_gallium_xvmc) diff --git a/src/gallium/targets/vdpau/Makefile.am b/src/gallium/targets/vdpau/Makefile.am index cd05a024451..2742c7acd44 100644 --- a/src/gallium/targets/vdpau/Makefile.am +++ b/src/gallium/targets/vdpau/Makefile.am @@ -57,8 +57,6 @@ include $(top_srcdir)/src/gallium/drivers/r300/Automake.inc include $(top_srcdir)/src/gallium/drivers/r600/Automake.inc include $(top_srcdir)/src/gallium/drivers/radeonsi/Automake.inc -include $(top_srcdir)/src/gallium/drivers/tegra/Automake.inc - if HAVE_GALLIUM_STATIC_TARGETS libvdpau_gallium_la_SOURCES += target.c diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index 68f0562a644..f108058052d 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -1310,6 +1310,12 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, if (bo) { p_atomic_inc(&bo->base.reference.count); simple_mtx_unlock(&ws->bo_export_table_lock); + + /* Release the buffer handle, because we don't 
need it anymore. + * This function is returning an existing buffer, which has its own + * handle. + */ + amdgpu_bo_free(result.buf_handle); return &bo->base; } diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index f32bbd9d086..b20d702670d 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -280,6 +280,12 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config, if (ws) { pipe_reference(NULL, &ws->reference); simple_mtx_unlock(&dev_tab_mutex); + + /* Release the device handle, because we don't need it anymore. + * This function is returning an existing winsys instance, which + * has its own device handle. + */ + amdgpu_device_deinitialize(dev); return &ws->base; } diff --git a/src/gallium/winsys/i915/drm/Android.mk b/src/gallium/winsys/i915/drm/Android.mk index bab3e85c5dd..bc8cd0ebe2e 100644 --- a/src/gallium/winsys/i915/drm/Android.mk +++ b/src/gallium/winsys/i915/drm/Android.mk @@ -30,7 +30,7 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := $(C_SOURCES) -LOCAL_SHARED_LIBRARIES := libdrm_intel +LOCAL_SHARED_LIBRARIES := libdrm_intel_pri LOCAL_MODULE := libmesa_winsys_i915 include $(GALLIUM_COMMON_MK) diff --git a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c index 739e4ea131f..0ec8c1abe11 100644 --- a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c +++ b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c @@ -1198,4 +1198,6 @@ void vmw_ioctl_cleanup(struct vmw_winsys_screen *vws) { VMW_FUNC; + + free(vws->ioctl.cap_3d); } diff --git a/src/gallium/winsys/sw/dri/dri_sw_winsys.c b/src/gallium/winsys/sw/dri/dri_sw_winsys.c index d519bcfedd3..cd44b036c6f 100644 --- a/src/gallium/winsys/sw/dri/dri_sw_winsys.c +++ b/src/gallium/winsys/sw/dri/dri_sw_winsys.c @@ -62,7 +62,7 @@ struct dri_sw_winsys { struct sw_winsys base; - struct drisw_loader_funcs *lf; + const struct drisw_loader_funcs *lf; }; 
static inline struct dri_sw_displaytarget * @@ -282,7 +282,7 @@ dri_destroy_sw_winsys(struct sw_winsys *winsys) } struct sw_winsys * -dri_create_sw_winsys(struct drisw_loader_funcs *lf) +dri_create_sw_winsys(const struct drisw_loader_funcs *lf) { struct dri_sw_winsys *ws; diff --git a/src/gallium/winsys/sw/dri/dri_sw_winsys.h b/src/gallium/winsys/sw/dri/dri_sw_winsys.h index 329ac06a05b..47e3777d4cd 100644 --- a/src/gallium/winsys/sw/dri/dri_sw_winsys.h +++ b/src/gallium/winsys/sw/dri/dri_sw_winsys.h @@ -33,6 +33,6 @@ struct sw_winsys; -struct sw_winsys *dri_create_sw_winsys(struct drisw_loader_funcs *lf); +struct sw_winsys *dri_create_sw_winsys(const struct drisw_loader_funcs *lf); #endif diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c index a589f694bb0..176d04388f2 100644 --- a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c +++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c @@ -559,7 +559,7 @@ virgl_cs_create_fence(struct virgl_winsys *vws) res = virgl_vtest_winsys_resource_cache_create(vws, PIPE_BUFFER, PIPE_FORMAT_R8_UNORM, - PIPE_BIND_CUSTOM, + VIRGL_BIND_CUSTOM, 8, 1, 1, 0, 0, 0, 8); return (struct pipe_fence_handle *)res; @@ -639,7 +639,7 @@ static void virgl_vtest_flush_frontbuffer(struct virgl_winsys *vws, * get the data. */ virgl_vtest_recv_transfer_get_data(vtws, map + offset, size, valid_stride, &box, res->format, - util_format_get_stride(res->format, res->width)); + vtws->protocol_version == 0 ? 
valid_stride : util_format_get_stride(res->format, res->width)); vtws->sws->displaytarget_unmap(vtws->sws, res->dt); diff --git a/src/gbm/meson.build b/src/gbm/meson.build index 2e9d380c0b4..719f9c1a9b8 100644 --- a/src/gbm/meson.build +++ b/src/gbm/meson.build @@ -32,7 +32,6 @@ args_gbm = [] deps_gbm = [] incs_gbm = [ include_directories('main'), inc_include, inc_src, inc_loader, - inc_wayland_drm, ] if with_dri2 diff --git a/src/glx/Makefile.am b/src/glx/Makefile.am index 8f9d80c9f41..d06ae2972e9 100644 --- a/src/glx/Makefile.am +++ b/src/glx/Makefile.am @@ -24,10 +24,6 @@ SUBDIRS = EXTRA_DIST = SConscript meson.build -if HAVE_XF86VIDMODE -EXTRA_DEFINES_XF86VIDMODE = -DXF86VIDMODE -endif - AM_CFLAGS = \ -I$(top_srcdir)/include \ -I$(top_srcdir)/include/GL/internal \ @@ -38,7 +34,6 @@ AM_CFLAGS = \ -I$(top_builddir)/src/mapi/glapi \ -I$(top_srcdir)/src/mapi/glapi \ $(VISIBILITY_CFLAGS) \ - $(EXTRA_DEFINES_XF86VIDMODE) \ -D_REENTRANT \ -DDEFAULT_DRIVER_DIR=\"$(DRI_DRIVER_SEARCH_DIR)\" \ $(DEFINES) \ diff --git a/src/glx/SConscript b/src/glx/SConscript index 8ce17715814..051f55b7669 100644 --- a/src/glx/SConscript +++ b/src/glx/SConscript @@ -36,10 +36,7 @@ env.Prepend(LIBS = [ env.PkgUseModules('X11') env.PkgUseModules('XCB') env.PkgUseModules('DRM') - -if env['HAVE_XF86VIDMODE']: - env.Append(CPPDEFINES = ['XF86VIDMODE']) - env.PkgUseModules('XF86VIDMODE') +env.PkgUseModules('XF86VIDMODE') sources = [ 'clientattrib.c', diff --git a/src/glx/glxcmds.c b/src/glx/glxcmds.c index 4db0228eaba..424008fd670 100644 --- a/src/glx/glxcmds.c +++ b/src/glx/glxcmds.c @@ -46,9 +46,9 @@ #include "util/debug.h" #else #include -#ifdef XF86VIDMODE +#ifndef GLX_USE_WINDOWSGL #include -#endif +#endif /* GLX_USE_WINDOWSGL */ #endif #endif @@ -2071,7 +2071,7 @@ _X_HIDDEN GLboolean __glxGetMscRate(struct glx_screen *psc, int32_t * numerator, int32_t * denominator) { -#ifdef XF86VIDMODE +#if !defined(GLX_USE_WINDOWSGL) XF86VidModeModeLine mode_line; int dot_clock; int i; @@ -2118,7 
+2118,6 @@ __glxGetMscRate(struct glx_screen *psc, return True; } - else #endif return False; @@ -2145,7 +2144,7 @@ _X_HIDDEN GLboolean __glXGetMscRateOML(Display * dpy, GLXDrawable drawable, int32_t * numerator, int32_t * denominator) { -#if defined( GLX_DIRECT_RENDERING ) && defined( XF86VIDMODE ) +#if defined(GLX_DIRECT_RENDERING) && !defined(GLX_USE_APPLEGL) && !defined(GLX_USE_WINDOWSGL) __GLXDRIdrawable *draw = GetGLXDRIDrawable(dpy, drawable); if (draw == NULL) diff --git a/src/glx/meson.build b/src/glx/meson.build index dd8ba60ad80..a61f959e800 100644 --- a/src/glx/meson.build +++ b/src/glx/meson.build @@ -137,10 +137,6 @@ gl_lib_cargs = [ '-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_search_path), ] -if dep_xxf86vm.found() - gl_lib_cargs += '-DHAVE_XF86VIDMODE' -endif - libglx = static_library( 'glx', [files_libglx, glx_generated], @@ -154,26 +150,22 @@ libglx = static_library( extra_libs_libglx, ], dependencies : [dep_libdrm, dep_dri2proto, dep_glproto, dep_x11, dep_glvnd], - build_by_default : false, ) -if with_glx == 'dri' - libgl = shared_library( - gl_lib_name, - [], - include_directories : [inc_common, inc_glapi, inc_loader, inc_gl_internal], - link_with : [libglapi_static, libglapi], - link_whole : libglx, - link_args : [ld_args_bsymbolic, ld_args_gc_sections, extra_ld_args_libgl], - dependencies : [ - dep_libdrm, dep_dl, dep_m, dep_thread, dep_x11, dep_xcb_glx, dep_xcb, - dep_x11_xcb, dep_xcb_dri2, dep_xext, dep_xfixes, dep_xdamage, - extra_deps_libgl, - ], - version : gl_lib_version, - install : true, - ) -endif +libgl = shared_library( + gl_lib_name, + [], + link_with : [libglapi_static, libglapi], + link_whole : libglx, + link_args : [ld_args_bsymbolic, ld_args_gc_sections, extra_ld_args_libgl], + dependencies : [ + dep_libdrm, dep_dl, dep_m, dep_thread, dep_x11, dep_xcb_glx, dep_xcb, + dep_x11_xcb, dep_xcb_dri2, dep_xext, dep_xfixes, dep_xdamage, dep_xxf86vm, + extra_deps_libgl, + ], + version : gl_lib_version, + install : true, +) if with_tests 
subdir('tests') diff --git a/src/glx/tests/meson.build b/src/glx/tests/meson.build index fd9d4d433b2..e59b42d19a6 100644 --- a/src/glx/tests/meson.build +++ b/src/glx/tests/meson.build @@ -33,6 +33,11 @@ if with_shared_glapi files_glx_test += files('query_renderer_implementation_unittest.cpp') endif + test( + 'dispatch-index-check', + files('dispatch-index-check'), + suite : ['glx'], + ) test( 'glx-test', executable( @@ -41,9 +46,9 @@ if with_shared_glapi link_with : [libglx, libglapi], include_directories : [ inc_src, inc_include, inc_mesa, inc_mapi, inc_gl_internal, - include_directories('..'), + inc_glx, ], - dependencies : [dep_libdrm, dep_thread, idep_gtest] - ) + dependencies : [dep_libdrm, dep_glproto, dep_thread, idep_gtest] + ), ) endif diff --git a/src/intel/Android.common.mk b/src/intel/Android.common.mk index 12cea6e5472..12bd8947e2e 100644 --- a/src/intel/Android.common.mk +++ b/src/intel/Android.common.mk @@ -38,7 +38,17 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/mapi \ $(MESA_TOP)/src/mesa -LOCAL_SHARED_LIBRARIES := libexpat libz +LOCAL_SHARED_LIBRARIES := libz + +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES := \ + libexpat +LOCAL_HEADER_LIBRARIES += liblog_headers +else +LOCAL_SHARED_LIBRARIES += \ + libexpat +endif LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_genxml diff --git a/src/intel/Android.compiler.mk b/src/intel/Android.compiler.mk index c2b01221dfc..41af7b20b9c 100644 --- a/src/intel/Android.compiler.mk +++ b/src/intel/Android.compiler.mk @@ -28,7 +28,7 @@ # --------------------------------------- include $(CLEAR_VARS) - +LOCAL_CFLAGS += -Wno-error LOCAL_MODULE := libmesa_intel_compiler LOCAL_MODULE_CLASS := STATIC_LIBRARIES diff --git a/src/intel/Android.dev.mk b/src/intel/Android.dev.mk index cd2ed66a176..3011ee232ed 100644 --- a/src/intel/Android.dev.mk +++ b/src/intel/Android.dev.mk @@ -33,5 +33,8 @@ LOCAL_C_INCLUDES 
:= $(MESA_TOP)/include/drm-uapi LOCAL_SRC_FILES := $(DEV_FILES) +LOCAL_CFLAGS := \ + -Wno-gnu-variable-sized-type-not-at-end + include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/src/intel/Android.vulkan.mk b/src/intel/Android.vulkan.mk index db81fada277..f4fda3f86ce 100644 --- a/src/intel/Android.vulkan.mk +++ b/src/intel/Android.vulkan.mk @@ -38,7 +38,10 @@ VULKAN_COMMON_INCLUDES := \ $(MESA_TOP)/src/intel \ $(MESA_TOP)/include/drm-uapi \ $(MESA_TOP)/src/intel/vulkan \ - frameworks/native/vulkan/include + frameworks/native/vulkan/include \ + frameworks/native/libs/nativebase/include \ + frameworks/native/libs/nativewindow/include \ + frameworks/native/libs/arect/include # libmesa_anv_entrypoints with header and dummy.c # @@ -72,7 +75,9 @@ $(intermediates)/vulkan/anv_entrypoints.h: $(intermediates)/vulkan/dummy.c LOCAL_EXPORT_C_INCLUDE_DIRS := \ $(intermediates) -LOCAL_SHARED_LIBRARIES := libdrm +LOCAL_SHARED_LIBRARIES := libdrm_pri + +LOCAL_HEADER_LIBRARIES += libcutils_headers libhardware_headers include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -84,7 +89,7 @@ ANV_INCLUDES := \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_vulkan_common,,)/vulkan \ $(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_vulkan_util,,)/util -ANV_SHARED_LIBRARIES := libdrm +ANV_SHARED_LIBRARIES := libdrm_pri ifeq ($(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7),) ANV_SHARED_LIBRARIES += libnativewindow @@ -107,6 +112,8 @@ LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += libcutils_headers libhardware_headers + include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -120,13 +127,15 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(VULKAN_GEN75_FILES) LOCAL_CFLAGS := -DGEN_VERSIONx10=75 - +LOCAL_HEADER_LIBRARIES += libcutils_headers libsystem_headers LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := 
libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += libcutils_headers libhardware_headers + include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -140,13 +149,15 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(VULKAN_GEN8_FILES) LOCAL_CFLAGS := -DGEN_VERSIONx10=80 - +LOCAL_HEADER_LIBRARIES += libcutils_headers libsystem_headers LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += libcutils_headers libhardware_headers + include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -160,13 +171,15 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(VULKAN_GEN9_FILES) LOCAL_CFLAGS := -DGEN_VERSIONx10=90 - +LOCAL_HEADER_LIBRARIES += libcutils_headers libsystem_headers LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += libcutils_headers libhardware_headers + include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -180,13 +193,15 @@ LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_SRC_FILES := $(VULKAN_GEN10_FILES) LOCAL_CFLAGS := -DGEN_VERSIONx10=100 - +LOCAL_HEADER_LIBRARIES += libcutils_headers libsystem_headers LOCAL_C_INCLUDES := $(ANV_INCLUDES) LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += libcutils_headers libhardware_headers + include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -207,6 +222,8 @@ LOCAL_WHOLE_STATIC_LIBRARIES := libmesa_anv_entrypoints libmesa_genxml LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += libcutils_headers libhardware_headers + include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -217,7 +234,7 @@ include $(BUILD_STATIC_LIBRARY) include 
$(CLEAR_VARS) LOCAL_MODULE := libmesa_vulkan_common LOCAL_MODULE_CLASS := STATIC_LIBRARIES - +LOCAL_CFLAGS += -Wno-error intermediates := $(call local-generated-sources-dir) LOCAL_SRC_FILES := $(VULKAN_FILES) @@ -261,6 +278,8 @@ $(intermediates)/vulkan/anv_extensions.h: LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) +LOCAL_HEADER_LIBRARIES += libcutils_headers libhardware_headers + include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) @@ -310,5 +329,7 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ LOCAL_SHARED_LIBRARIES := $(ANV_SHARED_LIBRARIES) libz libsync liblog +LOCAL_HEADER_LIBRARIES += libcutils_headers libhardware_headers + include $(MESA_COMMON_MK) include $(BUILD_SHARED_LIBRARY) diff --git a/src/intel/Makefile.compiler.am b/src/intel/Makefile.compiler.am index cd7e6882fb9..7c33e35816b 100644 --- a/src/intel/Makefile.compiler.am +++ b/src/intel/Makefile.compiler.am @@ -64,6 +64,7 @@ COMPILER_TESTS = \ compiler/test_vf_float_conversions \ compiler/test_vec4_cmod_propagation \ compiler/test_vec4_copy_propagation \ + compiler/test_vec4_dead_code_eliminate \ compiler/test_vec4_register_coalesce TESTS += $(COMPILER_TESTS) @@ -97,6 +98,10 @@ compiler_test_vec4_cmod_propagation_SOURCES = \ compiler/test_vec4_cmod_propagation.cpp compiler_test_vec4_cmod_propagation_LDADD = $(TEST_LIBS) +compiler_test_vec4_dead_code_eliminate_SOURCES = \ + compiler/test_vec4_dead_code_eliminate.cpp +compiler_test_vec4_dead_code_eliminate_LDADD = $(TEST_LIBS) + # Strictly speaking this is neither a C++ test nor using gtest - we can address # address that at a later point. Until then, this allows us a to simplify things. 
compiler_test_eu_compact_SOURCES = \ diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h index 065980616ec..01bea99d3d8 100644 --- a/src/intel/blorp/blorp_genX_exec.h +++ b/src/intel/blorp/blorp_genX_exec.h @@ -1326,7 +1326,7 @@ blorp_emit_memcpy(struct blorp_batch *batch, static void blorp_emit_surface_state(struct blorp_batch *batch, const struct brw_blorp_surface_info *surface, - enum isl_aux_op op, + enum isl_aux_op aux_op, void *state, uint32_t state_offset, const bool color_write_disables[4], bool is_render_target) @@ -1382,7 +1382,7 @@ blorp_emit_surface_state(struct blorp_batch *batch, surface->aux_addr, *aux_addr); } - if (surface->clear_color_addr.buffer) { + if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) { #if GEN_GEN >= 10 assert((surface->clear_color_addr.offset & 0x3f) == 0); uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset; @@ -1390,7 +1390,10 @@ blorp_emit_surface_state(struct blorp_batch *batch, isl_dev->ss.clear_color_state_offset, surface->clear_color_addr, *clear_addr); #elif GEN_GEN >= 7 - if (op == ISL_AUX_OP_FULL_RESOLVE || op == ISL_AUX_OP_PARTIAL_RESOLVE) { + /* Fast clears just whack the AUX surface and don't actually use the + * clear color for anything. We can avoid the MI memcpy on that case. 
+ */ + if (aux_op != ISL_AUX_OP_FAST_CLEAR) { struct blorp_address dst_addr = blorp_get_surface_base_address(batch); dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset; blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr, diff --git a/src/intel/common/gen_batch_decoder.c b/src/intel/common/gen_batch_decoder.c index 63f04627572..36ee7706e40 100644 --- a/src/intel/common/gen_batch_decoder.c +++ b/src/intel/common/gen_batch_decoder.c @@ -214,7 +214,7 @@ handle_state_base_address(struct gen_batch_decode_ctx *ctx, const uint32_t *p) surface_modify = iter.raw_value; } else if (strcmp(iter.name, "Dynamic State Base Address Modify Enable") == 0) { dynamic_modify = iter.raw_value; - } else if (strcmp(iter.name, "Insntruction Base Address Modify Enable") == 0) { + } else if (strcmp(iter.name, "Instruction Base Address Modify Enable") == 0) { instruction_modify = iter.raw_value; } } diff --git a/src/intel/common/gen_debug.c b/src/intel/common/gen_debug.c index a978f2f5818..8990d208207 100644 --- a/src/intel/common/gen_debug.c +++ b/src/intel/common/gen_debug.c @@ -85,6 +85,7 @@ static const struct debug_control debug_control[] = { { "nohiz", DEBUG_NO_HIZ }, { "color", DEBUG_COLOR }, { "reemit", DEBUG_REEMIT }, + { "heur32", DEBUG_HEUR32 }, { NULL, 0 } }; diff --git a/src/intel/common/gen_debug.h b/src/intel/common/gen_debug.h index 72d7ca20a39..c2ca2e2ebd6 100644 --- a/src/intel/common/gen_debug.h +++ b/src/intel/common/gen_debug.h @@ -83,6 +83,7 @@ extern uint64_t INTEL_DEBUG; #define DEBUG_NO_HIZ (1ull << 39) #define DEBUG_COLOR (1ull << 40) #define DEBUG_REEMIT (1ull << 41) +#define DEBUG_HEUR32 (1ull << 42) /* These flags are not compatible with the disk shader cache */ #define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME @@ -90,7 +91,7 @@ extern uint64_t INTEL_DEBUG; /* These flags may affect program generation */ #define DEBUG_DISK_CACHE_MASK \ (DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 | DEBUG_SPILL_FS | \ - DEBUG_SPILL_VEC4 | 
DEBUG_NO_COMPACTION | DEBUG_DO32) + DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_HEUR32) #ifdef HAVE_ANDROID_PLATFORM #define LOG_TAG "INTEL-MESA" diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index d8c9499065f..785acdb3343 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -38,6 +38,15 @@ struct ra_regs; struct nir_shader; struct brw_program; +struct brw_simd32_heuristics_control { + bool grouped_sends_check; + int max_grouped_sends; + bool inst_count_check; + float inst_count_ratio; + bool mrt_check; + int max_mrts; +}; + struct brw_compiler { const struct gen_device_info *devinfo; @@ -118,6 +127,8 @@ struct brw_compiler { * whether nir_opt_large_constants will be run. */ bool supports_shader_constants; + + struct brw_simd32_heuristics_control simd32_heuristics_control; }; /** diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c index 4630b83b1a0..2618e9c2e93 100644 --- a/src/intel/compiler/brw_eu_emit.c +++ b/src/intel/compiler/brw_eu_emit.c @@ -925,8 +925,8 @@ brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0) const struct gen_device_info *devinfo = p->devinfo; /* When converting F->DF on IVB/BYT, every odd source channel is ignored. - * To avoid the problems that causes, we use a <1,2,0> source region to read - * each element twice. + * To avoid the problems that causes, we use an source region to + * read each element twice. 
*/ if (devinfo->gen == 7 && !devinfo->is_haswell && brw_get_default_access_mode(p) == BRW_ALIGN_1 && @@ -935,11 +935,8 @@ brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0) src0.type == BRW_REGISTER_TYPE_D || src0.type == BRW_REGISTER_TYPE_UD) && !has_scalar_region(src0)) { - assert(src0.vstride == BRW_VERTICAL_STRIDE_4 && - src0.width == BRW_WIDTH_4 && - src0.hstride == BRW_HORIZONTAL_STRIDE_1); - - src0.vstride = BRW_VERTICAL_STRIDE_1; + assert(src0.vstride == src0.width + src0.hstride); + src0.vstride = src0.hstride; src0.width = BRW_WIDTH_2; src0.hstride = BRW_HORIZONTAL_STRIDE_0; } diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 3e083723471..6826226e209 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -315,6 +315,24 @@ fs_inst::has_source_and_destination_hazard() const * may stomp all over it. */ return true; + case SHADER_OPCODE_QUAD_SWIZZLE: + switch (src[1].ud) { + case BRW_SWIZZLE_XXXX: + case BRW_SWIZZLE_YYYY: + case BRW_SWIZZLE_ZZZZ: + case BRW_SWIZZLE_WWWW: + case BRW_SWIZZLE_XXZZ: + case BRW_SWIZZLE_YYWW: + case BRW_SWIZZLE_XYXY: + case BRW_SWIZZLE_ZWZW: + /* These can be implemented as a single Align1 region on all + * platforms, so there's never a hazard between source and + * destination. C.f. fs_generator::generate_quad_swizzle(). 
+ */ + return false; + default: + return !is_uniform(src[0]); + } default: /* The SIMD16 compressed instruction * @@ -3853,6 +3871,9 @@ fs_visitor::lower_integer_multiplication() high.offset = inst->dst.offset % REG_SIZE; if (devinfo->gen >= 7) { + if (inst->src[1].abs) + lower_src_modifiers(this, block, inst, 1); + if (inst->src[1].file == IMM) { ibld.MUL(low, inst->src[0], brw_imm_uw(inst->src[1].ud & 0xffff)); @@ -3865,6 +3886,9 @@ fs_visitor::lower_integer_multiplication() subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1)); } } else { + if (inst->src[0].abs) + lower_src_modifiers(this, block, inst, 0); + ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0), inst->src[1]); ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1), @@ -3882,6 +3906,18 @@ fs_visitor::lower_integer_multiplication() } } else if (inst->opcode == SHADER_OPCODE_MULH) { + /* According to the BDW+ BSpec page for the "Multiply Accumulate + * High" instruction: + * + * "An added preliminary mov is required for source modification on + * src1: + * mov (8) r3.0<1>:d -r3<8;8,1>:d + * mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw + * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d" + */ + if (devinfo->gen >= 8 && (inst->src[1].negate || inst->src[1].abs)) + lower_src_modifiers(this, block, inst, 1); + /* Should have been lowered to 8-wide. */ assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst)); const fs_reg acc = retype(brw_acc_reg(inst->exec_size), @@ -3897,8 +3933,6 @@ fs_visitor::lower_integer_multiplication() * On Gen8, the multiply instruction does a full 32x32-bit * multiply, but in order to do a 64-bit multiply we can simulate * the previous behavior and then use a MACH instruction. - * - * FINISHME: Don't use source modifiers on src1. 
*/ assert(mul->src[1].type == BRW_REGISTER_TYPE_D || mul->src[1].type == BRW_REGISTER_TYPE_UD); @@ -5534,9 +5568,14 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: return MIN2(8, inst->exec_size); - case SHADER_OPCODE_QUAD_SWIZZLE: - return 8; - + case SHADER_OPCODE_QUAD_SWIZZLE: { + const unsigned swiz = inst->src[1].ud; + return (is_uniform(inst->src[0]) ? + get_fpu_lowered_simd_width(devinfo, inst) : + devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? 8 : + swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 : + get_fpu_lowered_simd_width(devinfo, inst)); + } case SHADER_OPCODE_MOV_INDIRECT: { /* From IVB and HSW PRMs: * @@ -5601,8 +5640,10 @@ needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i) static fs_reg emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i) { + assert(lbld.group() >= inst->group); + /* Specified channel group from the source region. */ - const fs_reg src = horiz_offset(inst->src[i], lbld.group()); + const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group); if (needs_src_copy(lbld, inst, i)) { /* Builder of the right width to perform the copy avoiding uninitialized @@ -5691,9 +5732,10 @@ emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after, { assert(lbld_before.dispatch_width() == lbld_after.dispatch_width()); assert(lbld_before.group() == lbld_after.group()); + assert(lbld_after.group() >= inst->group); /* Specified channel group from the destination region. 
*/ - const fs_reg dst = horiz_offset(inst->dst, lbld_after.group()); + const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group); const unsigned dst_size = inst->size_written / inst->dst.component_size(inst->exec_size); @@ -7127,6 +7169,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, char **error_str) { const struct gen_device_info *devinfo = compiler->devinfo; + bool simd16_failed = false; + bool simd16_spilled = false; nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); @@ -7194,10 +7238,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, shader_time_index16); v16.import_uniforms(&v8); if (!v16.run_fs(allow_spilling, use_rep_send)) { + simd16_failed = true; compiler->shader_perf_log(log_data, "SIMD16 shader failed to compile: %s", v16.fail_msg); } else { + simd16_spilled = v16.spilled_any_registers; simd16_cfg = v16.cfg; prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used); @@ -7205,9 +7251,17 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, } /* Currently, the compiler only supports SIMD32 on SNB+ */ + const brw_simd32_heuristics_control *ctrl = &compiler->simd32_heuristics_control; + uint64_t mrts = shader->info.outputs_written << FRAG_RESULT_DATA0; + if (v8.max_dispatch_width >= 32 && !use_rep_send && compiler->devinfo->gen >= 6 && - unlikely(INTEL_DEBUG & DEBUG_DO32)) { + (unlikely(INTEL_DEBUG & DEBUG_DO32) || + (unlikely(INTEL_DEBUG & DEBUG_HEUR32) && + !simd16_failed && !simd16_spilled && + (!ctrl->mrt_check || + (ctrl->mrt_check && + u_count_bits64(&mrts) <= ctrl->max_mrts))))) { /* Try a SIMD32 compile */ fs_visitor v32(compiler, log_data, mem_ctx, key, &prog_data->base, prog, shader, 32, @@ -7218,9 +7272,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, "SIMD32 shader failed to compile: %s", 
v32.fail_msg); } else { - simd32_cfg = v32.cfg; - prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs; - prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used); + if (likely(!(INTEL_DEBUG & DEBUG_HEUR32)) || + v32.run_heuristic(ctrl)) { + simd32_cfg = v32.cfg; + prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs; + prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used); + } } } @@ -7299,13 +7356,49 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, } if (simd32_cfg) { - prog_data->dispatch_32 = true; - prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32); + uint32_t offset = g.generate_code(simd32_cfg, 32); + + if (unlikely(INTEL_DEBUG & DEBUG_DO32) || + (unlikely(INTEL_DEBUG & DEBUG_HEUR32) && + (!simd16_cfg || + (simd16_cfg && + (!ctrl->inst_count_check || + (ctrl->inst_count_check && + (float)g.get_inst_count(32) / (float)g.get_inst_count(16) <= ctrl->inst_count_ratio)))))) { + prog_data->dispatch_32 = true; + prog_data->prog_offset_32 = offset; + } } return g.get_assembly(); } +bool +fs_visitor::run_heuristic(const struct brw_simd32_heuristics_control *ctrl) { + int grouped_sends = 0; + int max_grouped_sends = 0; + bool pass = true; + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->opcode >= SHADER_OPCODE_TEX && inst->opcode <= SHADER_OPCODE_SAMPLEINFO_LOGICAL) { + ++grouped_sends; + } else if (grouped_sends > 0) { + if (grouped_sends > max_grouped_sends) { + max_grouped_sends = grouped_sends; + } + grouped_sends = 0; + } + } + + if (ctrl->grouped_sends_check) { + if (max_grouped_sends > ctrl->max_grouped_sends) { + pass = false; + } + } + + return pass; +} + fs_reg * fs_visitor::emit_cs_work_group_id_setup() { diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index aba19d5ab2c..26868c10107 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -286,6 +286,8 @@ class fs_visitor : public backend_shader void dump_instruction(backend_instruction 
*inst); void dump_instruction(backend_instruction *inst, FILE *file); + bool run_heuristic(const struct brw_simd32_heuristics_control *ctrl); + const void *const key; const struct brw_sampler_prog_key_data *key_tex; @@ -397,6 +399,7 @@ class fs_generator void enable_debug(const char *shader_name); int generate_code(const cfg_t *cfg, int dispatch_width); + int get_inst_count(int dispatch_width); const unsigned *get_assembly(); private: @@ -478,6 +481,10 @@ class fs_generator struct brw_reg src, struct brw_reg idx); + void generate_quad_swizzle(const fs_inst *inst, + struct brw_reg dst, struct brw_reg src, + unsigned swiz); + bool patch_discard_jumps_to_fb_writes(); const struct brw_compiler *compiler; @@ -489,6 +496,7 @@ class fs_generator struct brw_stage_prog_data * const prog_data; unsigned dispatch_width; /**< 8, 16 or 32 */ + int inst_count[3]; /* for 8, 16 and 32 */ exec_list discard_halt_patches; unsigned promoted_constants; @@ -529,6 +537,25 @@ namespace brw { return fs_reg(retype(brw_vec8_grf(regs[0], 0), type)); } } + + /** + * Remove any modifiers from the \p i-th source region of the instruction, + * including negate, abs and any implicit type conversion to the execution + * type. Instead any source modifiers will be implemented as a separate + * MOV instruction prior to the original instruction. 
+ */ + inline bool + lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i) + { + assert(inst->components_read(i) == 1); + const fs_builder ibld(v, block, inst); + const fs_reg tmp = ibld.vgrf(get_exec_type(inst)); + + ibld.MOV(tmp, inst->src[i]); + inst->src[i] = tmp; + + return true; + } } void shuffle_from_32bit_read(const brw::fs_builder &bld, diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index ab34b63748e..a76e0f3a6b5 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -315,6 +315,16 @@ can_take_stride(fs_inst *inst, unsigned arg, unsigned stride, if (stride > 4) return false; + /* Bail if the channels of the source need to be aligned to the byte offset + * of the corresponding channel of the destination, and the provided stride + * would break this restriction. + */ + if (has_dst_aligned_region_restriction(devinfo, inst) && + !(type_sz(inst->src[arg].type) * stride == + type_sz(inst->dst.type) * inst->dst.stride || + stride == 0)) + return false; + /* 3-source instructions can only be Align16, which restricts what strides * they can take. They can only take a stride of 1 (the usual case), or 0 * with a special "repctrl" bit. But the repctrl bit doesn't work for diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 08dd83dded7..4c452e1c38a 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst, } } +void +fs_generator::generate_quad_swizzle(const fs_inst *inst, + struct brw_reg dst, struct brw_reg src, + unsigned swiz) +{ + /* Requires a quad. 
*/ + assert(inst->exec_size >= 4); + + if (src.file == BRW_IMMEDIATE_VALUE || + has_scalar_region(src)) { + /* The value is uniform across all channels */ + brw_MOV(p, dst, src); + + } else if (devinfo->gen < 11 && type_sz(src.type) == 4) { + /* This only works on 8-wide 32-bit values */ + assert(inst->exec_size == 8); + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); + assert(src.vstride == src.width + 1); + brw_set_default_access_mode(p, BRW_ALIGN_16); + struct brw_reg swiz_src = stride(src, 4, 4, 1); + swiz_src.swizzle = swiz; + brw_MOV(p, dst, swiz_src); + + } else { + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); + assert(src.vstride == src.width + 1); + const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0)); + + switch (swiz) { + case BRW_SWIZZLE_XXXX: + case BRW_SWIZZLE_YYYY: + case BRW_SWIZZLE_ZZZZ: + case BRW_SWIZZLE_WWWW: + brw_MOV(p, dst, stride(src_0, 4, 4, 0)); + break; + + case BRW_SWIZZLE_XXZZ: + case BRW_SWIZZLE_YYWW: + brw_MOV(p, dst, stride(src_0, 2, 2, 0)); + break; + + case BRW_SWIZZLE_XYXY: + case BRW_SWIZZLE_ZWZW: + assert(inst->exec_size == 4); + brw_MOV(p, dst, stride(src_0, 0, 2, 1)); + break; + + default: + assert(inst->force_writemask_all); + brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1); + + for (unsigned c = 0; c < 4; c++) { + brw_inst *insn = brw_MOV( + p, stride(suboffset(dst, c), + 4 * inst->dst.stride, 1, 4 * inst->dst.stride), + stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0)); + + brw_inst_set_no_dd_clear(devinfo, insn, c < 3); + brw_inst_set_no_dd_check(devinfo, insn, c > 0); + } + + break; + } + } +} + void fs_generator::generate_urb_read(fs_inst *inst, struct brw_reg dst, @@ -2303,23 +2369,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) break; case SHADER_OPCODE_QUAD_SWIZZLE: - /* This only works on 8-wide 32-bit values */ - assert(inst->exec_size == 8); - assert(type_sz(src[0].type) == 4); - assert(inst->force_writemask_all); assert(src[1].file == BRW_IMMEDIATE_VALUE); 
assert(src[1].type == BRW_REGISTER_TYPE_UD); - - if (src[0].file == BRW_IMMEDIATE_VALUE || - (src[0].vstride == 0 && src[0].hstride == 0)) { - /* The value is uniform across all channels */ - brw_MOV(p, dst, src[0]); - } else { - brw_set_default_access_mode(p, BRW_ALIGN_16); - struct brw_reg swiz_src = stride(src[0], 4, 4, 1); - swiz_src.swizzle = inst->src[1].ud; - brw_MOV(p, dst, swiz_src); - } + generate_quad_swizzle(inst, dst, src[0], src[1].ud); break; case SHADER_OPCODE_CLUSTER_BROADCAST: { @@ -2486,6 +2538,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) fill_count, promoted_constants, before_size, after_size); + inst_count[ffs(dispatch_width) - 4] = before_size / 16; + return start_offset; } @@ -2494,3 +2548,13 @@ fs_generator::get_assembly() { return brw_get_program(p, &prog_data->program_size); } + +int +fs_generator::get_inst_count(int dispatch_width) +{ + if (dispatch_width == 8 || dispatch_width == 16 || dispatch_width == 32) { + return inst_count[ffs(dispatch_width) - 4]; + } else { + return 0; + } +} \ No newline at end of file diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index c845d87d59b..c33394d10d4 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4804,7 +4804,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } - case nir_intrinsic_begin_fragment_shader_ordering: case nir_intrinsic_begin_invocation_interlock: { const fs_builder ubld = bld.group(8, 0); const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h index 07e7224e0f8..95b069a2e02 100644 --- a/src/intel/compiler/brw_ir_fs.h +++ b/src/intel/compiler/brw_ir_fs.h @@ -486,4 +486,32 @@ get_exec_type_size(const fs_inst *inst) return type_sz(get_exec_type(inst)); } +/** + * Return whether the following regioning restriction applies to the specified + * instruction. 
From the Cherryview PRM Vol 7. "Register Region + * Restrictions": + * + * "When source or destination datatype is 64b or operation is integer DWord + * multiply, regioning in Align1 must follow these rules: + * + * 1. Source and Destination horizontal stride must be aligned to the same qword. + * 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride. + * 3. Source and Destination offset must be the same, except the case of + * scalar source." + */ +static inline bool +has_dst_aligned_region_restriction(const gen_device_info *devinfo, + const fs_inst *inst) +{ + const brw_reg_type exec_type = get_exec_type(inst); + const bool is_int_multiply = !brw_reg_type_is_floating_point(exec_type) && + (inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD); + + if (type_sz(inst->dst.type) > 4 || type_sz(exec_type) > 4 || + (type_sz(exec_type) == 4 && is_int_multiply)) + return devinfo->is_cherryview || gen_device_info_is_9lp(devinfo); + else + return false; +} + #endif diff --git a/src/intel/compiler/brw_nir_opt_peephole_ffma.c b/src/intel/compiler/brw_nir_opt_peephole_ffma.c index cc225e1847b..7271bdbca43 100644 --- a/src/intel/compiler/brw_nir_opt_peephole_ffma.c +++ b/src/intel/compiler/brw_nir_opt_peephole_ffma.c @@ -68,7 +68,7 @@ are_all_uses_fadd(nir_ssa_def *def) } static nir_alu_instr * -get_mul_for_src(nir_alu_src *src, int num_components, +get_mul_for_src(nir_alu_src *src, unsigned num_components, uint8_t swizzle[4], bool *negate, bool *abs) { uint8_t swizzle_tmp[4]; @@ -93,16 +93,19 @@ get_mul_for_src(nir_alu_src *src, int num_components, switch (alu->op) { case nir_op_imov: case nir_op_fmov: - alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs); + alu = get_mul_for_src(&alu->src[0], alu->dest.dest.ssa.num_components, + swizzle, negate, abs); break; case nir_op_fneg: - alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs); + alu = get_mul_for_src(&alu->src[0], alu->dest.dest.ssa.num_components, + 
swizzle, negate, abs); *negate = !*negate; break; case nir_op_fabs: - alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs); + alu = get_mul_for_src(&alu->src[0], alu->dest.dest.ssa.num_components, + swizzle, negate, abs); *negate = false; *abs = true; break; diff --git a/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp b/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp index c09a3d7ebe9..99e4c9cacaf 100644 --- a/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp +++ b/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp @@ -81,17 +81,46 @@ vec4_visitor::dead_code_eliminate() result_live[3] = result; } - for (int c = 0; c < 4; c++) { - if (!result_live[c] && inst->dst.writemask & (1 << c)) { - inst->dst.writemask &= ~(1 << c); + if (inst->writes_flag()) { + /* Independently calculate the usage of the flag components and + * the destination value components. + */ + uint8_t flag_mask = inst->dst.writemask; + uint8_t dest_mask = inst->dst.writemask; + + for (int c = 0; c < 4; c++) { + if (!result_live[c] && dest_mask & (1 << c)) + dest_mask &= ~(1 << c); + + if (!BITSET_TEST(flag_live, c)) + flag_mask &= ~(1 << c); + } + + if (inst->dst.writemask != (flag_mask | dest_mask)) { progress = true; + inst->dst.writemask = flag_mask | dest_mask; + } - if (inst->dst.writemask == 0) { - if (inst->writes_accumulator || inst->writes_flag()) { - inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type)); - } else { - inst->opcode = BRW_OPCODE_NOP; - break; + /* If none of the destination components are read, replace the + * destination register with the NULL register. 
+ */ + if (dest_mask == 0) { + progress = true; + inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type)); + } + } else { + for (int c = 0; c < 4; c++) { + if (!result_live[c] && inst->dst.writemask & (1 << c)) { + inst->dst.writemask &= ~(1 << c); + progress = true; + + if (inst->dst.writemask == 0) { + if (inst->writes_accumulator) { + inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type)); + } else { + inst->opcode = BRW_OPCODE_NOP; + break; + } } } } diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index 3cdeb6214a8..f2854be779a 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -145,7 +145,8 @@ if with_tests foreach t : ['fs_cmod_propagation', 'fs_copy_propagation', 'fs_saturate_propagation', 'vf_float_conversions', 'vec4_register_coalesce', 'vec4_copy_propagation', - 'vec4_cmod_propagation', 'eu_compact', 'eu_validate'] + 'vec4_cmod_propagation', 'vec4_dead_code_eliminate', + 'eu_compact', 'eu_validate'] test( t, executable( diff --git a/src/intel/compiler/test_vec4_dead_code_eliminate.cpp b/src/intel/compiler/test_vec4_dead_code_eliminate.cpp new file mode 100644 index 00000000000..25739c2895a --- /dev/null +++ b/src/intel/compiler/test_vec4_dead_code_eliminate.cpp @@ -0,0 +1,163 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include "brw_vec4.h" +#include "program/program.h" + +using namespace brw; + +class dead_code_eliminate_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_compiler *compiler; + struct gen_device_info *devinfo; + struct gl_context *ctx; + struct gl_shader_program *shader_prog; + struct brw_vue_prog_data *prog_data; + vec4_visitor *v; +}; + +class dead_code_eliminate_vec4_visitor : public vec4_visitor +{ +public: + dead_code_eliminate_vec4_visitor(struct brw_compiler *compiler, + nir_shader *shader, + struct brw_vue_prog_data *prog_data) + : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL, + false /* no_spills */, -1) + { + prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; + } + +protected: + virtual dst_reg *make_reg_for_system_value(int /* location */) + { + unreachable("Not reached"); + } + + virtual void setup_payload() + { + unreachable("Not reached"); + } + + virtual void emit_prolog() + { + unreachable("Not reached"); + } + + virtual void emit_thread_end() + { + unreachable("Not reached"); + } + + virtual void emit_urb_write_header(int /* mrf */) + { + unreachable("Not reached"); + } + + virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */) + { + unreachable("Not reached"); + } +}; + + +void dead_code_eliminate_test::SetUp() +{ + ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); + compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler)); + devinfo = (struct gen_device_info *)calloc(1, 
sizeof(*devinfo)); + prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data)); + compiler->devinfo = devinfo; + + nir_shader *shader = + nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL, NULL); + + v = new dead_code_eliminate_vec4_visitor(compiler, shader, prog_data); + + devinfo->gen = 4; +} + +static void +dead_code_eliminate(vec4_visitor *v) +{ + bool print = false; + + if (print) { + fprintf(stderr, "instructions before:\n"); + v->dump_instructions(); + } + + v->calculate_cfg(); + v->dead_code_eliminate(); + + if (print) { + fprintf(stderr, "instructions after:\n"); + v->dump_instructions(); + } +} + +TEST_F(dead_code_eliminate_test, some_dead_channels_all_flags_used) +{ + const vec4_builder bld = vec4_builder(v).at_end(); + src_reg r1 = src_reg(v, glsl_type::vec4_type); + src_reg r2 = src_reg(v, glsl_type::vec4_type); + src_reg r3 = src_reg(v, glsl_type::vec4_type); + src_reg r4 = src_reg(v, glsl_type::vec4_type); + src_reg r5 = src_reg(v, glsl_type::vec4_type); + src_reg r6 = src_reg(v, glsl_type::vec4_type); + + /* Sequence like the following should not be modified by DCE. + * + * cmp.l.f0(8) g4<1>F g2<4,4,1>.wF g1<4,4,1>.xF + * mov(8) g5<1>.xF g4<4,4,1>.xF + * (+f0.x) sel(8) g6<1>UD g3<4>UD g6<4>UD + */ + vec4_instruction *test_cmp = + bld.CMP(dst_reg(r4), r2, r1, BRW_CONDITIONAL_L); + + test_cmp->src[0].swizzle = BRW_SWIZZLE_WWWW; + test_cmp->src[1].swizzle = BRW_SWIZZLE_XXXX; + + vec4_instruction *test_mov = + bld.MOV(dst_reg(r5), r4); + + test_mov->dst.writemask = WRITEMASK_X; + test_mov->src[0].swizzle = BRW_SWIZZLE_XXXX; + + vec4_instruction *test_sel = + bld.SEL(dst_reg(r6), r3, r6); + + set_predicate(BRW_PREDICATE_NORMAL, test_sel); + + /* The scratch write is here just to make r5 and r6 be live so that the + * whole program doesn't get eliminated by DCE. 
+ */ + v->emit(v->SCRATCH_WRITE(dst_reg(r4), r6, r5)); + + dead_code_eliminate(v); + + EXPECT_EQ(test_cmp->dst.writemask, WRITEMASK_XYZW); +} diff --git a/src/intel/genxml/gen10.xml b/src/intel/genxml/gen10.xml index abd5da297d6..acded759335 100644 --- a/src/intel/genxml/gen10.xml +++ b/src/intel/genxml/gen10.xml @@ -3553,6 +3553,14 @@ + + + + + + + + diff --git a/src/intel/genxml/gen11.xml b/src/intel/genxml/gen11.xml index c69d7dc89c2..d39bf09a5d7 100644 --- a/src/intel/genxml/gen11.xml +++ b/src/intel/genxml/gen11.xml @@ -3551,6 +3551,14 @@ + + + + + + + + diff --git a/src/intel/genxml/gen9.xml b/src/intel/genxml/gen9.xml index ca268254503..b7ce3095ab4 100644 --- a/src/intel/genxml/gen9.xml +++ b/src/intel/genxml/gen9.xml @@ -3491,6 +3491,14 @@ + + + + + + + + diff --git a/src/intel/tools/aubinator_viewer_decoder.cpp b/src/intel/tools/aubinator_viewer_decoder.cpp index 5311a8afc31..59cde530409 100644 --- a/src/intel/tools/aubinator_viewer_decoder.cpp +++ b/src/intel/tools/aubinator_viewer_decoder.cpp @@ -172,7 +172,7 @@ handle_state_base_address(struct aub_viewer_decode_ctx *ctx, surface_modify = iter.raw_value; } else if (strcmp(iter.name, "Dynamic State Base Address Modify Enable") == 0) { dynamic_modify = iter.raw_value; - } else if (strcmp(iter.name, "Insntruction Base Address Modify Enable") == 0) { + } else if (strcmp(iter.name, "Instruction Base Address Modify Enable") == 0) { instruction_modify = iter.raw_value; } } diff --git a/src/intel/vulkan/anv_android.c b/src/intel/vulkan/anv_android.c index 46c41d57861..4720095c6cd 100644 --- a/src/intel/vulkan/anv_android.c +++ b/src/intel/vulkan/anv_android.c @@ -128,7 +128,7 @@ anv_image_from_gralloc(VkDevice device_h, */ int dma_buf = gralloc_info->handle->data[0]; - uint64_t bo_flags = 0; + uint64_t bo_flags = ANV_BO_EXTERNAL; if (device->instance->physicalDevice.supports_48bit_addresses) bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; if (device->instance->physicalDevice.use_softpin) @@ -234,7 +234,7 @@ 
VkResult anv_GetSwapchainGrallocUsageANDROID( *grallocUsage = 0; intel_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage); - /* WARNING: Android Nougat's libvulkan.so hardcodes the VkImageUsageFlags + /* WARNING: Android's libvulkan.so hardcodes the VkImageUsageFlags * returned to applications via VkSurfaceCapabilitiesKHR::supportedUsageFlags. * The relevant code in libvulkan/swapchain.cpp contains this fun comment: * @@ -247,7 +247,7 @@ VkResult anv_GetSwapchainGrallocUsageANDROID( * dEQP-VK.wsi.android.swapchain.*.image_usage to fail. */ - const VkPhysicalDeviceImageFormatInfo2KHR image_format_info = { + VkPhysicalDeviceImageFormatInfo2KHR image_format_info = { .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR, .format = format, .type = VK_IMAGE_TYPE_2D, @@ -255,6 +255,17 @@ VkResult anv_GetSwapchainGrallocUsageANDROID( .usage = imageUsage, }; + /* Android P and earlier doesn't check if the physical device supports a + * given format and usage combination before calling this function. Omit the + * storage requirement to make the tests pass. + */ +#if ANDROID_API_LEVEL <= 28 + if (format == VK_FORMAT_R8G8B8A8_SRGB || + format == VK_FORMAT_R5G6B5_UNORM_PACK16) { + image_format_info.usage &= ~VK_IMAGE_USAGE_STORAGE_BIT; + } +#endif + VkImageFormatProperties2KHR image_format_props = { .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR, }; @@ -268,19 +279,13 @@ VkResult anv_GetSwapchainGrallocUsageANDROID( "inside %s", __func__); } - /* Reject STORAGE here to avoid complexity elsewhere. 
*/ - if (imageUsage & VK_IMAGE_USAGE_STORAGE_BIT) { - return vk_errorf(device->instance, device, VK_ERROR_FORMAT_NOT_SUPPORTED, - "VK_IMAGE_USAGE_STORAGE_BIT unsupported for gralloc " - "swapchain"); - } - if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)) *grallocUsage |= GRALLOC_USAGE_HW_RENDER; if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) *grallocUsage |= GRALLOC_USAGE_HW_TEXTURE; diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c index 478b8e7a3db..acc9a22c484 100644 --- a/src/intel/vulkan/anv_blorp.c +++ b/src/intel/vulkan/anv_blorp.c @@ -1658,6 +1658,7 @@ anv_image_hiz_clear(struct anv_cmd_buffer *cmd_buffer, void anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, + enum isl_format format, VkImageAspectFlagBits aspect, uint32_t base_layer, uint32_t layer_count, enum isl_aux_op mcs_op, union isl_color_value *clear_value, @@ -1713,12 +1714,12 @@ anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, switch (mcs_op) { case ISL_AUX_OP_FAST_CLEAR: - blorp_fast_clear(&batch, &surf, surf.surf->format, + blorp_fast_clear(&batch, &surf, format, 0, base_layer, layer_count, 0, 0, image->extent.width, image->extent.height); break; case ISL_AUX_OP_PARTIAL_RESOLVE: - blorp_mcs_partial_resolve(&batch, &surf, surf.surf->format, + blorp_mcs_partial_resolve(&batch, &surf, format, base_layer, layer_count); break; case ISL_AUX_OP_FULL_RESOLVE: @@ -1736,6 +1737,7 @@ anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, void anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, + enum isl_format format, VkImageAspectFlagBits aspect, uint32_t level, uint32_t base_layer, uint32_t layer_count, enum isl_aux_op ccs_op, union isl_color_value *clear_value, @@ -1799,14 +1801,14 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer, switch (ccs_op) { case 
ISL_AUX_OP_FAST_CLEAR: - blorp_fast_clear(&batch, &surf, surf.surf->format, + blorp_fast_clear(&batch, &surf, format, level, base_layer, layer_count, 0, 0, level_width, level_height); break; case ISL_AUX_OP_FULL_RESOLVE: case ISL_AUX_OP_PARTIAL_RESOLVE: blorp_ccs_resolve(&batch, &surf, level, base_layer, layer_count, - surf.surf->format, ccs_op); + format, ccs_op); break; case ISL_AUX_OP_AMBIGUATE: for (uint32_t a = 0; a < layer_count; a++) { diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index ee35e013329..924470b3005 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -636,7 +636,7 @@ VkResult anv_CreateInstance( } if (instance->app_info.api_version == 0) - anv_EnumerateInstanceVersion(&instance->app_info.api_version); + instance->app_info.api_version = VK_API_VERSION_1_0; instance->enabled_extensions = enabled_extensions; diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py index e9afe06bb13..dd7111b58e1 100644 --- a/src/intel/vulkan/anv_extensions.py +++ b/src/intel/vulkan/anv_extensions.py @@ -71,9 +71,9 @@ def __init__(self, version, enable): EXTENSIONS = [ Extension('VK_ANDROID_native_buffer', 5, 'ANDROID'), Extension('VK_KHR_16bit_storage', 1, 'device->info.gen >= 8'), - Extension('VK_KHR_8bit_storage', 1, 'device->info.gen >= 8'), + Extension('VK_KHR_8bit_storage', 1, 'device->info.gen >= 8 && !ANDROID'), Extension('VK_KHR_bind_memory2', 1, True), - Extension('VK_KHR_create_renderpass2', 1, True), + Extension('VK_KHR_create_renderpass2', 1, '!ANDROID'), Extension('VK_KHR_dedicated_allocation', 1, True), Extension('VK_KHR_descriptor_update_template', 1, True), Extension('VK_KHR_device_group', 1, True), @@ -121,7 +121,7 @@ def __init__(self, version, enable): Extension('VK_EXT_external_memory_dma_buf', 1, True), Extension('VK_EXT_global_priority', 1, 'device->has_context_priority'), - Extension('VK_EXT_pci_bus_info', 1, True), + Extension('VK_EXT_pci_bus_info', 1, 
False), Extension('VK_EXT_shader_viewport_index_layer', 1, True), Extension('VK_EXT_shader_stencil_export', 1, 'device->info.gen >= 9'), Extension('VK_EXT_vertex_attribute_divisor', 3, True), diff --git a/src/intel/vulkan/anv_intel.c b/src/intel/vulkan/anv_intel.c index ed1bc096c66..f6b8ded20a9 100644 --- a/src/intel/vulkan/anv_intel.c +++ b/src/intel/vulkan/anv_intel.c @@ -64,7 +64,8 @@ VkResult anv_CreateDmaBufImageINTEL( .samples = 1, /* FIXME: Need a way to use X tiling to allow scanout */ .tiling = VK_IMAGE_TILING_OPTIMAL, - .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT, .flags = 0, }}, pAllocator, &image_h); diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index 6868288e486..60c196e5c4f 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -171,6 +171,8 @@ lower_res_reindex_intrinsic(nir_intrinsic_instr *intrin, { nir_builder *b = &state->builder; + b->cursor = nir_before_instr(&intrin->instr); + /* For us, the resource indices are just indices into the binding table and * array elements are sequential. A resource_reindex just turns into an * add of the two indices. 
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index ad0f08253e7..f170366d030 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -446,6 +446,9 @@ anv_pipeline_hash_graphics(struct anv_pipeline *pipeline, if (layout) _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + const bool rba = pipeline->device->robust_buffer_access; + _mesa_sha1_update(&ctx, &rba, sizeof(rba)); + for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) { if (stages[s].entrypoint) anv_pipeline_hash_shader(&ctx, &stages[s]); @@ -466,6 +469,9 @@ anv_pipeline_hash_compute(struct anv_pipeline *pipeline, if (layout) _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + const bool rba = pipeline->device->robust_buffer_access; + _mesa_sha1_update(&ctx, &rba, sizeof(rba)); + anv_pipeline_hash_shader(&ctx, stage); _mesa_sha1_final(&ctx, sha1_out); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index d8a08d9d67f..37c710ad09a 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1747,6 +1747,13 @@ enum anv_pipe_bits { * we would have to CS stall on every flush which could be bad. */ ANV_PIPE_NEEDS_CS_STALL_BIT = (1 << 21), + + /* This bit does not exist directly in PIPE_CONTROL. It means that render + * target operations are ongoing. Some operations like copies on the + * command streamer might need to be aware of this to trigger the + * appropriate stall before they can proceed with the copy. 
+ */ + ANV_PIPE_RENDER_TARGET_WRITES = (1 << 22), }; #define ANV_PIPE_FLUSH_BITS ( \ @@ -2950,6 +2957,7 @@ anv_image_hiz_clear(struct anv_cmd_buffer *cmd_buffer, void anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, + enum isl_format format, VkImageAspectFlagBits aspect, uint32_t base_layer, uint32_t layer_count, enum isl_aux_op mcs_op, union isl_color_value *clear_value, @@ -2957,6 +2965,7 @@ anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, void anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, + enum isl_format format, VkImageAspectFlagBits aspect, uint32_t level, uint32_t base_layer, uint32_t layer_count, enum isl_aux_op ccs_op, union isl_color_value *clear_value, diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index 2035017ce0e..c573e890946 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -263,4 +263,5 @@ genX(blorp_exec)(struct blorp_batch *batch, cmd_buffer->state.gfx.vb_dirty = ~0; cmd_buffer->state.gfx.dirty = ~0; cmd_buffer->state.push_constants_dirty = ~0; + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; } diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 43a02f22567..dcf37654954 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -737,6 +737,7 @@ anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, static void anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, + enum isl_format format, VkImageAspectFlagBits aspect, uint32_t level, uint32_t array_layer, enum isl_aux_op resolve_op, @@ -761,13 +762,14 @@ anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) resolve_op = ISL_AUX_OP_FULL_RESOLVE; - anv_image_ccs_op(cmd_buffer, image, aspect, level, + anv_image_ccs_op(cmd_buffer, image, 
format, aspect, level, array_layer, 1, resolve_op, NULL, true); } static void anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer, const struct anv_image *image, + enum isl_format format, VkImageAspectFlagBits aspect, uint32_t array_layer, enum isl_aux_op resolve_op, @@ -781,7 +783,7 @@ anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer, aspect, 0, array_layer, resolve_op, fast_clear_supported); - anv_image_mcs_op(cmd_buffer, image, aspect, + anv_image_mcs_op(cmd_buffer, image, format, aspect, array_layer, 1, resolve_op, NULL, true); #else unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail"); @@ -1037,8 +1039,9 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, uint32_t level_layer_count = MIN2(layer_count, aux_layers - base_layer); - anv_image_ccs_op(cmd_buffer, image, aspect, level, - base_layer, level_layer_count, + anv_image_ccs_op(cmd_buffer, image, + image->planes[plane].surface.isl.format, + aspect, level, base_layer, level_layer_count, ISL_AUX_OP_AMBIGUATE, NULL, false); if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) { @@ -1055,8 +1058,9 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, } assert(base_level == 0 && level_count == 1); - anv_image_mcs_op(cmd_buffer, image, aspect, - base_layer, layer_count, + anv_image_mcs_op(cmd_buffer, image, + image->planes[plane].surface.isl.format, + aspect, base_layer, layer_count, ISL_AUX_OP_FAST_CLEAR, NULL, false); } return; @@ -1133,12 +1137,22 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, for (uint32_t a = 0; a < level_layer_count; a++) { uint32_t array_layer = base_layer + a; if (image->samples == 1) { - anv_cmd_predicated_ccs_resolve(cmd_buffer, image, aspect, - level, array_layer, resolve_op, + anv_cmd_predicated_ccs_resolve(cmd_buffer, image, + image->planes[plane].surface.isl.format, + aspect, level, array_layer, resolve_op, final_fast_clear); } else { - anv_cmd_predicated_mcs_resolve(cmd_buffer, image, aspect, - 
array_layer, resolve_op, + /* We only support fast-clear on the first layer so partial + * resolves should not be used on other layers as they will use + * the clear color stored in memory that is only valid for layer0. + */ + if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && + array_layer != 0) + continue; + + anv_cmd_predicated_mcs_resolve(cmd_buffer, image, + image->planes[plane].surface.isl.format, + aspect, array_layer, resolve_op, final_fast_clear); } } @@ -1758,6 +1772,12 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) pipe.StallAtPixelScoreboard = true; } + /* If a render target flush was emitted, then we can toggle off the bit + * saying that render target writes are ongoing. + */ + if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) + bits &= ~(ANV_PIPE_RENDER_TARGET_WRITES); + bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_CS_STALL_BIT); } @@ -2769,6 +2789,8 @@ void genX(CmdDraw)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = 0; } + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; } void genX(CmdDrawIndexed)( @@ -2808,6 +2830,8 @@ void genX(CmdDrawIndexed)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = vertexOffset; } + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; } /* Auto-Draw / Indirect Registers */ @@ -2941,6 +2965,8 @@ void genX(CmdDrawIndirect)( offset += stride; } + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; } void genX(CmdDrawIndexedIndirect)( @@ -2980,6 +3006,8 @@ void genX(CmdDrawIndexedIndirect)( offset += stride; } + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; } static VkResult @@ -3645,12 +3673,16 @@ cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer, union isl_color_value clear_color = {}; anv_clear_color_from_att_state(&clear_color, att_state, iview); if (iview->image->samples == 1) { - anv_image_ccs_op(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, + 
anv_image_ccs_op(cmd_buffer, image, + iview->planes[0].isl.format, + VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1, ISL_AUX_OP_FAST_CLEAR, &clear_color, false); } else { - anv_image_mcs_op(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT, + anv_image_mcs_op(cmd_buffer, image, + iview->planes[0].isl.format, + VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, ISL_AUX_OP_FAST_CLEAR, &clear_color, false); @@ -3870,6 +3902,55 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer) struct anv_image_view *iview = fb->attachments[a]; const struct anv_image *image = iview->image; + if ((image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && + image->vk_format != iview->vk_format) { + enum anv_fast_clear_type fast_clear_type = + anv_layout_to_fast_clear_type(&cmd_buffer->device->info, + image, VK_IMAGE_ASPECT_COLOR_BIT, + att_state->current_layout); + + /* If any clear color was used, flush it down the aux surfaces. If we + * don't do it now using the view's format we might use the clear + * color incorrectly in the following resolves (for example with an + * SRGB view & a UNORM image). 
+ */ + if (fast_clear_type != ANV_FAST_CLEAR_NONE) { + anv_perf_warn(cmd_buffer->device->instance, fb, + "Doing a partial resolve to get rid of clear color at the " + "end of a renderpass due to an image/view format mismatch"); + + uint32_t base_layer, layer_count; + if (image->type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(iview->image->extent.depth, + iview->planes[0].isl.base_level); + } else { + base_layer = iview->planes[0].isl.base_array_layer; + layer_count = fb->layers; + } + + for (uint32_t a = 0; a < layer_count; a++) { + uint32_t array_layer = base_layer + a; + if (image->samples == 1) { + anv_cmd_predicated_ccs_resolve(cmd_buffer, image, + iview->planes[0].isl.format, + VK_IMAGE_ASPECT_COLOR_BIT, + iview->planes[0].isl.base_level, + array_layer, + ISL_AUX_OP_PARTIAL_RESOLVE, + ANV_FAST_CLEAR_NONE); + } else { + anv_cmd_predicated_mcs_resolve(cmd_buffer, image, + iview->planes[0].isl.format, + VK_IMAGE_ASPECT_COLOR_BIT, + base_layer, + ISL_AUX_OP_PARTIAL_RESOLVE, + ANV_FAST_CLEAR_NONE); + } + } + } + } + /* Transition the image into the final layout for this render pass */ VkImageLayout target_layout = cmd_state->pass->attachments[a].final_layout; diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c index 81522986550..1bee1c6dc17 100644 --- a/src/intel/vulkan/genX_gpu_memcpy.c +++ b/src/intel/vulkan/genX_gpu_memcpy.c @@ -302,4 +302,5 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, } cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_WRITES; } diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index ce8757f2643..71b7a1352f0 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -729,11 +729,19 @@ void genX(CmdCopyQueryPoolResults)( ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer); - if (flags & 
VK_QUERY_RESULT_WAIT_BIT) { - anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { - pc.CommandStreamerStallEnable = true; - pc.StallAtPixelScoreboard = true; - } + /* If render target writes are ongoing, request a render target cache flush + * to ensure proper ordering of the commands from the 3d pipe and the + * command streamer. + */ + if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_WRITES) { + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + } + + if ((flags & VK_QUERY_RESULT_WAIT_BIT) || + (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS)) { + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); } struct anv_address dest_addr = anv_address_add(buffer->address, destOffset); diff --git a/src/intel/vulkan/genX_state.c b/src/intel/vulkan/genX_state.c index 42800a2581e..99b86f68b3a 100644 --- a/src/intel/vulkan/genX_state.c +++ b/src/intel/vulkan/genX_state.c @@ -231,6 +231,22 @@ genX(init_device_state)(struct anv_device *device) #endif } +#if GEN_GEN >= 10 + /* A fixed function pipe flush is required before modifying this field */ + anv_batch_emit(&batch, GENX(PIPE_CONTROL), pipe) { + pipe.PipeControlFlushEnable = true; + } + /* enable object level preemption */ + uint32_t csc1; + anv_pack_struct(&csc1, GENX(CS_CHICKEN1), + .ReplayMode = ObjectLevelPreemption, + .ReplayModeMask = 1); + anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(CS_CHICKEN1_num); + lri.DataDWord = csc1; + } +#endif + anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); assert(batch.next <= batch.end); diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c index 1981b5f0515..7cd6b1e8ab6 100644 --- a/src/loader/loader_dri3_helper.c +++ b/src/loader/loader_dri3_helper.c @@ -1509,6 +1509,7 @@ dri3_update_drawable(struct loader_dri3_drawable *draw) mtx_unlock(&draw->mtx); return false; } + free(error); 
draw->is_pixmap = true; xcb_unregister_for_special_event(draw->conn, draw->special_event); draw->special_event = NULL; diff --git a/src/mapi/shared-glapi/meson.build b/src/mapi/shared-glapi/meson.build index dcc6079af3d..3f041471fb9 100644 --- a/src/mapi/shared-glapi/meson.build +++ b/src/mapi/shared-glapi/meson.build @@ -40,7 +40,7 @@ libglapi = shared_library( 'glapi', [files_mapi_glapi, files_mapi_util, shared_glapi_mapi_tmp_h], c_args : [ - c_msvc_compat_args, '-DMAPI_MODE_GLAPI', + c_msvc_compat_args, c_vis_args, '-DMAPI_MODE_GLAPI', '-DMAPI_ABI_HEADER="@0@"'.format(shared_glapi_mapi_tmp_h.full_path()), ], link_args : [ld_args_gc_sections], diff --git a/src/mesa/drivers/dri/Android.mk b/src/mesa/drivers/dri/Android.mk index 53ff4b4f632..dc1f98364c8 100644 --- a/src/mesa/drivers/dri/Android.mk +++ b/src/mesa/drivers/dri/Android.mk @@ -49,11 +49,18 @@ MESA_DRI_WHOLE_STATIC_LIBRARIES := \ MESA_DRI_SHARED_LIBRARIES := \ libcutils \ libdl \ - libexpat \ libglapi \ liblog \ libz +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +MESA_DRI_WHOLE_STATIC_LIBRARIES += \ + libexpat +else +MESA_DRI_SHARED_LIBRARIES += \ + libexpat +endif #----------------------------------------------- # Build drivers and libmesa_dri_common diff --git a/src/mesa/drivers/dri/i915/Android.mk b/src/mesa/drivers/dri/i915/Android.mk index b1054aa6e28..7c9c8210dff 100644 --- a/src/mesa/drivers/dri/i915/Android.mk +++ b/src/mesa/drivers/dri/i915/Android.mk @@ -47,7 +47,7 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ LOCAL_SHARED_LIBRARIES := \ $(MESA_DRI_SHARED_LIBRARIES) \ - libdrm_intel + libdrm_intel_pri LOCAL_GENERATED_SOURCES := \ $(MESA_DRI_OPTIONS_H) \ diff --git a/src/mesa/drivers/dri/i965/Android.mk b/src/mesa/drivers/dri/i965/Android.mk index fbad63a0824..580b5443965 100644 --- a/src/mesa/drivers/dri/i965/Android.mk +++ b/src/mesa/drivers/dri/i965/Android.mk @@ -310,6 +310,8 @@ LOCAL_LDFLAGS += 
$(MESA_DRI_LDFLAGS) LOCAL_CFLAGS := \ $(MESA_DRI_CFLAGS) +LOCAL_CFLAGS += -Wno-error + LOCAL_C_INCLUDES := \ $(MESA_DRI_C_INCLUDES) \ $(MESA_TOP)/include/drm-uapi diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c index f1675b191c1..d4e6ba039c9 100644 --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c @@ -1487,7 +1487,7 @@ brw_bo_gem_export_to_prime(struct brw_bo *bo, int *prime_fd) brw_bo_make_external(bo); if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle, - DRM_CLOEXEC, prime_fd) != 0) + DRM_CLOEXEC | DRM_RDWR, prime_fd) != 0) return -errno; bo->reusable = false; diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 6ba64e4e06d..8cc0529d7e8 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -893,6 +893,19 @@ brw_process_driconf_options(struct brw_context *brw) ctx->Const.dri_config_options_sha1 = ralloc_array(brw, unsigned char, 20); driComputeOptionsSha1(&brw->screen->optionCache, ctx->Const.dri_config_options_sha1); + + brw->screen->compiler->simd32_heuristics_control.grouped_sends_check = + driQueryOptionb(&brw->optionCache, "simd32_heuristic_grouped_check"); + brw->screen->compiler->simd32_heuristics_control.max_grouped_sends = + driQueryOptioni(&brw->optionCache, "simd32_heuristic_grouped_sends"); + brw->screen->compiler->simd32_heuristics_control.inst_count_check = + driQueryOptionb(&brw->optionCache, "simd32_heuristic_inst_check"); + brw->screen->compiler->simd32_heuristics_control.inst_count_ratio = + driQueryOptionf(&brw->optionCache, "simd32_heuristic_inst_ratio"); + brw->screen->compiler->simd32_heuristics_control.mrt_check = + driQueryOptionb(&brw->optionCache, "simd32_heuristic_mrt_check"); + brw->screen->compiler->simd32_heuristics_control.max_mrts = + driQueryOptioni(&brw->optionCache, "simd32_heuristic_max_mrts"); } GLboolean diff --git 
a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 7fd15669eb9..47183da66bc 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -686,6 +686,7 @@ enum brw_query_kind { OA_COUNTERS, OA_COUNTERS_RAW, PIPELINE_STATS, + NULL_RENDERER, }; struct brw_perf_query_register_prog { @@ -842,6 +843,8 @@ struct brw_context GLuint primitive; /**< Hardware primitive, such as _3DPRIM_TRILIST. */ + bool object_preemption; /**< Object level preemption enabled. */ + GLenum reduced_primitive; /** @@ -1242,6 +1245,7 @@ struct brw_context int n_active_oa_queries; int n_active_pipeline_stats_queries; + int n_active_null_renderers; /* The number of queries depending on running OA counters which * extends beyond brw_end_perf_query() since we need to wait until diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 97a787a2ab3..0ec50e1d27a 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1650,11 +1650,17 @@ enum brw_pixel_shader_coverage_mask_mode { #define GEN10_CACHE_MODE_SS 0x0e420 #define GEN10_FLOAT_BLEND_OPTIMIZATION_ENABLE (1 << 4) -#define INSTPM 0x20c0 +#define INSTPM 0x20c0 /* Gen6-8 */ # define INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 6) +# define INSTPM_GLOBAL_DEBUG_ENABLE (1 << 4) +# define INSTPM_MEDIA_INSTRUCTION_DISABLE (1 << 3) +# define INSTPM_3D_RENDERER_INSTRUCTION_DISABLE (1 << 2) +# define INSTPM_3D_STATE_INSTRUCTION_DISABLE (1 << 1) #define CS_DEBUG_MODE2 0x20d8 /* Gen9+ */ # define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4) +# define CSDBG2_MEDIA_INSTRUCTION_DISABLE (1 << 1) +# define CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE (1 << 0) #define GEN7_RPSTAT1 0xA01C #define GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT 7 @@ -1681,4 +1687,9 @@ enum brw_pixel_shader_coverage_mask_mode { # define HEADERLESS_MESSAGE_FOR_PREEMPTABLE_CONTEXTS (1 << 5) # define 
HEADERLESS_MESSAGE_FOR_PREEMPTABLE_CONTEXTS_MASK REG_MASK(1 << 5) +#define CS_CHICKEN1 0x2580 /* Gen9+ */ +# define GEN9_REPLAY_MODE_MIDBUFFER (0 << 0) +# define GEN9_REPLAY_MODE_MIDOBJECT (1 << 0) +# define GEN9_REPLAY_MODE_MASK REG_MASK(1 << 0) + #endif diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c index 10e3d024f17..85d14a83c7e 100644 --- a/src/mesa/drivers/dri/i965/brw_performance_query.c +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c @@ -330,6 +330,12 @@ dump_perf_query_callback(GLuint id, void *query_void, void *brw_void) o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"), obj->pipeline_stats.bo ? "yes" : "no"); break; + case NULL_RENDERER: + DBG("%4d: %-6s %-8s NULL_RENDERER\n", + id, + o->Used ? "Dirty," : "New,", + o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,")); + break; default: unreachable("Unknown query type"); break; @@ -431,6 +437,10 @@ brw_get_perf_query_info(struct gl_context *ctx, *n_active = brw->perfquery.n_active_pipeline_stats_queries; break; + case NULL_RENDERER: + *n_active = brw->perfquery.n_active_null_renderers; + break; + default: unreachable("Unknown query type"); break; @@ -1020,6 +1030,7 @@ brw_begin_perf_query(struct gl_context *ctx, struct brw_context *brw = brw_context(ctx); struct brw_perf_query_object *obj = brw_perf_query(o); const struct brw_perf_query_info *query = obj->query; + const struct gen_device_info *devinfo = &brw->screen->devinfo; /* We can assume the frontend hides mistaken attempts to Begin a * query object multiple times before its End. Similarly if an @@ -1104,7 +1115,6 @@ brw_begin_perf_query(struct gl_context *ctx, /* If the OA counters aren't already on, enable them. 
*/ if (brw->perfquery.oa_stream_fd == -1) { __DRIscreen *screen = brw->screen->driScrnPriv; - const struct gen_device_info *devinfo = &brw->screen->devinfo; /* The period_exponent gives a sampling period as follows: * sample_period = timestamp_period * 2^(period_exponent + 1) @@ -1250,6 +1260,23 @@ brw_begin_perf_query(struct gl_context *ctx, ++brw->perfquery.n_active_pipeline_stats_queries; break; + case NULL_RENDERER: + ++brw->perfquery.n_active_null_renderers; + if (devinfo->gen >= 9) { + brw_load_register_imm32(brw, CS_DEBUG_MODE2, + REG_MASK(CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE) | + CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE); + } else { + brw_load_register_imm32(brw, INSTPM, + REG_MASK(INSTPM_3D_RENDERER_INSTRUCTION_DISABLE | + INSTPM_MEDIA_INSTRUCTION_DISABLE) | + INSTPM_3D_RENDERER_INSTRUCTION_DISABLE | + INSTPM_MEDIA_INSTRUCTION_DISABLE); + } + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_LRI_WRITE_IMMEDIATE); + break; + default: unreachable("Unknown query type"); break; @@ -1270,6 +1297,7 @@ brw_end_perf_query(struct gl_context *ctx, { struct brw_context *brw = brw_context(ctx); struct brw_perf_query_object *obj = brw_perf_query(o); + const struct gen_device_info *devinfo = &brw->screen->devinfo; DBG("End(%d)\n", o->Id); @@ -1312,6 +1340,21 @@ brw_end_perf_query(struct gl_context *ctx, --brw->perfquery.n_active_pipeline_stats_queries; break; + case NULL_RENDERER: + if (--brw->perfquery.n_active_null_renderers == 0) { + if (devinfo->gen >= 9) { + brw_load_register_imm32(brw, CS_DEBUG_MODE2, + REG_MASK(CSDBG2_3D_RENDERER_INSTRUCTION_DISABLE)); + } else { + brw_load_register_imm32(brw, INSTPM, + REG_MASK(INSTPM_3D_RENDERER_INSTRUCTION_DISABLE | + INSTPM_MEDIA_INSTRUCTION_DISABLE)); + } + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_LRI_WRITE_IMMEDIATE); + } + break; + default: unreachable("Unknown query type"); break; @@ -1337,6 +1380,9 @@ brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o) bo = obj->pipeline_stats.bo; break; + 
case NULL_RENDERER: + break; + default: unreachable("Unknown query type"); break; @@ -1387,6 +1433,8 @@ brw_is_perf_query_ready(struct gl_context *ctx, return (obj->pipeline_stats.bo && !brw_batch_references(&brw->batch, obj->pipeline_stats.bo) && !brw_bo_busy(obj->pipeline_stats.bo)); + case NULL_RENDERER: + return true; default: unreachable("Unknown query type"); @@ -1602,6 +1650,9 @@ brw_get_perf_query_data(struct gl_context *ctx, written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data); break; + case NULL_RENDERER: + break; + default: unreachable("Unknown query type"); break; @@ -1672,6 +1723,9 @@ brw_delete_perf_query(struct gl_context *ctx, } break; + case NULL_RENDERER: + break; + default: unreachable("Unknown query type"); break; @@ -2152,6 +2206,15 @@ get_register_queries_function(const struct gen_device_info *devinfo) return NULL; } +static void +fill_null_renderer_perf_query_info(struct brw_context *brw, + struct brw_perf_query_info *query) +{ + query->kind = NULL_RENDERER; + query->name = "Intel_Null_Hardware_Query"; + query->n_counters = 0; +} + static unsigned brw_init_perf_query_info(struct gl_context *ctx) { @@ -2210,6 +2273,10 @@ brw_init_perf_query_info(struct gl_context *ctx) enumerate_sysfs_metrics(brw); brw_perf_query_register_mdapi_oa_query(brw); + + struct brw_perf_query_info *null_query = + brw_perf_query_append_query_info(brw); + fill_null_renderer_perf_query_info(brw, null_query); } brw->perfquery.unaccumulated = diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index f6acf81b899..546d103d1a4 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -128,7 +128,7 @@ void brw_disk_cache_write_compute_program(struct brw_context *brw); void brw_disk_cache_write_render_programs(struct brw_context *brw); /*********************************************************************** - * brw_state.c + * brw_state_upload.c */ void 
brw_upload_render_state(struct brw_context *brw); void brw_render_state_finished(struct brw_context *brw); @@ -138,6 +138,7 @@ void brw_init_state(struct brw_context *brw); void brw_destroy_state(struct brw_context *brw); void brw_emit_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline); +void brw_enable_obj_preemption(struct brw_context *brw, bool enable); static inline void brw_select_pipeline(struct brw_context *brw, enum brw_pipeline pipeline) diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index 7f20579fb87..2e42dfb36d6 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -45,6 +45,28 @@ #include "brw_cs.h" #include "main/framebuffer.h" +void +brw_enable_obj_preemption(struct brw_context *brw, bool enable) +{ + const struct gen_device_info *devinfo = &brw->screen->devinfo; + assert(devinfo->gen >= 9); + + if (enable == brw->object_preemption) + return; + + /* A fixed function pipe flush is required before modifying this field */ + brw_emit_pipe_control_flush(brw, PIPE_CONTROL_FLUSH_ENABLE); + + bool replay_mode = enable ? 
+ GEN9_REPLAY_MODE_MIDOBJECT : GEN9_REPLAY_MODE_MIDBUFFER; + + /* enable object level preemption */ + brw_load_register_imm32(brw, CS_CHICKEN1, + replay_mode | GEN9_REPLAY_MODE_MASK); + + brw->object_preemption = enable; +} + static void brw_upload_initial_gpu_state(struct brw_context *brw) { @@ -153,6 +175,9 @@ brw_upload_initial_gpu_state(struct brw_context *brw) ADVANCE_BATCH(); } } + + if (devinfo->gen >= 10) + brw_enable_obj_preemption(brw, true); } static inline const struct brw_tracked_state * diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index 8d21cf5fa70..3286c222e5b 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -1499,18 +1499,6 @@ update_buffer_image_param(struct brw_context *brw, param->stride[0] = _mesa_get_format_bytes(u->_ActualFormat); } -static unsigned -get_image_num_layers(const struct intel_mipmap_tree *mt, GLenum target, - unsigned level) -{ - if (target == GL_TEXTURE_CUBE_MAP) - return 6; - - return target == GL_TEXTURE_3D ? - minify(mt->surf.logical_level0_px.depth, level) : - mt->surf.logical_level0_px.array_len; -} - static void update_image_surface(struct brw_context *brw, struct gl_image_unit *u, @@ -1541,14 +1529,29 @@ update_image_surface(struct brw_context *brw, } else { struct intel_texture_object *intel_obj = intel_texture_object(obj); struct intel_mipmap_tree *mt = intel_obj->mt; - const unsigned num_layers = u->Layered ? - get_image_num_layers(mt, obj->Target, u->Level) : 1; + + unsigned base_layer, num_layers; + if (u->Layered) { + if (obj->Target == GL_TEXTURE_3D) { + base_layer = 0; + num_layers = minify(mt->surf.logical_level0_px.depth, u->Level); + } else { + assert(obj->Immutable || obj->MinLayer == 0); + base_layer = obj->MinLayer; + num_layers = obj->Immutable ? 
+ obj->NumLayers : + mt->surf.logical_level0_px.array_len; + } + } else { + base_layer = obj->MinLayer + u->_Layer; + num_layers = 1; + } struct isl_view view = { .format = format, .base_level = obj->MinLevel + u->Level, .levels = 1, - .base_array_layer = obj->MinLayer + u->_Layer, + .base_array_layer = base_layer, .array_len = num_layers, .swizzle = ISL_SWIZZLE_IDENTITY, .usage = ISL_SURF_USAGE_STORAGE_BIT, diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c index fd9ce93c6c7..97ae2707049 100644 --- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c +++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c @@ -213,7 +213,7 @@ blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch, } if (need_invalidate) { - brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE); + brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL); } #endif } @@ -268,7 +268,7 @@ genX(blorp_exec)(struct blorp_batch *batch, assert(batch->blorp->driver_ctx == batch->driver_batch); struct brw_context *brw = batch->driver_batch; struct gl_context *ctx = &brw->ctx; - bool check_aperture_failed_once; + bool check_aperture_failed_once = false; #if GEN_GEN >= 11 /* The PIPE_CONTROL command description says: @@ -309,7 +309,7 @@ genX(blorp_exec)(struct blorp_batch *batch, intel_batchbuffer_require_space(brw, 1400); brw_require_statebuffer_space(brw, 600); intel_batchbuffer_save_state(brw); - check_aperture_failed_once = intel_batchbuffer_saved_state_is_empty(brw); + check_aperture_failed_once |= intel_batchbuffer_saved_state_is_empty(brw); brw->batch.no_wrap = true; #if GEN_GEN == 6 diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c index 9cd017a5cff..5d2572cb4dc 100644 --- a/src/mesa/drivers/dri/i965/genX_state_upload.c +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c @@ -505,9 +505,8 @@ vf_invalidate_for_vb_48bit_transitions(struct brw_context 
*brw) { #if GEN_GEN >= 8 bool need_invalidate = false; - unsigned i; - for (i = 0; i < brw->vb.nr_buffers; i++) { + for (unsigned i = 0; i < brw->vb.nr_buffers; i++) { uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo); if (high_bits != brw->vb.last_bo_high_bits[i]) { @@ -516,12 +515,26 @@ vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw) } } - /* Don't bother with draw parameter buffers - those are generated by - * the driver so we can select a consistent memory zone. - */ + if (brw->draw.draw_params_bo) { + uint16_t high_bits = pinned_bo_high_bits(brw->draw.draw_params_bo); + + if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers] != high_bits) { + need_invalidate = true; + brw->vb.last_bo_high_bits[brw->vb.nr_buffers] = high_bits; + } + } + + if (brw->draw.derived_draw_params_bo) { + uint16_t high_bits = pinned_bo_high_bits(brw->draw.derived_draw_params_bo); + + if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] != high_bits) { + need_invalidate = true; + brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] = high_bits; + } + } if (need_invalidate) { - brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE); + brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL); } #endif } @@ -5602,6 +5615,50 @@ static const struct brw_tracked_state genX(blend_constant_color) = { /* ---------------------------------------------------------------------- */ +#if GEN_GEN == 9 + +/** + * Implement workarounds for preemption: + * - WaDisableMidObjectPreemptionForGSLineStripAdj + * - WaDisableMidObjectPreemptionForTrifanOrPolygon + */ +static void +gen9_emit_preempt_wa(struct brw_context *brw) +{ + /* WaDisableMidObjectPreemptionForGSLineStripAdj + * + * WA: Disable mid-draw preemption when draw-call is a linestrip_adj and + * GS is enabled. 
+ */ + bool object_preemption = + !(brw->primitive == _3DPRIM_LINESTRIP_ADJ && brw->gs.enabled); + + /* WaDisableMidObjectPreemptionForTrifanOrPolygon + * + * TriFan miscompare in Execlist Preemption test. Cut index that is on a + * previous context. End the previous, the resume another context with a + * tri-fan or polygon, and the vertex count is corrupted. If we prempt + * again we will cause corruption. + * + * WA: Disable mid-draw preemption when draw-call has a tri-fan. + */ + object_preemption = + object_preemption && !(brw->primitive == _3DPRIM_TRIFAN); + + brw_enable_obj_preemption(brw, object_preemption); +} + +static const struct brw_tracked_state gen9_preempt_wa = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_PRIMITIVE | BRW_NEW_GEOMETRY_PROGRAM, + }, + .emit = gen9_emit_preempt_wa, +}; +#endif + +/* ---------------------------------------------------------------------- */ + void genX(init_atoms)(struct brw_context *brw) { @@ -5906,6 +5963,9 @@ genX(init_atoms)(struct brw_context *brw) &genX(cut_index), &gen8_pma_fix, +#if GEN_GEN == 9 + &gen9_preempt_wa, +#endif }; #endif diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index d7e02efb54d..0cfe2acbdd4 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -247,7 +247,6 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.OES_primitive_bounding_box = true; ctx->Extensions.OES_texture_buffer = true; ctx->Extensions.ARB_fragment_shader_interlock = true; - ctx->Extensions.INTEL_fragment_shader_ordering = true; if (can_do_pipelined_register_writes(brw->screen)) { ctx->Extensions.ARB_draw_indirect = true; diff --git a/src/mesa/drivers/dri/i965/intel_image.h b/src/mesa/drivers/dri/i965/intel_image.h index a8193c6def9..ca604159dc2 100644 --- a/src/mesa/drivers/dri/i965/intel_image.h +++ b/src/mesa/drivers/dri/i965/intel_image.h @@ -89,9 +89,6 @@ struct __DRIimageRec { GLuint tile_y; bool 
has_depthstencil; - /** The image was created with EGL_EXT_image_dma_buf_import. */ - bool dma_buf_imported; - /** Offset of the auxiliary compression surface in the bo. */ uint32_t aux_offset; diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index c3bd30f7837..89110e60a8d 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -61,6 +61,33 @@ DRI_CONF_BEGIN DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects") DRI_CONF_DESC_END DRI_CONF_OPT_END + + DRI_CONF_OPT_BEGIN_B(simd32_heuristic_grouped_check, "true") + DRI_CONF_DESC(en, "Enable/disable grouped texture fetch " + "check in the SIMD32 selection heuristic.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_V(simd32_heuristic_grouped_sends, int, 6, "1:999") + DRI_CONF_DESC(en, "How many grouped texture fetches should " + "the SIMD32 selection heuristic allow.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_B(simd32_heuristic_inst_check, "true") + DRI_CONF_DESC(en, "Enable/disable SIMD32/SIMD16 instruction " + "count ratio check in the SIMD32 selection " + "heuristic.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_V(simd32_heuristic_inst_ratio, float, 2.3, "1:999") + DRI_CONF_DESC(en, "SIMD32/SIMD16 instruction count ratio " + "the SIMD32 selection heuristic should allow.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_B(simd32_heuristic_mrt_check, "true") + DRI_CONF_DESC(en, "Enable/disable MRT write check in the " + "SIMD32 selection heuristic.") + DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_V(simd32_heuristic_max_mrts, int, 1, "1:8") + DRI_CONF_DESC(en, "How many MRT writes should the SIMD32 " + "selection heuristic allow.") + DRI_CONF_OPT_END + DRI_CONF_MESA_NO_ERROR("false") DRI_CONF_SECTION_END @@ -282,6 +309,18 @@ static const struct intel_image_format intel_image_formats[] = { { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 1, __DRI_IMAGE_FORMAT_GR88, 2 } } }, + { __DRI_IMAGE_FOURCC_P010, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { { 0, 
0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, + { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, + + { __DRI_IMAGE_FOURCC_P012, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, + { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, + + { __DRI_IMAGE_FOURCC_P016, __DRI_IMAGE_COMPONENTS_Y_UV, 2, + { { 0, 0, 0, __DRI_IMAGE_FORMAT_R16, 2 }, + { 1, 1, 1, __DRI_IMAGE_FORMAT_GR1616, 4 } } }, + { __DRI_IMAGE_FOURCC_NV16, __DRI_IMAGE_COMPONENTS_Y_UV, 2, { { 0, 0, 0, __DRI_IMAGE_FORMAT_R8, 1 }, { 1, 1, 0, __DRI_IMAGE_FORMAT_GR88, 2 } } }, @@ -957,7 +996,6 @@ intel_dup_image(__DRIimage *orig_image, void *loaderPrivate) image->tile_y = orig_image->tile_y; image->has_depthstencil = orig_image->has_depthstencil; image->data = loaderPrivate; - image->dma_buf_imported = orig_image->dma_buf_imported; image->aux_offset = orig_image->aux_offset; image->aux_pitch = orig_image->aux_pitch; @@ -1237,7 +1275,6 @@ intel_create_image_from_dma_bufs2(__DRIscreen *dri_screen, return NULL; } - image->dma_buf_imported = true; image->yuv_color_space = yuv_color_space; image->sample_range = sample_range; image->horizontal_siting = horizontal_siting; diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c index bdcdb7736e6..674fa1c6fbf 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_image.c +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c @@ -614,16 +614,6 @@ intel_image_target_texture_2d(struct gl_context *ctx, GLenum target, if (image == NULL) return; - /* We support external textures only for EGLImages created with - * EGL_EXT_image_dma_buf_import. We may lift that restriction in the future. 
- */ - if (target == GL_TEXTURE_EXTERNAL_OES && !image->dma_buf_imported) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glEGLImageTargetTexture2DOES(external target is enabled only " - "for images created with EGL_EXT_image_dma_buf_import"); - return; - } - /* Disallow depth/stencil textures: we don't have a way to pass the * separate stencil miptree of a GL_DEPTH_STENCIL texture through. */ diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index 47db1583135..aac96290ded 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -317,7 +317,6 @@ EXT(IBM_texture_mirrored_repeat , dummy_true EXT(INGR_blend_func_separate , EXT_blend_func_separate , GLL, x , x , x , 1999) EXT(INTEL_conservative_rasterization , INTEL_conservative_rasterization , x , GLC, x , 31, 2013) -EXT(INTEL_fragment_shader_ordering , INTEL_fragment_shader_ordering , GLL, GLC, x , x , 2013) EXT(INTEL_performance_query , INTEL_performance_query , GLL, GLC, x , ES2, 2013) EXT(INTEL_shader_atomic_float_minmax , INTEL_shader_atomic_float_minmax , GLL, GLC, x , x , 2018) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 9ed49b7ff24..f30b778a7b1 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -4296,7 +4296,6 @@ struct gl_extensions GLboolean ATI_fragment_shader; GLboolean GREMEDY_string_marker; GLboolean INTEL_conservative_rasterization; - GLboolean INTEL_fragment_shader_ordering; GLboolean INTEL_performance_query; GLboolean INTEL_shader_atomic_float_minmax; GLboolean KHR_blend_equation_advanced; diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c index 556c860d393..d9a12d2387f 100644 --- a/src/mesa/main/readpix.c +++ b/src/mesa/main/readpix.c @@ -922,6 +922,8 @@ read_pixels_es3_error_check(struct gl_context *ctx, GLenum format, GLenum type, case GL_RGBA: if (type == GL_FLOAT && data_type == GL_FLOAT) return GL_NO_ERROR; /* EXT_color_buffer_float */ + if (type == GL_HALF_FLOAT && data_type == 
GL_FLOAT) + return GL_NO_ERROR; if (type == GL_UNSIGNED_BYTE && data_type == GL_UNSIGNED_NORMALIZED) return GL_NO_ERROR; if (internalFormat == GL_RGB10_A2 && diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c index 0ab9ed445d6..bb4f7006618 100644 --- a/src/mesa/main/texgetimage.c +++ b/src/mesa/main/texgetimage.c @@ -900,8 +900,7 @@ select_tex_image(const struct gl_texture_object *texObj, GLenum target, /** * Error-check the offset and size arguments to - * glGet[Compressed]TextureSubImage(). Also checks if the specified - * texture image is missing. + * glGet[Compressed]TextureSubImage(). * \return true if error, false if no error. */ static bool @@ -913,6 +912,7 @@ dimensions_error_check(struct gl_context *ctx, const char *caller) { const struct gl_texture_image *texImage; + GLuint imageWidth = 0, imageHeight = 0, imageDepth = 0; if (xoffset < 0) { _mesa_error(ctx, GL_INVALID_VALUE, "%s(xoffset = %d)", caller, xoffset); @@ -981,82 +981,44 @@ dimensions_error_check(struct gl_context *ctx, "%s(zoffset + depth = %d)", caller, zoffset + depth); return true; } - /* According to OpenGL 4.6 spec, section 8.11.4 ("Texture Image Queries"): - * - * "An INVALID_OPERATION error is generated by GetTextureImage if the - * effective target is TEXTURE_CUBE_MAP or TEXTURE_CUBE_MAP_ARRAY , - * and the texture object is not cube complete or cube array complete, - * respectively." - * - * This applies also to GetTextureSubImage, GetCompressedTexImage, - * GetCompressedTextureImage, and GetnCompressedTexImage. - */ - if (!_mesa_cube_complete(texObj)) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "%s(cube incomplete)", caller); - return true; - } break; default: ; /* nothing */ } texImage = select_tex_image(texObj, target, level, zoffset); - if (!texImage) { - /* Trying to return a non-defined level is a valid operation per se, as - * OpenGL 4.6 spec, section 8.11.4 ("Texture Image Queries") does not - * handle this case as an error. 
- * - * Rather, we need to look at section 8.22 ("Texture State and Proxy - * State"): - * - * "Each initial texture image is null. It has zero width, height, and - * depth, internal format RGBA, or R8 for buffer textures, component - * sizes set to zero and component types set to NONE, the compressed - * flag set to FALSE, a zero compressed size, and the bound buffer - * object name is zero." - * - * This means we need to assume the image for the non-defined level is - * an empty image. With this assumption, we can go back to section - * 8.11.4 and checking again the errors: - * - * "An INVALID_VALUE error is generated if xoffset + width is greater - * than the texture’s width, yoffset + height is greater than the - * texture’s height, or zoffset + depth is greater than the texture’s - * depth." - * - * Thus why we return INVALID_VALUE. - */ - _mesa_error(ctx, GL_INVALID_VALUE, "%s(missing image)", caller); - return true; + if (texImage) { + imageWidth = texImage->Width; + imageHeight = texImage->Height; + imageDepth = texImage->Depth; } - if (xoffset + width > texImage->Width) { + if (xoffset + width > imageWidth) { _mesa_error(ctx, GL_INVALID_VALUE, "%s(xoffset %d + width %d > %u)", - caller, xoffset, width, texImage->Width); + caller, xoffset, width, imageWidth); return true; } - if (yoffset + height > texImage->Height) { + if (yoffset + height > imageHeight) { _mesa_error(ctx, GL_INVALID_VALUE, "%s(yoffset %d + height %d > %u)", - caller, yoffset, height, texImage->Height); + caller, yoffset, height, imageHeight); return true; } if (target != GL_TEXTURE_CUBE_MAP) { /* Cube map error checking was done above */ - if (zoffset + depth > texImage->Depth) { + if (zoffset + depth > imageDepth) { _mesa_error(ctx, GL_INVALID_VALUE, "%s(zoffset %d + depth %d > %u)", - caller, zoffset, depth, texImage->Depth); + caller, zoffset, depth, imageDepth); return true; } } /* Extra checks for compressed textures */ - { + if (texImage) { GLuint bw, bh, bd; 
_mesa_get_format_block_size_3d(texImage->TexFormat, &bw, &bh, &bd); if (bw > 1 || bh > 1 || bd > 1) { @@ -1162,53 +1124,15 @@ pbo_error_check(struct gl_context *ctx, GLenum target, /** - * Do error checking for all (non-compressed) get-texture-image functions. - * \return true if any error, false if no errors. + * Do teximage-related error checking for getting uncompressed images. + * \return true if there was an error */ static bool -getteximage_error_check(struct gl_context *ctx, - struct gl_texture_object *texObj, - GLenum target, GLint level, - GLint xoffset, GLint yoffset, GLint zoffset, - GLsizei width, GLsizei height, GLsizei depth, - GLenum format, GLenum type, GLsizei bufSize, - GLvoid *pixels, const char *caller) +teximage_error_check(struct gl_context *ctx, + struct gl_texture_image *texImage, + GLenum format, const char *caller) { - struct gl_texture_image *texImage; - GLenum baseFormat, err; - GLint maxLevels; - - assert(texObj); - - if (texObj->Target == 0) { - _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture)", caller); - return true; - } - - maxLevels = _mesa_max_texture_levels(ctx, target); - if (level < 0 || level >= maxLevels) { - _mesa_error(ctx, GL_INVALID_VALUE, "%s(level = %d)", caller, level); - return true; - } - - err = _mesa_error_check_format_and_type(ctx, format, type); - if (err != GL_NO_ERROR) { - _mesa_error(ctx, err, "%s(format/type)", caller); - return true; - } - - if (dimensions_error_check(ctx, texObj, target, level, - xoffset, yoffset, zoffset, - width, height, depth, caller)) { - return true; - } - - if (pbo_error_check(ctx, target, width, height, depth, - format, type, bufSize, pixels, caller)) { - return true; - } - - texImage = select_tex_image(texObj, target, level, zoffset); + GLenum baseFormat; assert(texImage); /* @@ -1241,8 +1165,8 @@ getteximage_error_check(struct gl_context *ctx, return true; } else if (_mesa_is_stencil_format(format) - && !_mesa_is_depthstencil_format(baseFormat) - && 
!_mesa_is_stencil_format(baseFormat)) { + && !_mesa_is_depthstencil_format(baseFormat) + && !_mesa_is_stencil_format(baseFormat)) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(format mismatch)", caller); return true; @@ -1271,6 +1195,142 @@ getteximage_error_check(struct gl_context *ctx, } +/** + * Do common teximage-related error checking for getting uncompressed images. + * \return true if there was an error + */ +static bool +common_error_check(struct gl_context *ctx, + struct gl_texture_object *texObj, + GLenum target, GLint level, + GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum type, GLsizei bufSize, + GLvoid *pixels, const char *caller) +{ + GLenum err; + GLint maxLevels; + + if (texObj->Target == 0) { + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid texture)", caller); + return true; + } + + maxLevels = _mesa_max_texture_levels(ctx, target); + if (level < 0 || level >= maxLevels) { + _mesa_error(ctx, GL_INVALID_VALUE, "%s(level = %d)", caller, level); + return true; + } + + err = _mesa_error_check_format_and_type(ctx, format, type); + if (err != GL_NO_ERROR) { + _mesa_error(ctx, err, "%s(format/type)", caller); + return true; + } + + /* According to OpenGL 4.6 spec, section 8.11.4 ("Texture Image Queries"): + * + * "An INVALID_OPERATION error is generated by GetTextureImage if the + * effective target is TEXTURE_CUBE_MAP or TEXTURE_CUBE_MAP_ARRAY , + * and the texture object is not cube complete or cube array complete, + * respectively." + * + * This applies also to GetTextureSubImage, GetCompressedTexImage, + * GetCompressedTextureImage, and GetnCompressedTexImage. + */ + if (target == GL_TEXTURE_CUBE_MAP && !_mesa_cube_complete(texObj)) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "%s(cube incomplete)", caller); + return true; + } + + return false; +} + + +/** + * Do error checking for all (non-compressed) get-texture-image functions. + * \return true if any error, false if no errors. 
+ */ +static bool +getteximage_error_check(struct gl_context *ctx, + struct gl_texture_object *texObj, + GLenum target, GLint level, + GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum type, GLsizei bufSize, + GLvoid *pixels, const char *caller) +{ + struct gl_texture_image *texImage; + + assert(texObj); + + if (common_error_check(ctx, texObj, target, level, width, height, depth, + format, type, bufSize, pixels, caller)) { + return true; + } + + if (width == 0 || height == 0 || depth == 0) { + /* Not an error, but nothing to do. Return 'true' so that the + * caller simply returns. + */ + return true; + } + + if (pbo_error_check(ctx, target, width, height, depth, + format, type, bufSize, pixels, caller)) { + return true; + } + + texImage = select_tex_image(texObj, target, level, 0); + if (teximage_error_check(ctx, texImage, format, caller)) { + return true; + } + + return false; +} + + +/** + * Do error checking for all (non-compressed) get-texture-image functions. + * \return true if any error, false if no errors. 
+ */ +static bool +gettexsubimage_error_check(struct gl_context *ctx, + struct gl_texture_object *texObj, + GLenum target, GLint level, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum type, GLsizei bufSize, + GLvoid *pixels, const char *caller) +{ + struct gl_texture_image *texImage; + + assert(texObj); + + if (common_error_check(ctx, texObj, target, level, width, height, depth, + format, type, bufSize, pixels, caller)) { + return true; + } + + if (dimensions_error_check(ctx, texObj, target, level, + xoffset, yoffset, zoffset, + width, height, depth, caller)) { + return true; + } + + if (pbo_error_check(ctx, target, width, height, depth, + format, type, bufSize, pixels, caller)) { + return true; + } + + texImage = select_tex_image(texObj, target, level, zoffset); + if (teximage_error_check(ctx, texImage, format, caller)) { + return true; + } + + return false; +} + + /** * Return the width, height and depth of a texture image. 
* This function must be resilient to bad parameter values since @@ -1399,7 +1459,7 @@ _mesa_GetnTexImageARB(GLenum target, GLint level, GLenum format, GLenum type, get_texture_image_dims(texObj, target, level, &width, &height, &depth); if (getteximage_error_check(ctx, texObj, target, level, - 0, 0, 0, width, height, depth, + width, height, depth, format, type, bufSize, pixels, caller)) { return; } @@ -1430,7 +1490,7 @@ _mesa_GetTexImage(GLenum target, GLint level, GLenum format, GLenum type, get_texture_image_dims(texObj, target, level, &width, &height, &depth); if (getteximage_error_check(ctx, texObj, target, level, - 0, 0, 0, width, height, depth, + width, height, depth, format, type, INT_MAX, pixels, caller)) { return; } @@ -1464,7 +1524,7 @@ _mesa_GetTextureImage(GLuint texture, GLint level, GLenum format, GLenum type, &width, &height, &depth); if (getteximage_error_check(ctx, texObj, texObj->Target, level, - 0, 0, 0, width, height, depth, + width, height, depth, format, type, bufSize, pixels, caller)) { return; } @@ -1497,9 +1557,10 @@ _mesa_GetTextureSubImage(GLuint texture, GLint level, return; } - if (getteximage_error_check(ctx, texObj, texObj->Target, level, - xoffset, yoffset, zoffset, width, height, depth, - format, type, bufSize, pixels, caller)) { + if (gettexsubimage_error_check(ctx, texObj, texObj->Target, level, + xoffset, yoffset, zoffset, + width, height, depth, + format, type, bufSize, pixels, caller)) { return; } diff --git a/src/mesa/main/transformfeedback.c b/src/mesa/main/transformfeedback.c index a46c9f94bca..8eccdc20b76 100644 --- a/src/mesa/main/transformfeedback.c +++ b/src/mesa/main/transformfeedback.c @@ -40,6 +40,7 @@ #include "shaderapi.h" #include "shaderobj.h" +#include "program/program.h" #include "program/prog_parameter.h" struct using_program_tuple @@ -470,6 +471,7 @@ begin_transform_feedback(struct gl_context *ctx, GLenum mode, bool no_error) if (obj->program != source) { ctx->NewDriverState |= 
ctx->DriverFlags.NewTransformFeedbackProg; + _mesa_reference_program_(ctx, &obj->program, source); obj->program = source; } @@ -504,6 +506,7 @@ end_transform_feedback(struct gl_context *ctx, assert(ctx->Driver.EndTransformFeedback); ctx->Driver.EndTransformFeedback(ctx, obj); + _mesa_reference_program_(ctx, &obj->program, NULL); ctx->TransformFeedback.CurrentObject->Active = GL_FALSE; ctx->TransformFeedback.CurrentObject->Paused = GL_FALSE; ctx->TransformFeedback.CurrentObject->EndedAnytime = GL_TRUE; diff --git a/src/mesa/program/Android.mk b/src/mesa/program/Android.mk index c6470e6289e..13d0da85882 100644 --- a/src/mesa/program/Android.mk +++ b/src/mesa/program/Android.mk @@ -41,7 +41,7 @@ endef include $(MESA_TOP)/src/mesa/Makefile.sources include $(CLEAR_VARS) - +LOCAL_CFLAGS += -Wno-error LOCAL_MODULE := libmesa_program LOCAL_MODULE_CLASS := STATIC_LIBRARIES LOCAL_STATIC_LIBRARIES := libmesa_nir \ diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c index 0e535257cb4..bdc8dda82c2 100644 --- a/src/mesa/state_tracker/st_cb_fbo.c +++ b/src/mesa/state_tracker/st_cb_fbo.c @@ -285,8 +285,11 @@ st_renderbuffer_delete(struct gl_context *ctx, struct gl_renderbuffer *rb) struct st_context *st = st_context(ctx); pipe_surface_release(st->pipe, &strb->surface_srgb); pipe_surface_release(st->pipe, &strb->surface_linear); - strb->surface = NULL; + } else { + pipe_surface_release_no_context(&strb->surface_srgb); + pipe_surface_release_no_context(&strb->surface_linear); } + strb->surface = NULL; pipe_resource_reference(&strb->texture, NULL); free(strb->data); _mesa_delete_renderbuffer(ctx, rb); diff --git a/src/mesa/state_tracker/st_glsl_to_nir.cpp b/src/mesa/state_tracker/st_glsl_to_nir.cpp index c58deadc957..581a8639ef0 100644 --- a/src/mesa/state_tracker/st_glsl_to_nir.cpp +++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp @@ -749,7 +749,8 @@ st_link_nir(struct gl_context *ctx, * the pipe_stream_output->output_register field is based on the * 
pre-compacted driver_locations. */ - if (!prev_shader->sh.LinkedTransformFeedback) + if (!(prev_shader->sh.LinkedTransformFeedback && + prev_shader->sh.LinkedTransformFeedback->NumVarying > 0)) nir_compact_varyings(shader_program->_LinkedShaders[prev]->Program->nir, nir, ctx->API != API_OPENGL_COMPAT); } diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 5322903b93a..0783f67f2b7 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -4072,7 +4072,6 @@ glsl_to_tgsi_visitor::visit(ir_call *ir) case ir_intrinsic_generic_atomic_comp_swap: case ir_intrinsic_begin_invocation_interlock: case ir_intrinsic_end_invocation_interlock: - case ir_intrinsic_begin_fragment_shader_ordering: unreachable("Invalid intrinsic"); } } diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c index ceb48dd4903..776b563e50e 100644 --- a/src/mesa/state_tracker/st_manager.c +++ b/src/mesa/state_tracker/st_manager.c @@ -1069,15 +1069,6 @@ st_api_make_current(struct st_api *stapi, struct st_context_iface *stctxi, * of the referenced drawables no longer exist. */ st_framebuffers_purge(st); - - /* Notify the driver that the context thread may have been changed. - * This should pin all driver threads to a specific L3 cache for optimal - * performance on AMD Zen CPUs. - */ - struct glthread_state *glthread = st->ctx->GLThread; - thrd_t *upper_thread = glthread ? 
&glthread->queue.threads[0] : NULL; - - util_context_thread_changed(st->pipe, upper_thread); } else { ret = _mesa_make_current(NULL, NULL, NULL); diff --git a/src/meson.build b/src/meson.build index 73146d37143..3b91c6a88c5 100644 --- a/src/meson.build +++ b/src/meson.build @@ -51,8 +51,12 @@ subdir('util') subdir('mapi') # TODO: opengl subdir('compiler') -subdir('egl/wayland/wayland-drm') -subdir('vulkan') +if with_platform_wayland + subdir('egl/wayland/wayland-drm') +endif +if with_any_vk + subdir('vulkan') +endif if with_gallium_radeonsi or with_amd_vk subdir('amd') endif @@ -67,7 +71,7 @@ subdir('loader') if with_platform_haiku subdir('hgl') endif -if with_glx != 'disabled' +if with_glx == 'dri' subdir('glx') endif if with_gbm diff --git a/src/util/Android.mk b/src/util/Android.mk index 2d59e1ae15e..6d770ca9575 100644 --- a/src/util/Android.mk +++ b/src/util/Android.mk @@ -41,8 +41,14 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary +# If Android version >=8 MESA should static link libexpat else should dynamic link +ifeq ($(shell test $(PLATFORM_SDK_VERSION) -ge 27; echo $$?), 0) +LOCAL_STATIC_LIBRARIES := \ + libexpat +else LOCAL_SHARED_LIBRARIES := \ libexpat +endif LOCAL_MODULE := libmesa_util diff --git a/src/util/Makefile.am b/src/util/Makefile.am index b857db8a866..4bda54c551d 100644 --- a/src/util/Makefile.am +++ b/src/util/Makefile.am @@ -60,7 +60,8 @@ libmesautil_la_LIBADD = \ $(PTHREAD_LIBS) \ $(CLOCK_LIB) \ $(ZLIB_LIBS) \ - $(LIBATOMIC_LIBS) + $(LIBATOMIC_LIBS) \ + -lm libxmlconfig_la_SOURCES = $(XMLCONFIG_FILES) libxmlconfig_la_CFLAGS = \ diff --git a/src/util/bitscan.h b/src/util/bitscan.h index dc89ac93f28..cdfecafaf01 100644 --- a/src/util/bitscan.h +++ b/src/util/bitscan.h @@ -112,6 +112,31 @@ u_bit_scan64(uint64_t *mask) return i; } +/* Count bits set in mask */ +static inline int +u_count_bits(unsigned *mask) +{ + unsigned v = *mask; + int c; + v = v - ((v >> 1) & 0x55555555); + v = (v & 0x33333333) 
+ ((v >> 2) & 0x33333333); + v = (v + (v >> 4)) & 0xF0F0F0F; + c = (int)((v * 0x1010101) >> 24); + return c; +} + +static inline int +u_count_bits64(uint64_t *mask) +{ + uint64_t v = *mask; + int c; + v = v - ((v >> 1) & 0x5555555555555555ull); + v = (v & 0x3333333333333333ull) + ((v >> 2) & 0x3333333333333333ull); + v = (v + (v >> 4)) & 0xF0F0F0F0F0F0F0Full; + c = (int)((v * 0x101010101010101ull) >> 56); + return c; +} + /* Determine if an unsigned value is a power of two. * * \note diff --git a/src/util/meson.build b/src/util/meson.build index 7caea27d660..156621aff65 100644 --- a/src/util/meson.build +++ b/src/util/meson.build @@ -119,7 +119,7 @@ libmesa_util = static_library( 'mesa_util', [files_mesa_util, format_srgb], include_directories : inc_common, - dependencies : [dep_zlib, dep_clock, dep_thread, dep_atomic], + dependencies : [dep_zlib, dep_clock, dep_thread, dep_atomic, dep_m], c_args : [c_msvc_compat_args, c_vis_args], build_by_default : false ) diff --git a/src/util/ralloc.c b/src/util/ralloc.c index 5d77f75ee85..5a7fa7e84e9 100644 --- a/src/util/ralloc.c +++ b/src/util/ralloc.c @@ -554,10 +554,18 @@ ralloc_vasprintf_rewrite_tail(char **str, size_t *start, const char *fmt, */ #define MIN_LINEAR_BUFSIZE 2048 -#define SUBALLOC_ALIGNMENT sizeof(uintptr_t) +#define SUBALLOC_ALIGNMENT 8 #define LMAGIC 0x87b9c7d3 -struct linear_header { +struct +#ifdef _MSC_VER + __declspec(align(8)) +#elif defined(__LP64__) + __attribute__((aligned(16))) +#else + __attribute__((aligned(8))) +#endif + linear_header { #ifdef DEBUG unsigned magic; /* for debugging */ #endif @@ -651,6 +659,8 @@ linear_alloc_child(void *parent, unsigned size) ptr = (linear_size_chunk *)((char*)&latest[1] + latest->offset); ptr->size = size; latest->offset += full_size; + + assert((uintptr_t)&ptr[1] % SUBALLOC_ALIGNMENT == 0); return &ptr[1]; } diff --git a/src/vulkan/Android.mk b/src/vulkan/Android.mk index 6253f1c3be9..730d036d18c 100644 --- a/src/vulkan/Android.mk +++ b/src/vulkan/Android.mk 
@@ -32,12 +32,15 @@ include $(LOCAL_PATH)/Makefile.sources include $(CLEAR_VARS) LOCAL_MODULE := libmesa_vulkan_util LOCAL_MODULE_CLASS := STATIC_LIBRARIES - +LOCAL_HEADER_LIBRARIES += libcutils_headers libsystem_headers intermediates := $(call local-generated-sources-dir) LOCAL_C_INCLUDES := \ $(MESA_TOP)/include/vulkan \ - $(MESA_TOP)/src/vulkan/util + $(MESA_TOP)/src/vulkan/util \ + frameworks/native/libs/nativebase/include \ + frameworks/native/libs/nativewindow/include \ + frameworks/native/libs/arect/include LOCAL_GENERATED_SOURCES := $(addprefix $(intermediates)/, \ $(VULKAN_UTIL_GENERATED_FILES)) diff --git a/src/vulkan/wsi/wsi_common.c b/src/vulkan/wsi/wsi_common.c index 1cd5f8d62c5..58e25214149 100644 --- a/src/vulkan/wsi/wsi_common.c +++ b/src/vulkan/wsi/wsi_common.c @@ -954,8 +954,8 @@ wsi_common_queue_present(const struct wsi_device *wsi, /* We only need/want to wait on semaphores once. After that, we're * guaranteed ordering since it all happens on the same queue. */ - submit_info.waitSemaphoreCount = pPresentInfo->waitSemaphoreCount, - submit_info.pWaitSemaphores = pPresentInfo->pWaitSemaphores, + submit_info.waitSemaphoreCount = pPresentInfo->waitSemaphoreCount; + submit_info.pWaitSemaphores = pPresentInfo->pWaitSemaphores; /* Set up the pWaitDstStageMasks */ stage_flags = vk_alloc(&swapchain->alloc, diff --git a/src/vulkan/wsi/wsi_common_display.c b/src/vulkan/wsi/wsi_common_display.c index fd0d30ad80c..856040b4fe1 100644 --- a/src/vulkan/wsi/wsi_common_display.c +++ b/src/vulkan/wsi/wsi_common_display.c @@ -1062,6 +1062,8 @@ wsi_display_swapchain_destroy(struct wsi_swapchain *drv_chain, for (uint32_t i = 0; i < chain->base.image_count; i++) wsi_display_image_finish(drv_chain, allocator, &chain->images[i]); + + wsi_swapchain_finish(&chain->base); vk_free(allocator, chain); return VK_SUCCESS; } diff --git a/src/vulkan/wsi/wsi_common_wayland.c b/src/vulkan/wsi/wsi_common_wayland.c index e9cc22ec603..3d3a60167bf 100644 --- 
a/src/vulkan/wsi/wsi_common_wayland.c +++ b/src/vulkan/wsi/wsi_common_wayland.c @@ -455,10 +455,11 @@ wsi_wl_get_presentation_support(struct wsi_device *wsi_device, (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND]; struct wsi_wl_display display; - int ret = wsi_wl_display_init(wsi, &display, wl_display, false); - wsi_wl_display_finish(&display); + VkResult ret = wsi_wl_display_init(wsi, &display, wl_display, false); + if (ret == VK_SUCCESS) + wsi_wl_display_finish(&display); - return ret == 0; + return ret == VK_SUCCESS; } static VkResult