Implement async data/subData.  Implement TriangleMeshBuilder in SimpleMesh which replaces TriangleMesh.  Update Film to use new builder.
diff --git a/libs/rs/java/Film/res/raw/filmstrip.c b/libs/rs/java/Film/res/raw/filmstrip.c
index 255d908..8f3d930 100644
--- a/libs/rs/java/Film/res/raw/filmstrip.c
+++ b/libs/rs/java/Film/res/raw/filmstrip.c
@@ -24,15 +24,15 @@
 
     float trans = Pos_translate;
     float rot = Pos_rotate;
+
     matrixLoadScale(mat1, 2.f, 2.f, 2.f);
     matrixTranslate(mat1, 0.f, 0.f, trans);
     matrixRotate(mat1, 90.f, 0.f, 0.f, 1.f);
     matrixRotate(mat1, rot, 1.f, 0.f, 0.f);
-    storeMatrix(3, 0, mat1);
+    vpLoadModelMatrix(mat1);
 
     // Draw the lighting effect in the strip and fill the Z buffer.
-    drawTriangleMesh(NAMED_mesh);
-
+    drawSimpleMesh(NAMED_mesh);
 
     // Start of images.
     bindProgramFragmentStore(NAMED_PSImages);
@@ -74,31 +74,21 @@
         pos = pos - 0.75f;
 
         offset = offset + triangleOffsetsCount / 2;
-
-    int drawit = 1;
-    if (offset < 0) {
-        drawit = 0;
-    }
-    if (offset >= triangleOffsetsCount) {
-        drawit = 0;
-    }
-
-        //if (!((offset < 0) || (offset >= triangleOffsetsCount))) {
-        if (drawit) {
+        if (!((offset < 0) || (offset >= triangleOffsetsCount))) {
             int start = offset -2;
             int end = offset + 2;
 
             if (start < 0) {
                 start = 0;
             }
-            if (end > triangleOffsetsCount) {
-                end = triangleOffsetsCount;
+            if (end >= triangleOffsetsCount) {
+                end = triangleOffsetsCount-1;
             }
 
             bindTexture(NAMED_PFImages, 0, loadI32(0, imgId - 1));
             matrixLoadTranslate(mat1, -pos - loadF(5, triangleOffsetsCount / 2), 0, 0);
             vpLoadTextureMatrix(mat1);
-            drawTriangleMeshRange(NAMED_mesh, loadI32(4, start), loadI32(4, end) - loadI32(4, start));
+            drawSimpleMeshRange(NAMED_mesh, loadI32(4, start), (loadI32(4, end) - loadI32(4, start)));
         }
     }
     return 0;
diff --git a/libs/rs/java/Film/src/com/android/film/FilmRS.java b/libs/rs/java/Film/src/com/android/film/FilmRS.java
index e6cd52d..cee827b 100644
--- a/libs/rs/java/Film/src/com/android/film/FilmRS.java
+++ b/libs/rs/java/Film/src/com/android/film/FilmRS.java
@@ -68,8 +68,6 @@
     private RenderScript mRS;
     private Script mScriptStrip;
     private Script mScriptImage;
-    private Element mElementVertex;
-    private Element mElementIndex;
     private Sampler mSampler;
     private ProgramStore mPSBackground;
     private ProgramStore mPSImages;
@@ -88,7 +86,7 @@
     private Allocation mAllocOffsetsTex;
     private Allocation mAllocOffsets;
 
-    private RenderScript.TriangleMesh mMesh;
+    private SimpleMesh mMesh;
     private Light mLight;
 
     private FilmStripMesh mFSM;
@@ -186,7 +184,6 @@
                 mip++;
                 a.setConstraint(Dimension.LOD, mip);
             }
-            a.destroy();
 
             mImages[ct].uploadToTexture(1);
             mBufferIDs[ct] = mImages[ct].getID();
@@ -204,13 +201,8 @@
     }
 
     private void initRS() {
-        mElementVertex = Element.NORM_ST_XYZ_F32;
-        mElementIndex = Element.INDEX_16;
-
-        mRS.triangleMeshBegin(mElementVertex, mElementIndex);
         mFSM = new FilmStripMesh();
-        mFSM.init(mRS);
-        mMesh = mRS.triangleMeshCreate();
+        mMesh = mFSM.init(mRS);
         mMesh.setName("mesh");
 
         initPFS();
diff --git a/libs/rs/java/Film/src/com/android/film/FilmStripMesh.java b/libs/rs/java/Film/src/com/android/film/FilmStripMesh.java
index 02bffd8..64aac26 100644
--- a/libs/rs/java/Film/src/com/android/film/FilmStripMesh.java
+++ b/libs/rs/java/Film/src/com/android/film/FilmStripMesh.java
@@ -22,6 +22,7 @@
 import android.util.Log;
 
 import android.renderscript.RenderScript;
+import android.renderscript.SimpleMesh;
 
 
 class FilmStripMesh {
@@ -72,27 +73,23 @@
             dx /= len;
             dy /= len;
             dz /= len;
-        
+
             nx = dx * dz;
             ny = dy * dz;
             nz = (float)java.lang.Math.sqrt(dx*dx + dy*dy);
-        
+
             len = (float)java.lang.Math.sqrt(nx*nx + ny*ny + nz*nz);
             nx /= len;
             ny /= len;
             nz /= len;
         }
-
-        void addToRS(RenderScript rs) {
-            rs.triangleMeshAddVertex_XYZ_ST_NORM(x, y, z, s, t, nx, ny, nz);
-        }
     }
 
     int[] mTriangleOffsets;
     float[] mTriangleOffsetsTex;
     int mTriangleOffsetsCount;
 
-    void init(RenderScript rs)
+    SimpleMesh init(RenderScript rs)
     {
         float vtx[] = new float[] {
             60.431003f, 124.482050f,
@@ -203,11 +200,11 @@
              -60.862074f, 120.872604f,
              -60.431003f, 124.482050f
         };
-    
-    
+
+
         mTriangleOffsets = new int[64];
         mTriangleOffsetsTex = new float[64];
-    
+
         mTriangleOffsets[0] = 0;
         mTriangleOffsetsCount = 1;
 
@@ -215,6 +212,8 @@
         t.nxyz(1, 0, 0);
         int count = vtx.length / 2;
 
+        SimpleMesh.TriangleMeshBuilder tm = new SimpleMesh.TriangleMeshBuilder(rs, 3, true, true);
+
         float runningS = 0;
         for (int ct=0; ct < (count-1); ct++) {
             t.x = -vtx[ct*2] / 100.f;
@@ -228,16 +227,15 @@
             t.ny /= len;
             t.y = -0.5f;
             t.t = 0;
-            //Log.e("xx", "vtx " + t.x + "  " + t.y + "  " + t.z);
-            t.addToRS(rs);
+            tm.add_XYZ_ST_NORM(t.x, t.y, t.z, t.s, t.t, t.nx, t.ny, t.nz);
+            //android.util.Log.e("rs", "vtx x="+t.x+" y="+t.y+" z="+t.z+" s="+t.s+" t="+t.t);
             t.y = .5f;
             t.t = 1;
-            t.addToRS(rs);
+            tm.add_XYZ_ST_NORM(t.x, t.y, t.z, t.s, t.t, t.nx, t.ny, t.nz);
+            //android.util.Log.e("rs", "vtx x="+t.x+" y="+t.y+" z="+t.z+" s="+t.s+" t="+t.t);
 
-            //LOGE(" %f", runningS);
             if((runningS*2) > mTriangleOffsetsCount) {
-                //LOGE("**** img %i  %i", gTriangleOffsetsCount, ct*2);
-                mTriangleOffsets[mTriangleOffsetsCount] = ct*2;
+                mTriangleOffsets[mTriangleOffsetsCount] = ct*2 * 3;
                 mTriangleOffsetsTex[mTriangleOffsetsCount] = t.s;
                 mTriangleOffsetsCount ++;
             }
@@ -245,9 +243,10 @@
 
         count = (count * 2 - 2);
         for (int ct=0; ct < (count-2); ct+= 2) {
-            rs.triangleMeshAddTriangle(ct, ct+1, ct+2);
-            rs.triangleMeshAddTriangle(ct+1, ct+3, ct+2);
+            tm.addTriangle(ct, ct+1, ct+2);
+            tm.addTriangle(ct+1, ct+3, ct+2);
         }
+        return tm.create();
     }
 
 
diff --git a/libs/rs/rs.spec b/libs/rs/rs.spec
index e275f27..cb4dd00 100644
--- a/libs/rs/rs.spec
+++ b/libs/rs/rs.spec
@@ -130,6 +130,9 @@
 AllocationData {
 	param RsAllocation va
 	param const void * data
+	param uint32_t bytes
+	handcodeApi
+	togglePlay
 	}
 
 Allocation1DSubData {
@@ -137,6 +140,9 @@
 	param uint32_t xoff
 	param uint32_t count
 	param const void *data
+	param uint32_t bytes
+	handcodeApi
+	togglePlay
 	}
 
 Allocation2DSubData {
@@ -146,6 +152,7 @@
 	param uint32_t w
 	param uint32_t h
 	param const void *data
+	param uint32_t bytes
 	}
 
 AllocationRead {
diff --git a/libs/rs/rsAdapter.cpp b/libs/rs/rsAdapter.cpp
index 3242e11..d20e910 100644
--- a/libs/rs/rsAdapter.cpp
+++ b/libs/rs/rsAdapter.cpp
@@ -72,7 +72,7 @@
 RsAdapter1D rsi_Adapter1DCreate(Context *rsc)
 {
     Adapter1D *a = new Adapter1D();
-    a->incRef();
+    a->incUserRef();
     return a;
 }
 
@@ -185,7 +185,7 @@
 RsAdapter2D rsi_Adapter2DCreate(Context *rsc)
 {
     Adapter2D *a = new Adapter2D();
-    a->incRef();
+    a->incUserRef();
     return a;
 }
 
diff --git a/libs/rs/rsAllocation.cpp b/libs/rs/rsAllocation.cpp
index 3cb76bc..1f49ca1 100644
--- a/libs/rs/rsAllocation.cpp
+++ b/libs/rs/rsAllocation.cpp
@@ -115,9 +115,14 @@
 }
 
 
-void Allocation::data(const void *data)
+void Allocation::data(const void *data, uint32_t sizeBytes)
 {
-    memcpy(mPtr, data, mType->getSizeBytes());
+    uint32_t size = mType->getSizeBytes();
+    if (size != sizeBytes) {
+        LOGE("Allocation::data called with mismatched size expected %i, got %i", size, sizeBytes);
+        return;
+    }
+    memcpy(mPtr, data, size);
 }
 
 void Allocation::read(void *data)
@@ -125,16 +130,22 @@
     memcpy(data, mPtr, mType->getSizeBytes());
 }
 
-void Allocation::subData(uint32_t xoff, uint32_t count, const void *data)
+void Allocation::subData(uint32_t xoff, uint32_t count, const void *data, uint32_t sizeBytes)
 {
     uint32_t eSize = mType->getElementSizeBytes();
     uint8_t * ptr = static_cast<uint8_t *>(mPtr);
     ptr += eSize * xoff;
-    memcpy(ptr, data, count * eSize);
+    uint32_t size = count * eSize;
+
+    if (size != sizeBytes) {
+        LOGE("Allocation::subData called with mismatched size expected %i, got %i", size, sizeBytes);
+        return;
+    }
+    memcpy(ptr, data, size);
 }
 
 void Allocation::subData(uint32_t xoff, uint32_t yoff,
-             uint32_t w, uint32_t h, const void *data)
+             uint32_t w, uint32_t h, const void *data, uint32_t sizeBytes)
 {
     uint32_t eSize = mType->getElementSizeBytes();
     uint32_t lineSize = eSize * w;
@@ -143,6 +154,12 @@
     const uint8_t *src = static_cast<const uint8_t *>(data);
     uint8_t *dst = static_cast<uint8_t *>(mPtr);
     dst += eSize * (xoff + yoff * destW);
+
+    if ((lineSize * h) != sizeBytes) {  // lineSize is already eSize * w; the loop below copies lineSize bytes for each of h rows
+        rsAssert(!"Allocation::subData called with mismatched size");
+        return;
+    }
+
     for (uint32_t line=yoff; line < (yoff+h); line++) {
         uint8_t * ptr = static_cast<uint8_t *>(mPtr);
         memcpy(dst, src, lineSize);
@@ -152,7 +169,7 @@
 }
 
 void Allocation::subData(uint32_t xoff, uint32_t yoff, uint32_t zoff,
-             uint32_t w, uint32_t h, uint32_t d, const void *data)
+             uint32_t w, uint32_t h, uint32_t d, const void *data, uint32_t sizeBytes)
 {
 }
 
@@ -170,7 +187,7 @@
     const Type * type = static_cast<const Type *>(vtype);
 
     Allocation * alloc = new Allocation(type);
-    alloc->incRef();
+    alloc->incUserRef();
     return alloc;
 }
 
@@ -340,7 +357,7 @@
         LOGE("Memory allocation failure");
         return NULL;
     }
-    texAlloc->incRef();
+    texAlloc->incUserRef();
 
     ElementConverter_t cvt = pickConverter(dstFmt, srcFmt);
     cvt(texAlloc->getPtr(), data, w * h);
@@ -451,7 +468,7 @@
 
     RsAllocation vTexAlloc = rsi_AllocationCreateTyped(rsc, type);
     Allocation *texAlloc = static_cast<Allocation *>(vTexAlloc);
-    texAlloc->incRef();
+    texAlloc->incUserRef();
     if (texAlloc == NULL) {
         LOGE("Memory allocation failure");
         fclose(f);
@@ -503,24 +520,24 @@
     return texAlloc;
 }
 
-void rsi_AllocationData(Context *rsc, RsAllocation va, const void *data)
+void rsi_AllocationData(Context *rsc, RsAllocation va, const void *data, uint32_t sizeBytes)
 {
     Allocation *a = static_cast<Allocation *>(va);
-    a->data(data);
+    a->data(data, sizeBytes);
     rsc->allocationCheck(a);
 }
 
-void rsi_Allocation1DSubData(Context *rsc, RsAllocation va, uint32_t xoff, uint32_t count, const void *data)
+void rsi_Allocation1DSubData(Context *rsc, RsAllocation va, uint32_t xoff, uint32_t count, const void *data, uint32_t sizeBytes)
 {
     Allocation *a = static_cast<Allocation *>(va);
-    a->subData(xoff, count, data);
+    a->subData(xoff, count, data, sizeBytes);
     rsc->allocationCheck(a);
 }
 
-void rsi_Allocation2DSubData(Context *rsc, RsAllocation va, uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h, const void *data)
+void rsi_Allocation2DSubData(Context *rsc, RsAllocation va, uint32_t xoff, uint32_t yoff, uint32_t w, uint32_t h, const void *data, uint32_t sizeBytes)
 {
     Allocation *a = static_cast<Allocation *>(va);
-    a->subData(xoff, yoff, w, h, data);
+    a->subData(xoff, yoff, w, h, data, sizeBytes);
     rsc->allocationCheck(a);
 }
 
diff --git a/libs/rs/rsAllocation.h b/libs/rs/rsAllocation.h
index 00af9ed..1f58ec5 100644
--- a/libs/rs/rsAllocation.h
+++ b/libs/rs/rsAllocation.h
@@ -53,12 +53,12 @@
     uint32_t getBufferObjectID() const {return mBufferID;}
 
 
-    void data(const void *data);
-    void subData(uint32_t xoff, uint32_t count, const void *data);
+    void data(const void *data, uint32_t sizeBytes);
+    void subData(uint32_t xoff, uint32_t count, const void *data, uint32_t sizeBytes);
     void subData(uint32_t xoff, uint32_t yoff,
-                 uint32_t w, uint32_t h, const void *data);
+                 uint32_t w, uint32_t h, const void *data, uint32_t sizeBytes);
     void subData(uint32_t xoff, uint32_t yoff, uint32_t zoff,
-                 uint32_t w, uint32_t h, uint32_t d, const void *data);
+                 uint32_t w, uint32_t h, uint32_t d, const void *data, uint32_t sizeBytes);
 
     void read(void *data);
 
diff --git a/libs/rs/rsContext.cpp b/libs/rs/rsContext.cpp
index 52c2b78..c28bd02 100644
--- a/libs/rs/rsContext.cpp
+++ b/libs/rs/rsContext.cpp
@@ -45,6 +45,7 @@
         configAttribsPtr[1] = 16;
         configAttribsPtr += 2;
     }
+
     configAttribsPtr[0] = EGL_NONE;
     rsAssert(configAttribsPtr < (configAttribs + (sizeof(configAttribs) / sizeof(EGLint))));
 
@@ -53,7 +54,7 @@
 
     status_t err = EGLUtils::selectConfigForNativeWindow(mEGL.mDisplay, configAttribs, mWndSurface, &mEGL.mConfig);
     if (err) {
-     LOGE("couldn't find an EGLConfig matching the screen format\n");
+       LOGE("couldn't find an EGLConfig matching the screen format\n");
     }
     //eglChooseConfig(mEGL.mDisplay, configAttribs, &mEGL.mConfig, 1, &mEGL.mNumConfigs);
 
@@ -76,11 +77,11 @@
     mGL.mRenderer = glGetString(GL_RENDERER);
     mGL.mExtensions = glGetString(GL_EXTENSIONS);
 
-    //LOGV("EGL Version %i %i", mEGL.mMajorVersion, mEGL.mMinorVersion);
-    //LOGV("GL Version %s", mGL.mVersion);
-    //LOGV("GL Vendor %s", mGL.mVendor);
-    //LOGV("GL Renderer %s", mGL.mRenderer);
-    //LOGV("GL Extensions %s", mGL.mExtensions);
+    LOGV("EGL Version %i %i", mEGL.mMajorVersion, mEGL.mMinorVersion);
+    LOGV("GL Version %s", mGL.mVersion);
+    LOGV("GL Vendor %s", mGL.mVendor);
+    LOGV("GL Renderer %s", mGL.mRenderer);
+    LOGV("GL Extensions %s", mGL.mExtensions);
 
     if ((strlen((const char *)mGL.mVersion) < 12) || memcmp(mGL.mVersion, "OpenGL ES-CM", 12)) {
         LOGE("Error, OpenGL ES Lite not supported");
@@ -432,7 +433,7 @@
         }
 
         for (size_t ct = 0; ct < mObjDestroy.mDestroyList.size(); ct++) {
-            mObjDestroy.mDestroyList[ct]->decRef();
+            mObjDestroy.mDestroyList[ct]->decUserRef();
         }
         mObjDestroy.mDestroyList.clear();
         mObjDestroy.mNeedToEmpty = false;
@@ -522,7 +523,7 @@
 {
     ObjectBase *ob = static_cast<ObjectBase *>(obj);
     rsc->removeName(ob);
-    ob->decRef();
+    ob->decUserRef();
 }
 
 void rsi_ContextSetDefineF(Context *rsc, const char* name, float value)
diff --git a/libs/rs/rsElement.cpp b/libs/rs/rsElement.cpp
index 389b2c0..6794522 100644
--- a/libs/rs/rsElement.cpp
+++ b/libs/rs/rsElement.cpp
@@ -215,7 +215,7 @@
     rsAssert(!mComponents[idx].get());
     rsAssert(idx < mComponentCount);
     mComponents[idx].set(c);
-    c->incRef();
+    c->incUserRef();
 }
 
 
@@ -387,7 +387,7 @@
 
     rsAssert(sec->mPredefinedList[predef].mEnum == predef);
     Element * e = sec->mPredefinedList[predef].mElement;
-    e->incRef();
+    e->incUserRef();
     return e;
 }
 
@@ -412,7 +412,7 @@
     }
 
     rsc->mStateElement.mComponentBuildList.clear();
-    se->incRef();
+    se->incUserRef();
     return se;
 }
 
diff --git a/libs/rs/rsHandcode.h b/libs/rs/rsHandcode.h
new file mode 100644
index 0000000..800eddd
--- /dev/null
+++ b/libs/rs/rsHandcode.h
@@ -0,0 +1,47 @@
+
+#define DATA_SYNC_SIZE 1024  // payloads smaller than this many bytes are copied inline behind the command; others keep the caller's pointer
+
+static inline void rsHCAPI_AllocationData (RsContext rsc, RsAllocation va, const void * data, uint32_t sizeBytes)  // hand-written API side of AllocationData: queues the command on the context's mToCore
+{
+    ThreadIO *io = &((Context *)rsc)->mIO;
+    uint32_t size = sizeof(RS_CMD_AllocationData);
+    if (sizeBytes < DATA_SYNC_SIZE) {
+        size += (sizeBytes + 3) & ~3;  // extra room for the inline payload, rounded up to a multiple of 4 bytes
+    }
+    RS_CMD_AllocationData *cmd = static_cast<RS_CMD_AllocationData *>(io->mToCore.reserve(size));
+    cmd->va = va;
+    cmd->bytes = sizeBytes;
+    cmd->data = data;  // default: reference the caller's buffer; overridden below for small payloads
+    if (sizeBytes < DATA_SYNC_SIZE) {
+        cmd->data = (void *)(cmd+1);  // point at the copy stored directly after the command struct
+        memcpy(cmd+1, data, sizeBytes);
+        io->mToCore.commit(RS_CMD_ID_AllocationData, size);
+    } else {
+        io->mToCore.commitSync(RS_CMD_ID_AllocationData, size);  // NOTE(review): presumably waits for the command to be consumed so the caller's buffer stays valid -- confirm
+    }
+}
+
+
+static inline void rsHCAPI_Allocation1DSubData (RsContext rsc, RsAllocation va, uint32_t xoff, uint32_t count, const void * data, uint32_t sizeBytes)  // hand-written API side of Allocation1DSubData: queues the command on the context's mToCore
+{
+    ThreadIO *io = &((Context *)rsc)->mIO;
+    uint32_t size = sizeof(RS_CMD_Allocation1DSubData);
+    if (sizeBytes < DATA_SYNC_SIZE) {
+        size += (sizeBytes + 3) & ~3;  // extra room for the inline payload, rounded up to a multiple of 4 bytes
+    }
+    RS_CMD_Allocation1DSubData *cmd = static_cast<RS_CMD_Allocation1DSubData *>(io->mToCore.reserve(size));
+    cmd->va = va;
+    cmd->xoff = xoff;
+    cmd->count = count;
+    cmd->data = data;  // default: reference the caller's buffer; overridden below for small payloads
+    cmd->bytes = sizeBytes;
+    if (sizeBytes < DATA_SYNC_SIZE) {
+        cmd->data = (void *)(cmd+1);  // point at the copy stored directly after the command struct
+        memcpy(cmd+1, data, sizeBytes);
+        io->mToCore.commit(RS_CMD_ID_Allocation1DSubData, size);
+    } else {
+        io->mToCore.commitSync(RS_CMD_ID_Allocation1DSubData, size);  // NOTE(review): presumably waits for the command to be consumed so the caller's buffer stays valid -- confirm
+    }
+
+}
+
diff --git a/libs/rs/rsLight.cpp b/libs/rs/rsLight.cpp
index f780e52..ad06c1f 100644
--- a/libs/rs/rsLight.cpp
+++ b/libs/rs/rsLight.cpp
@@ -106,7 +106,7 @@
 {
     Light *l = new Light(rsc->mStateLight.mIsLocal,
                          rsc->mStateLight.mIsMono);
-    l->incRef();
+    l->incUserRef();
     return l;
 }
 
diff --git a/libs/rs/rsObjectBase.cpp b/libs/rs/rsObjectBase.cpp
index 07bbc1e..7e7afab 100644
--- a/libs/rs/rsObjectBase.cpp
+++ b/libs/rs/rsObjectBase.cpp
@@ -21,28 +21,51 @@
 
 ObjectBase::ObjectBase()
 {
-    mRefCount = 0;
+    mUserRefCount = 0;
+    mSysRefCount = 0;
     mName = NULL;
 }
 
 ObjectBase::~ObjectBase()
 {
     //LOGV("~ObjectBase %p  ref %i", this, mRefCount);
-    rsAssert(!mRefCount);
+    rsAssert(!mUserRefCount);
+    rsAssert(!mSysRefCount);
 }
 
-void ObjectBase::incRef() const
+void ObjectBase::incUserRef() const
 {
-    mRefCount ++;
+    mUserRefCount ++;
     //LOGV("ObjectBase %p inc ref %i", this, mRefCount);
 }
 
-void ObjectBase::decRef() const
+void ObjectBase::incSysRef() const
 {
-    rsAssert(mRefCount > 0);
-    mRefCount --;
+    mSysRefCount ++;
+    //LOGV("ObjectBase %p inc ref %i", this, mRefCount);
+}
+
+void ObjectBase::decUserRef() const
+{
+    rsAssert(mUserRefCount > 0);
+    mUserRefCount --;
     //LOGV("ObjectBase %p dec ref %i", this, mRefCount);
-    if (!mRefCount) {
+    if (!(mSysRefCount | mUserRefCount)) {
+        if (mName) {
+            LOGV("Deleting RS object %p, name %s", this, mName);
+        } else {
+            LOGV("Deleting RS object %p, no name", this);
+        }
+        delete this;
+    }
+}
+
+void ObjectBase::decSysRef() const
+{
+    rsAssert(mSysRefCount > 0);
+    mSysRefCount --;
+    //LOGV("ObjectBase %p dec ref %i", this, mRefCount);
+    if (!(mSysRefCount | mUserRefCount)) {
         if (mName) {
             LOGV("Deleting RS object %p, name %s", this, mName);
         } else {
diff --git a/libs/rs/rsObjectBase.h b/libs/rs/rsObjectBase.h
index b2c3338..d1e6baa 100644
--- a/libs/rs/rsObjectBase.h
+++ b/libs/rs/rsObjectBase.h
@@ -30,8 +30,11 @@
     ObjectBase();
     virtual ~ObjectBase();
 
-    void incRef() const;
-    void decRef() const;
+    void incSysRef() const;
+    void decSysRef() const;
+
+    void incUserRef() const;
+    void decUserRef() const;
 
     const char * getName() const {
         return mName;
@@ -41,13 +44,14 @@
 
 private:
     char * mName;
-    mutable int32_t mRefCount;
+    mutable int32_t mSysRefCount;
+    mutable int32_t mUserRefCount;
 
 
 };
 
-template<class T> 
-class ObjectBaseRef 
+template<class T>
+class ObjectBaseRef
 {
 public:
     ObjectBaseRef() {
@@ -57,14 +61,14 @@
     ObjectBaseRef(const ObjectBaseRef &ref) {
         mRef = ref.get();
         if (mRef) {
-            mRef->incRef();
+            mRef->incSysRef();
         }
     }
 
     ObjectBaseRef(T *ref) {
         mRef = ref;
         if (mRef) {
-            ref->incRef();
+            ref->incSysRef();
         }
     }
 
@@ -77,7 +81,7 @@
             clear();
             mRef = ref;
             if (mRef) {
-                ref->incRef();
+                ref->incSysRef();
             }
         }
     }
@@ -88,7 +92,7 @@
 
     void clear() {
         if (mRef) {
-            mRef->decRef();
+            mRef->decSysRef();
         }
         mRef = NULL;
     }
@@ -97,8 +101,8 @@
         return mRef;
     }
 
-    inline T * operator-> () const { 
-        return mRef;  
+    inline T * operator-> () const {
+        return mRef;
     }
 
 protected:
diff --git a/libs/rs/rsProgramFragment.cpp b/libs/rs/rsProgramFragment.cpp
index 654974f..0adce75 100644
--- a/libs/rs/rsProgramFragment.cpp
+++ b/libs/rs/rsProgramFragment.cpp
@@ -227,7 +227,7 @@
 RsProgramFragment rsi_ProgramFragmentCreate(Context *rsc)
 {
     ProgramFragment *pf = rsc->mStateFragment.mPF;
-    pf->incRef();
+    pf->incUserRef();
     rsc->mStateFragment.mPF = 0;
     return pf;
 }
diff --git a/libs/rs/rsProgramFragmentStore.cpp b/libs/rs/rsProgramFragmentStore.cpp
index 36ec615..3179484 100644
--- a/libs/rs/rsProgramFragmentStore.cpp
+++ b/libs/rs/rsProgramFragmentStore.cpp
@@ -251,7 +251,7 @@
 RsProgramFragmentStore rsi_ProgramFragmentStoreCreate(Context *rsc)
 {
     ProgramFragmentStore *pfs = rsc->mStateFragmentStore.mPFS;
-    pfs->incRef();
+    pfs->incUserRef();
     rsc->mStateFragmentStore.mPFS = 0;
     return pfs;
 }
diff --git a/libs/rs/rsProgramVertex.cpp b/libs/rs/rsProgramVertex.cpp
index dc57d34..a07e166 100644
--- a/libs/rs/rsProgramVertex.cpp
+++ b/libs/rs/rsProgramVertex.cpp
@@ -143,10 +143,10 @@
 
     Matrix m;
     m.loadOrtho(0,w, h,0, -1,1);
-    alloc->subData(RS_PROGRAM_VERTEX_PROJECTION_OFFSET, 16, &m.m[0]);
+    alloc->subData(RS_PROGRAM_VERTEX_PROJECTION_OFFSET, 16, &m.m[0], 16*4);
 
     m.loadIdentity();
-    alloc->subData(RS_PROGRAM_VERTEX_MODELVIEW_OFFSET, 16, &m.m[0]);
+    alloc->subData(RS_PROGRAM_VERTEX_MODELVIEW_OFFSET, 16, &m.m[0], 16*4);
 }
 
 
@@ -162,7 +162,7 @@
 RsProgramVertex rsi_ProgramVertexCreate(Context *rsc)
 {
     ProgramVertex *pv = rsc->mStateVertex.mPV;
-    pv->incRef();
+    pv->incUserRef();
     rsc->mStateVertex.mPV = 0;
     return pv;
 }
diff --git a/libs/rs/rsSampler.cpp b/libs/rs/rsSampler.cpp
index 332d532..3f56faa 100644
--- a/libs/rs/rsSampler.cpp
+++ b/libs/rs/rsSampler.cpp
@@ -143,7 +143,7 @@
                               ss->mWrapS,
                               ss->mWrapT,
                               ss->mWrapR);
-    s->incRef();
+    s->incUserRef();
     return s;
 }
 
diff --git a/libs/rs/rsScriptC.cpp b/libs/rs/rsScriptC.cpp
index 9419829..0c7ac18 100644
--- a/libs/rs/rsScriptC.cpp
+++ b/libs/rs/rsScriptC.cpp
@@ -334,7 +334,7 @@
     ss->runCompiler(rsc);
 
     ScriptC *s = new ScriptC();
-    s->incRef();
+    s->incUserRef();
     s->mAccScript = ss->mAccScript;
     ss->mAccScript = NULL;
     s->mEnviroment = ss->mEnviroment;
diff --git a/libs/rs/rsSimpleMesh.cpp b/libs/rs/rsSimpleMesh.cpp
index 0b745eb..7c73eb9 100644
--- a/libs/rs/rsSimpleMesh.cpp
+++ b/libs/rs/rsSimpleMesh.cpp
@@ -67,7 +67,7 @@
 
     if (mIndexType.get()) {
         glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, mIndexBuffer->getBufferObjectID());
-        glDrawElements(mGLPrimitive, len, GL_UNSIGNED_SHORT, (GLvoid *)(start * 2));
+        glDrawElements(mGLPrimitive, len, GL_UNSIGNED_SHORT, (uint16_t *)(start * 2));
     } else {
         glDrawArrays(mGLPrimitive, start, len);
     }
@@ -91,7 +91,7 @@
 RsSimpleMesh rsi_SimpleMeshCreate(Context *rsc, RsType prim, RsType idx, RsType *vtx, uint32_t vtxCount, uint32_t primType)
 {
     SimpleMesh *sm = new SimpleMesh();
-    sm->incRef();
+    sm->incUserRef();
 
     sm->mIndexType.set((const Type *)idx);
     sm->mPrimitiveType.set((const Type *)prim);
diff --git a/libs/rs/rsTriangleMesh.cpp b/libs/rs/rsTriangleMesh.cpp
index 99f8adb..64bb71b 100644
--- a/libs/rs/rsTriangleMesh.cpp
+++ b/libs/rs/rsTriangleMesh.cpp
@@ -199,7 +199,7 @@
     memcpy(tm->mIndexData, tmc->mIndexData.array(), tm->mIndexDataSize);
     tm->analyzeElement();
 
-    tm->incRef();
+    tm->incUserRef();
     return tm;
 }
 
diff --git a/libs/rs/rsType.cpp b/libs/rs/rsType.cpp
index 5a9090e..1838fa6 100644
--- a/libs/rs/rsType.cpp
+++ b/libs/rs/rsType.cpp
@@ -252,6 +252,7 @@
 
     uint32_t stride = mElement->getSizeBytes();
     if (mGL.mVtx.size) {
+        //LOGE("va vtx %i %x, %i, %p", mGL.mVtx.size, mGL.mVtx.type, stride, (void *)mGL.mVtx.offset);
         glEnableClientState(GL_VERTEX_ARRAY);
         glVertexPointer(mGL.mVtx.size,
                         mGL.mVtx.type,
@@ -260,9 +261,10 @@
     }
 
     if (mGL.mNorm.size) {
+        //LOGE("va norm %i %x, %i, %p", mGL.mNorm.size, mGL.mNorm.type, stride, (void *)mGL.mNorm.offset);
         glEnableClientState(GL_NORMAL_ARRAY);
         rsAssert(mGL.mNorm.size == 3);
-        glNormalPointer(mGL.mNorm.size,
+        glNormalPointer(mGL.mNorm.type,
                         stride,
                         (void *)mGL.mNorm.offset);
     }
@@ -277,6 +279,7 @@
 
     for (uint32_t ct=0; ct < RS_MAX_TEXTURE; ct++) {
         if (mGL.mTex[ct].size) {
+            //LOGE("va tex%i %i %x, %i, %p", ct, mGL.mTex[ct].size, mGL.mTex[ct].type, stride, (void *)mGL.mTex[ct].offset);
             glClientActiveTexture(GL_TEXTURE0 + ct);
             glEnableClientState(GL_TEXTURE_COORD_ARRAY);
             glTexCoordPointer(mGL.mTex[ct].size,
@@ -361,7 +364,7 @@
     TypeState * stc = &rsc->mStateType;
 
     Type * st = new Type();
-    st->incRef();
+    st->incUserRef();
     st->setDimX(stc->mX);
     st->setDimY(stc->mY);
     st->setDimZ(stc->mZ);
diff --git a/libs/rs/rsg_generator.c b/libs/rs/rsg_generator.c
index e3f816f..74ba248 100644
--- a/libs/rs/rsg_generator.c
+++ b/libs/rs/rsg_generator.c
@@ -141,6 +141,7 @@
     fprintf(f, "\n");
     fprintf(f, "using namespace android;\n");
     fprintf(f, "using namespace android::renderscript;\n");
+    fprintf(f, "#include \"rsHandcode.h\"\n");
     fprintf(f, "\n");
 
     for(ct=0; ct < apiCount; ct++) {
@@ -149,30 +150,39 @@
 
         printFuncDecl(f, api, "rs", 0);
         fprintf(f, "\n{\n");
-        fprintf(f, "    ThreadIO *io = &((Context *)rsc)->mIO;\n");
-        //fprintf(f, "    LOGE(\"add command %s\\n\");\n", api->name);
-        fprintf(f, "    RS_CMD_%s *cmd = static_cast<RS_CMD_%s *>(io->mToCore.reserve(sizeof(RS_CMD_%s)));\n", api->name, api->name, api->name);
-        fprintf(f, "    uint32_t size = sizeof(RS_CMD_%s);\n", api->name);
+        if (api->handcodeApi) {
+            fprintf(f, "    rsHCAPI_%s(rsc", api->name);
+            for(ct2=0; ct2 < api->paramCount; ct2++) {
+                const VarType *vt = &api->params[ct2];
+                fprintf(f, ", %s", vt->name);
+            }
+            fprintf(f, ");\n");
+        } else {
+            fprintf(f, "    ThreadIO *io = &((Context *)rsc)->mIO;\n");
+            //fprintf(f, "    LOGE(\"add command %s\\n\");\n", api->name);
+            fprintf(f, "    RS_CMD_%s *cmd = static_cast<RS_CMD_%s *>(io->mToCore.reserve(sizeof(RS_CMD_%s)));\n", api->name, api->name, api->name);
+            fprintf(f, "    uint32_t size = sizeof(RS_CMD_%s);\n", api->name);
 
-        for(ct2=0; ct2 < api->paramCount; ct2++) {
-            const VarType *vt = &api->params[ct2];
-            needFlush += vt->ptrLevel;
-            fprintf(f, "    cmd->%s = %s;\n", vt->name, vt->name);
-        }
-        if (api->ret.typeName[0]) {
-            needFlush = 1;
-        }
+            for(ct2=0; ct2 < api->paramCount; ct2++) {
+                const VarType *vt = &api->params[ct2];
+                needFlush += vt->ptrLevel;
+                fprintf(f, "    cmd->%s = %s;\n", vt->name, vt->name);
+            }
+            if (api->ret.typeName[0]) {
+                needFlush = 1;
+            }
 
-        fprintf(f, "    io->mToCore.commit");
-        if (needFlush) {
-            fprintf(f, "Sync");
-        }
-        fprintf(f, "(RS_CMD_ID_%s, size);\n", api->name);
+            fprintf(f, "    io->mToCore.commit");
+            if (needFlush) {
+                fprintf(f, "Sync");
+            }
+            fprintf(f, "(RS_CMD_ID_%s, size);\n", api->name);
 
-        if (api->ret.typeName[0]) {
-            fprintf(f, "    return reinterpret_cast<");
-            printVarType(f, &api->ret);
-            fprintf(f, ">(io->mToCoreRet);\n");
+            if (api->ret.typeName[0]) {
+                fprintf(f, "    return reinterpret_cast<");
+                printVarType(f, &api->ret);
+                fprintf(f, ">(io->mToCoreRet);\n");
+            }
         }
         fprintf(f, "};\n\n");
     }
@@ -191,6 +201,7 @@
     fprintf(f, "\n");
     fprintf(f, "namespace android {\n");
     fprintf(f, "namespace renderscript {\n");
+    fprintf(f, "#include \"rsHandcode.h\"\n");
     fprintf(f, "\n");
 
     for(ct=0; ct < apiCount; ct++) {
@@ -198,20 +209,22 @@
 
         fprintf(f, "void rsp_%s(Context *con, const void *vp)\n", api->name);
         fprintf(f, "{\n");
-        //fprintf(f, "    LOGE(\"play command %s\\n\");\n", api->name);
-        fprintf(f, "    const RS_CMD_%s *cmd = static_cast<const RS_CMD_%s *>(vp);\n", api->name, api->name);
-        fprintf(f, "    ");
-        if (api->ret.typeName[0]) {
-            fprintf(f, "con->mIO.mToCoreRet = (intptr_t)");
+        if (api->handcodePlay) {
+            fprintf(f, "    rsHCPLAY_%s(con, vp);\n", api->name);
+        } else {
+            //fprintf(f, "    LOGE(\"play command %s\\n\");\n", api->name);
+            fprintf(f, "    const RS_CMD_%s *cmd = static_cast<const RS_CMD_%s *>(vp);\n", api->name, api->name);
+            fprintf(f, "    ");
+            if (api->ret.typeName[0]) {
+                fprintf(f, "con->mIO.mToCoreRet = (intptr_t)");
+            }
+            fprintf(f, "rsi_%s(con", api->name);
+            for(ct2=0; ct2 < api->paramCount; ct2++) {
+                const VarType *vt = &api->params[ct2];
+                fprintf(f, ",\n           cmd->%s", vt->name);
+            }
+            fprintf(f, ");\n");
         }
-        fprintf(f, "rsi_%s(con", api->name);
-        for(ct2=0; ct2 < api->paramCount; ct2++) {
-            const VarType *vt = &api->params[ct2];
-            fprintf(f, ",");
-            fprintf(f, "\n           cmd->%s", vt->name);
-        }
-        fprintf(f, ");\n");
-
         fprintf(f, "};\n\n");
     }
 
diff --git a/libs/rs/spec.h b/libs/rs/spec.h
index ba802f7..82650a7 100644
--- a/libs/rs/spec.h
+++ b/libs/rs/spec.h
@@ -24,6 +24,8 @@
 typedef struct {
   char name[256];
   int sync;
+  int handcodeApi;
+  int handcodePlay;
   int paramCount;
   VarType ret;
   VarType params[16];
diff --git a/libs/rs/spec.l b/libs/rs/spec.l
index 62fcb63..d81d47e 100644
--- a/libs/rs/spec.l
+++ b/libs/rs/spec.l
@@ -47,6 +47,14 @@
     apis[apiCount].sync = 1;
     }
 
+<api_entry2>"handcodeApi" {
+    apis[apiCount].handcodeApi = 1;  /* rsg_generator then emits the API function as a forward to rsHCAPI_<name> */
+    }
+
+<api_entry2>"handcodePlay" {
+    apis[apiCount].handcodePlay = 1;  /* rsg_generator then emits rsp_<name> as a call to rsHCPLAY_<name>(con, vp) */
+    }
+
 <api_entry2>"ret" {
     currType = &apis[apiCount].ret;
     typeNextState = api_entry2;