diff --git a/Problem Sets/Problem Set 1/student_func.cu b/Problem Sets/Problem Set 1/student_func.cu
index 452b379f..651add14 100755
--- a/Problem Sets/Problem Set 1/student_func.cu
+++ b/Problem Sets/Problem Set 1/student_func.cu
@@ -2,11 +2,11 @@
 // Color to Greyscale Conversion
 //A common way to represent color images is known as RGBA - the color
-//is specified by how much Red, Grean and Blue is in it.
-//The 'A' stands for Alpha and is used for transparency, it will be
+//is specified by how much Red, Green, and Blue is in it.
+//The 'A' stands for Alpha and is used for transparency; it will be
 //ignored in this homework.
-//Each channel Red, Blue, Green and Alpha is represented by one byte.
+//Each channel Red, Blue, Green, and Alpha is represented by one byte.
 //Since we are using one byte for each color there are 256 different
 //possible values for each color. This means we use 4 bytes per pixel.
@@ -32,6 +32,7 @@
 //so that the entire image is processed.

 #include "utils.h"
+#include

 __global__
 void rgba_to_greyscale(const uchar4* const rgbaImage,
@@ -48,8 +49,16 @@ void rgba_to_greyscale(const uchar4* const rgbaImage,
   //Note: We will be ignoring the alpha channel for this conversion

   //First create a mapping from the 2D block and grid locations
   //to an absolute 2D location in the image, then use that to
   //calculate a 1D offset
+  int y = threadIdx.y + blockIdx.y * blockDim.y;
+  int x = threadIdx.x + blockIdx.x * blockDim.x;
+  if (y < numCols && x < numRows) {
+    int index = numRows*y + x;
+    uchar4 color = rgbaImage[index];
+    unsigned char grey = (unsigned char)(0.299f*color.x + 0.587f*color.y + 0.114f*color.z);
+    greyImage[index] = grey;
+  }
 }

 void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
@@ -57,10 +66,15 @@ void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_r
 {
   //You must fill in the correct sizes for the blockSize and gridSize
   //currently only one block with one thread is being launched
-  const dim3 blockSize(1, 1, 1);  //TODO
-  const dim3 gridSize( 1, 1, 1);  //TODO
+
+  int blockWidth = 32;
+
+  const dim3 blockSize(blockWidth, blockWidth, 1);
+  int blocksX = numRows/blockWidth+1;
+  int blocksY = numCols/blockWidth+1; //TODO
+  const dim3 gridSize( blocksX, blocksY, 1);  //TODO
   rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);

   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
-
 }
+
diff --git a/Problem Sets/Problem Set 2/student_func.cu b/Problem Sets/Problem Set 2/student_func.cu
index 825e412b..0a71e58c 100755
--- a/Problem Sets/Problem Set 2/student_func.cu
+++ b/Problem Sets/Problem Set 2/student_func.cu
@@ -1,3 +1,4 @@
+#include
 // Homework 2
 // Image Blurring
 //
@@ -102,6 +103,7 @@

 #include "utils.h"
+#include

 __global__
 void gaussian_blur(const unsigned char* const inputChannel,
                    unsigned char* const outputChannel,
@@ -117,11 +119,33 @@ void gaussian_blur(const unsigned char* const inputChannel,
   // the image.
  // You'll want code that performs the following check before accessing
  // GPU memory:
  //
-  // if ( absolute_image_position_x >= numCols ||
-  //      absolute_image_position_y >= numRows )
-  // {
-  //     return;
-  // }
+
+
+  const int2 p = make_int2( blockIdx.x * blockDim.x + threadIdx.x,
+                            blockIdx.y * blockDim.y + threadIdx.y);
+  const int m = p.y * numCols + p.x;
+
+  if(p.x >= numCols || p.y >= numRows)
+    return;
+
+  float color = 0.0f;
+
+  for(int f_y = 0; f_y < filterWidth; f_y++) {
+    for(int f_x = 0; f_x < filterWidth; f_x++) {
+
+      int c_x = p.x + f_x - filterWidth/2;
+      int c_y = p.y + f_y - filterWidth/2;
+      c_x = min(max(c_x, 0), numCols - 1);
+      c_y = min(max(c_y, 0), numRows - 1);
+      float filter_value = filter[f_y*filterWidth + f_x];
+      color += filter_value*static_cast<float>(inputChannel[c_y*numCols + c_x]);
+
+    }
+  }
+
+  outputChannel[m] = color;
+
+
  // NOTE: If a thread's absolute position 2D position is within the image, but some of
  // its neighbors are outside the image, then you will need to be extra careful. Instead
@@ -147,11 +171,16 @@ void separateChannels(const uchar4* const inputImageRGBA,
  // the image. You'll want code that performs the following check before accessing
  // GPU memory:
  //
-  // if ( absolute_image_position_x >= numCols ||
-  //      absolute_image_position_y >= numRows )
-  // {
-  //     return;
-  // }
+
+  const int2 p = make_int2( blockIdx.x * blockDim.x + threadIdx.x,
+                            blockIdx.y * blockDim.y + threadIdx.y);
+  const int m = p.y * numCols + p.x;
+
+  if(p.x >= numCols || p.y >= numRows)
+    return;
+  redChannel[m] = inputImageRGBA[m].x;
+  greenChannel[m] = inputImageRGBA[m].y;
+  blueChannel[m] = inputImageRGBA[m].z;
 }

 //This kernel takes in three color channels and recombines them
@@ -205,11 +234,12 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI
  //be sure to use checkCudaErrors like the above examples to
  //be able to tell if anything goes wrong
  //IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc
-
+  checkCudaErrors(cudaMalloc(&d_filter, sizeof( float) * filterWidth * filterWidth));
  //TODO:
  //Copy the filter on the host (h_filter) to the memory you just allocated
  //on the GPU.  cudaMemcpy(dst, src, numBytes, cudaMemcpyHostToDevice);
  //Remember to use checkCudaErrors!
+  checkCudaErrors(cudaMemcpy(d_filter, h_filter, sizeof(float) * filterWidth * filterWidth, cudaMemcpyHostToDevice));

 }

@@ -221,21 +251,50 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
                        const int filterWidth)
 {
  //TODO: Set reasonable block size (i.e., number of threads per block)
-  const dim3 blockSize;
+  const dim3 blockSize(32, 32);
  //TODO:
  //Compute correct grid size (i.e., number of blocks per kernel launch)
  //from the image size and and block size.
-  const dim3 gridSize;
+  const dim3 gridSize(numCols/blockSize.x + 1, numRows/blockSize.y + 1);
+
  //TODO: Launch a kernel for separating the RGBA image into different color channels
-
-  // Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
-  // launching your kernel to make sure that you didn't make any mistakes.
+  separateChannels<<<gridSize, blockSize>>>(d_inputImageRGBA,
+                                            numRows,
+                                            numCols,
+                                            d_red,
+                                            d_green,
+                                            d_blue);
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
  //TODO: Call your convolution kernel here 3 times, once for each color channel.
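+  // The three gaussian_blur launches below reuse the same gridSize/blockSize configuration;
+  // each color channel is blurred independently with the same filter.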
-
+  gaussian_blur<<<gridSize, blockSize>>>(
+      d_red,
+      d_redBlurred,
+      numRows,
+      numCols,
+      d_filter,
+      filterWidth);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  gaussian_blur<<<gridSize, blockSize>>>(
+      d_blue,
+      d_blueBlurred,
+      numRows,
+      numCols,
+      d_filter,
+      filterWidth);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  gaussian_blur<<<gridSize, blockSize>>>(
+      d_green,
+      d_greenBlurred,
+      numRows,
+      numCols,
+      d_filter,
+      filterWidth);
  // Again, call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
  // launching your kernel to make sure that you didn't make any mistakes.
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
@@ -251,7 +310,6 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
                                             numRows, numCols);
  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
-
 }

@@ -262,3 +320,4 @@ void cleanup() {
  checkCudaErrors(cudaFree(d_green));
  checkCudaErrors(cudaFree(d_blue));
 }
+
diff --git a/Problem Sets/Problem Set 3/student_func.cu b/Problem Sets/Problem Set 3/student_func.cu
index 26f00a74..dd76ff71 100755
--- a/Problem Sets/Problem Set 3/student_func.cu
+++ b/Problem Sets/Problem Set 3/student_func.cu
@@ -79,8 +79,144 @@
 */

+#include <cstdio>
+#include <cfloat>
+#include <cmath>
+
 #include "utils.h"

+__global__
+void histogram_kernel(unsigned int* d_bins, const float* d_in, const int bin_count, const float lum_min, const float lum_max, const int size) {
+  int mid = threadIdx.x + blockDim.x * blockIdx.x;
+  if(mid >= size)
+    return;
+  float lum_range = lum_max - lum_min;
+  int bin = ((d_in[mid]-lum_min) / lum_range) * bin_count;
+
+  atomicAdd(&d_bins[bin], 1);
+}
+
+__global__
+void scan_kernel(unsigned int* d_bins, int size) {
+  int mid = threadIdx.x + blockDim.x * blockIdx.x;
+  if(mid >= size)
+    return;
+
+  for(int s = 1; s <= size; s *= 2) {
+    int spot = mid - s;
+
+    unsigned int val = 0;
+    if(spot >= 0)
+      val = d_bins[spot];
+    __syncthreads();
+    if(spot >= 0)
+      d_bins[mid] += val;
+    __syncthreads();
+
+  }
+}
+// reduce each block to its max or min (depending on the minmax flag) and write the block's result to d_out
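+// the host-side reduce_minmax() below launches this kernel repeatedly, shrinking the data each pass
+// until the final value can be read from the first element of the last output buffer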
+__global__
+void reduce_minmax_kernel(const float* const d_in, float* d_out, const size_t size, int minmax) {
+  extern __shared__ float shared[];
+
+  int mid = threadIdx.x + blockDim.x * blockIdx.x;
+  int tid = threadIdx.x;
+
+  // each thread copies one element of its block's slice into shared memory;
+  // threads past the end of the input store the identity value instead
+  if(mid < size) {
+    shared[tid] = d_in[mid];
+  } else {
+    if(minmax == 0)
+      shared[tid] = FLT_MAX;
+    else
+      shared[tid] = -FLT_MAX;
+  }
+
+  // wait for all threads to copy the memory
+  __syncthreads();
+
+  // threads past the end of the input exit here (kept after the barrier so every
+  // thread reaches __syncthreads); if the whole block is out of range, thread 0
+  // writes the identity value for this block
+  if(mid >= size) {
+    if(tid == 0) {
+      if(minmax == 0)
+        d_out[blockIdx.x] = FLT_MAX;
+      else
+        d_out[blockIdx.x] = -FLT_MAX;
+    }
+    return;
+  }
+
+  for(unsigned int s = blockDim.x/2; s > 0; s /= 2) {
+    if(tid < s) {
+      if(minmax == 0) {
+        shared[tid] = min(shared[tid], shared[tid+s]);
+      } else {
+        shared[tid] = max(shared[tid], shared[tid+s]);
+      }
+    }
+
+    __syncthreads();
+  }
+
+  if(tid == 0) {
+    d_out[blockIdx.x] = shared[0];
+  }
+}
+
+int get_max_size(int n, int d) {
+  return (int)ceil( (float)n/(float)d ) + 1;
+}
+
+float reduce_minmax(const float* const d_in, const size_t size, int minmax) {
+  int BLOCK_SIZE = 32;
+  // keep reducing until the remaining data fits within a single block
+  size_t curr_size = size;
+  float* d_curr_in;
+
+  checkCudaErrors(cudaMalloc(&d_curr_in, sizeof(float) * size));
+  checkCudaErrors(cudaMemcpy(d_curr_in, d_in, sizeof(float) * size, cudaMemcpyDeviceToDevice));
+
+  float* d_curr_out;
+
+  dim3 thread_dim(BLOCK_SIZE);
+  const int shared_mem_size = sizeof(float)*BLOCK_SIZE;
+
+  while(1) {
+    checkCudaErrors(cudaMalloc(&d_curr_out, sizeof(float) * get_max_size(curr_size, BLOCK_SIZE)));
+
+    dim3 block_dim(get_max_size(size, BLOCK_SIZE));
+    reduce_minmax_kernel<<<block_dim, thread_dim, shared_mem_size>>>(
+        d_curr_in,
+        d_curr_out,
+        curr_size,
+        minmax
+    );
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+    // free the old input and use this pass's output as the next pass's input
+    checkCudaErrors(cudaFree(d_curr_in));
+    d_curr_in = d_curr_out;
+
+    if(curr_size < BLOCK_SIZE)
+      break;
+
+    curr_size = get_max_size(curr_size, BLOCK_SIZE);
+  }
+
+  // the first element of the final output buffer now holds the fully reduced value
+  float h_out;
+  cudaMemcpy(&h_out, d_curr_out, sizeof(float), cudaMemcpyDeviceToHost);
+  cudaFree(d_curr_out);
+  return h_out;
+}
+
 void your_histogram_and_prefixsum(const float* const d_logLuminance,
                                   unsigned int* const d_cdf,
                                   float &min_logLum,
@@ -89,6 +225,45 @@ void your_histogram_and_prefixsum(const float* const d_logLuminance,
                                   const size_t numCols,
                                   const size_t numBins)
 {
+  const size_t size = numRows*numCols;
+  min_logLum = reduce_minmax(d_logLuminance, size, 0);
+  max_logLum = reduce_minmax(d_logLuminance, size, 1);
+
+  printf("got min of %f\n", min_logLum);
+  printf("got max of %f\n", max_logLum);
+  printf("numBins %zu\n", numBins);
+
+  unsigned int* d_bins;
+  size_t histo_size = sizeof(unsigned int)*numBins;
+
+  checkCudaErrors(cudaMalloc(&d_bins, histo_size));
+  checkCudaErrors(cudaMemset(d_bins, 0, histo_size));
+  dim3 thread_dim(1024);
+  dim3 hist_block_dim(get_max_size(size, thread_dim.x));
+  histogram_kernel<<<hist_block_dim, thread_dim>>>(d_bins, d_logLuminance, numBins, min_logLum, max_logLum, size);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  unsigned int h_out[100];
+  cudaMemcpy(&h_out, d_bins, sizeof(unsigned int)*100, cudaMemcpyDeviceToHost);
+  for(int i = 0; i < 100; i++)
+    printf("hist
out %d\n", h_out[i]); + + dim3 scan_block_dim(get_max_size(numBins, thread_dim.x)); + + scan_kernel<<>>(d_bins, numBins); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + cudaMemcpy(&h_out, d_bins, sizeof(unsigned int)*100, cudaMemcpyDeviceToHost); + for(int i = 0; i < 100; i++) + printf("cdf out %d\n", h_out[i]); + + + cudaMemcpy(d_cdf, d_bins, histo_size, cudaMemcpyDeviceToDevice); + + + checkCudaErrors(cudaFree(d_bins)); + + //TODO /*Here are the steps you need to implement 1) find the minimum and maximum value in the input logLuminance channel @@ -99,6 +274,4 @@ void your_histogram_and_prefixsum(const float* const d_logLuminance, 4) Perform an exclusive scan (prefix sum) on the histogram to get the cumulative distribution of luminance values (this should go in the incoming d_cdf pointer which already has been allocated for you) */ - - } diff --git a/Problem Sets/Problem Set 4/Makefile b/Problem Sets/Problem Set 4/Makefile index 01a3efc0..c4d094e7 100755 --- a/Problem Sets/Problem Set 4/Makefile +++ b/Problem Sets/Problem Set 4/Makefile @@ -1,5 +1,4 @@ -NVCC=/usr/local/cuda-5.0/bin/nvcc -#NVCC=nvcc +NVCC=nvcc ################################### # These are the default install # diff --git a/Problem Sets/Problem Set 4/student_func.cu b/Problem Sets/Problem Set 4/student_func.cu index 347d7b6e..f58d314f 100755 --- a/Problem Sets/Problem Set 4/student_func.cu +++ b/Problem Sets/Problem Set 4/student_func.cu @@ -1,8 +1,12 @@ //Udacity HW 4 //Radix Sorting +#include +#include +#include + #include "utils.h" -#include + /* Red Eye Removal =============== @@ -42,13 +46,303 @@ */ +__global__ +void histogram_kernel(unsigned int pass, + unsigned int * d_bins, + unsigned int* const d_input, + const int size) { + int mid = threadIdx.x + blockDim.x * blockIdx.x; + if(mid >= size) + return; + unsigned int one = 1; + int bin = ((d_input[mid] & (one<= size) + return; + unsigned int val = 0; + if(mid > 0) + val = ((d_inputVals[mid-1] & (one<= 0 && spot >= threadSize*base) + val = d_output[spot]; + __syncthreads(); + if(spot >= 0 && spot >= threadSize*base) + d_output[mid] += val; + __syncthreads(); + } + if(base > 0) + d_output[mid] += d_output[base*threadSize - 1]; + +} + +__global__ +void test_kernel(unsigned int pass, + unsigned int * d_output, + size_t numElems) +{ + int mid = threadIdx.x + blockDim.x * blockIdx.x; + unsigned int one=1; + unsigned int val = (unsigned int)mid; + if(mid < numElems) { + d_output[mid] = (val & (one<>>(pass, d_out, numtest); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + checkCudaErrors(cudaMemcpy(&h_out, d_out, numtest*sizeof(unsigned int), cudaMemcpyDeviceToHost)); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + + for(int i = 0; i< numtest; i++) { + printf("%d: %d, ", i, h_out[i]); + } + printf("\n"); + + checkCudaErrors(cudaFree(d_out)); +} + +__global__ +void move_kernel( + unsigned int pass, + unsigned int* const d_inputVals, + unsigned int* const d_inputPos, + unsigned int* d_outputVals, + unsigned int* d_outputPos, + unsigned int* d_outputMove, + unsigned int* const d_scanned, + unsigned int one_pos, + const size_t numElems) { + + int mid = threadIdx.x + blockDim.x * blockIdx.x; + if(mid >= numElems) + return; + + unsigned int scan=0; + unsigned int base=0; + unsigned int one= 1; + if( ( d_inputVals[mid] & (one< max) + max = h_arr2[i]; + } + printf("max %d min %d\n", max, min); +} + +void verify_scan(unsigned int * d_arr, unsigned int * d_scanned, int numElems, int pass) { + unsigned int h_arr[3000]; + 
unsigned int one =1; + unsigned int h_scanned[3000]; + checkCudaErrors(cudaMemcpy(&h_arr, d_arr, 3000*sizeof(unsigned int), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(&h_scanned, d_scanned, 3000*sizeof(unsigned int), cudaMemcpyDeviceToHost)); + unsigned int acc = 0; + for(int i = 0; i < 3000; i++) { + if(acc != h_scanned[i]) { + printf("wrong at %d %d != %d\n", i, acc, h_scanned[i]); + } + acc+= ((h_arr[i] & (one<>>(pass, d_bins, d_inputVals, numElems); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + // copy the histogram data to host + checkCudaErrors(cudaMemcpy(&h_bins, d_bins, histo_size, cudaMemcpyDeviceToHost)); + + printf("hey guys %d %d %d %d %d \n", h_bins[0], h_bins[1], h_bins[0]+h_bins[1], numElems, (one<>>( + pass, + d_inputVals, + d_scanned, + numElems, + i, + thread_dim.x + ); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + } + //printf("made it past scanned\n"); + + //debug_device_array("input", 100, d_inputVals, numElems); + //debug_device_array("scanned", 100, d_scanned, numElems); + //verify_scan(d_inputVals, d_scanned, numElems, pass); + + // calculate the move positions + move_kernel<<>>( + pass, + d_inputVals, + d_inputPos, + d_outputVals, + d_outputPos, + d_moved, + d_scanned, + h_bins[0], + numElems + ); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + //debug_device_array("move", 100, d_moved, numElems); + //debug_device_array("output vals ", 100, d_outputVals, numElems); + //debug_device_array("output pos ", 100, d_outputPos, numElems); + + + // printf("made it past move calculation \n"); + + //finall + // copy the histogram data to input + checkCudaErrors(cudaMemcpy(d_inputVals, d_outputVals, arr_size, cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMemcpy(d_inputPos, d_outputPos, arr_size, cudaMemcpyDeviceToDevice)); + + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + + + } + //printf("Made to end\n"); + // debug_device_array("output vals ", 100000, d_outputVals, numElems); + // debug_device_array("output pos ", 100, d_outputPos, numElems); + + + checkCudaErrors(cudaFree(d_moved)); + checkCudaErrors(cudaFree(d_scanned)); + checkCudaErrors(cudaFree(d_bins)); } + diff --git a/Problem Sets/Problem Set 6/student_func.cu b/Problem Sets/Problem Set 6/student_func.cu index 1f42ce05..5b56cef4 100755 --- a/Problem Sets/Problem Set 6/student_func.cu +++ b/Problem Sets/Problem Set 6/student_func.cu @@ -61,53 +61,402 @@ In this assignment we will do 800 iterations. 
*/ + +#include "utils.h" +#include + +// get 2d position from block +__device__ +int2 get2dPos() { + return make_int2( + blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y + ); +} +// check whether a a value is within the image bounds +__device__ +bool withinBounds(const int x, const int y, const size_t numRowsSource, const size_t numColsSource) { + return ((x < numColsSource) && (y < numRowsSource)); +} +__device__ +bool masked(uchar4 val) { + return (val.x != 255 || val.y != 255 || val.z != 255); +} -#include "utils.h" -#include +__device__ +int getm(int x, int y, size_t numColsSource) { + return y*numColsSource + x; +} + +__global__ +void maskPredicateKernel( + const uchar4* const d_sourceImg, + int* d_borderPredicate, + int* d_interiorPredicate, + const size_t numRowsSource, + const size_t numColsSource) { + + const int2 p = get2dPos(); + const int m = getm(p.x, p.y, numColsSource); + + if(!withinBounds(p.x, p.y, numRowsSource, numColsSource)) + return; + + // run through each pixel and determine if its + // on the border, or if its on the interior border + + if(masked(d_sourceImg[m])) { + int inbounds = 0; + int interior = 0; + + // count how many of our neighbors are masked, + // and how many neighbors we have + if (withinBounds(p.x, p.y+1, numRowsSource, numColsSource)) { + inbounds++; + if(masked(d_sourceImg[getm(p.x, p.y+1, numColsSource)])) + interior++; + + } + if (withinBounds(p.x, p.y-1, numRowsSource, numColsSource)) { + inbounds++; + if(masked(d_sourceImg[getm(p.x, p.y-1, numColsSource)])) + interior++; + + } + if (withinBounds(p.x+1, p.y, numRowsSource, numColsSource)) { + inbounds++; + if(masked(d_sourceImg[getm(p.x+1, p.y, numColsSource)])) + interior++; + } + if (withinBounds(p.x-1, p.y, numRowsSource, numColsSource)) { + inbounds++; + if(masked(d_sourceImg[getm(p.x-1, p.y, numColsSource)])) + interior++; + } + + // clear out the values so we don't + // have to memset this destination stuff + d_interiorPredicate[m] = 0; + d_borderPredicate[m] = 0; + + // if all our neighbors are masked, then its interior + if(inbounds == interior) { + d_interiorPredicate[m] = 1; + } else if (interior > 0) { + d_borderPredicate[m] = 1; + } + } +} + +__global__ +void separateChannelsKernel( + const uchar4* const inputImageRGBA, + float* const redChannel, + float* const greenChannel, + float* const blueChannel, + size_t numRows, + size_t numCols) +{ + const int2 p = get2dPos(); + const int m = getm(p.x, p.y, numCols); + + if(!withinBounds(p.x, p.y, numRows, numCols)) + return; + + redChannel[m] = (float)inputImageRGBA[m].x; + greenChannel[m] = (float)inputImageRGBA[m].y; + blueChannel[m] = (float)inputImageRGBA[m].z; +} + +__global__ +void recombineChannelsKernel( + uchar4* outputImageRGBA, + float* const redChannel, + float* const greenChannel, + float* const blueChannel, + size_t numRows, + size_t numCols) +{ + const int2 p = get2dPos(); + const int m = getm(p.x, p.y, numCols); + + if(!withinBounds(p.x, p.y, numRows, numCols)) + return; + + outputImageRGBA[m].x = (char)redChannel[m]; + outputImageRGBA[m].y = (char)greenChannel[m]; + outputImageRGBA[m].z = (char)blueChannel[m]; +} + +__global__ +void jacobiKernel( + float* d_in, + float* d_out, + const int* d_borderPredicate, + const int* d_interiorPredicate, + float* d_source, + float* d_dest, + size_t numRows, + size_t numCols) +{ + const int2 p = get2dPos(); + const int m = getm(p.x, p.y, numCols); + + if(!withinBounds(p.x, p.y, numRows, numCols)) + return; + + // calculate these values as indicated in the 
videos + + int lm; + if(d_interiorPredicate[m]==1) { + float a = 0.f, b=0.f, c=0.0f, d=0.f; + float sourceVal = d_source[m]; + + if(withinBounds(p.x, p.y+1, numRows, numCols)) { + d++; + lm = getm(p.x, p.y+1, numCols); + if(d_interiorPredicate[lm]==1) { + a += d_in[lm]; + } else if(d_borderPredicate[lm]==1) { + b += d_dest[lm]; + } + c += (sourceVal-d_source[lm]); + } + + if(withinBounds(p.x, p.y-1, numRows, numCols)) { + d++; + lm = getm(p.x, p.y-1, numCols); + if(d_interiorPredicate[lm]==1) { + a += d_in[lm]; + } else if(d_borderPredicate[lm]==1) { + b += d_dest[lm]; + } + c += (sourceVal-d_source[lm]); + } + + if(withinBounds(p.x+1, p.y, numRows, numCols)) { + d++; + lm = getm(p.x+1, p.y, numCols); + if(d_interiorPredicate[lm]==1) { + a += d_in[lm]; + } else if(d_borderPredicate[lm]==1) { + b += d_dest[lm]; + } + c += (sourceVal-d_source[lm]); + } + + if(withinBounds(p.x-1, p.y, numRows, numCols)) { + d++; + lm = getm(p.x-1, p.y, numCols); + if(d_interiorPredicate[lm]==1) { + a += d_in[lm]; + } else if(d_borderPredicate[lm]==1) { + b += d_dest[lm]; + } + c += (sourceVal-d_source[lm]); + } + + d_out[m] = min(255.f, max(0.0, (a + b + c)/d)); + } else { + d_out[m] = d_dest[m]; + } + +} void your_blend(const uchar4* const h_sourceImg, //IN const size_t numRowsSource, const size_t numColsSource, const uchar4* const h_destImg, //IN uchar4* const h_blendedImg) //OUT { + // first push the dest and source onto the gpu + size_t imageSize = numRowsSource*numColsSource*sizeof(uchar4); + + uchar4* d_sourceImg; + uchar4* d_destImg; + uchar4* d_finalImg; - /* To Recap here are the steps you need to implement - + checkCudaErrors(cudaMalloc(&d_sourceImg, imageSize)); + checkCudaErrors(cudaMalloc(&d_destImg, imageSize)); + checkCudaErrors(cudaMalloc(&d_finalImg, imageSize)); + + checkCudaErrors(cudaMemcpy(d_sourceImg, h_sourceImg, imageSize, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_destImg, h_destImg, imageSize, cudaMemcpyHostToDevice)); + + // allocate predicate stuff + size_t predicateSize = numRowsSource*numColsSource*sizeof(int); + int* d_borderPredicate; + int* d_interiorPredicate; + + checkCudaErrors(cudaMalloc(&d_borderPredicate, predicateSize)); + checkCudaErrors(cudaMalloc(&d_interiorPredicate, predicateSize)); + + // make reusable dims + const dim3 blockSize(32, 32); + const dim3 gridSize(numColsSource/blockSize.x + 1, numRowsSource/blockSize.y + 1); + + + /** 1) Compute a mask of the pixels from the source image to be copied The pixels that shouldn't be copied are completely white, they have R=255, G=255, B=255. Any other pixels SHOULD be copied. + **/ + /** 2) Compute the interior and border regions of the mask. An interior pixel has all 4 neighbors also inside the mask. A border pixel is in the mask itself, but has at least one neighbor that isn't. 
+  **/

-     3) Separate out the incoming image into three separate channels
+  // generate the predicates
+  maskPredicateKernel<<<gridSize, blockSize>>>(
+      d_sourceImg,
+      d_borderPredicate,
+      d_interiorPredicate,
+      numRowsSource,
+      numColsSource
+  );
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  /**
+     3) Separate out the incoming image into three separate channels
+  **/
+  size_t floatSize = numRowsSource*numColsSource*sizeof(float);
+  float *d_sourceImgR, *d_sourceImgG, *d_sourceImgB;
+  float *d_destImgR, *d_destImgG, *d_destImgB;
+
+  checkCudaErrors(cudaMalloc(&d_sourceImgR, floatSize));
+  checkCudaErrors(cudaMalloc(&d_sourceImgG, floatSize));
+  checkCudaErrors(cudaMalloc(&d_sourceImgB, floatSize));
+
+  checkCudaErrors(cudaMalloc(&d_destImgR, floatSize));
+  checkCudaErrors(cudaMalloc(&d_destImgG, floatSize));
+  checkCudaErrors(cudaMalloc(&d_destImgB, floatSize));
+
+  separateChannelsKernel<<<gridSize, blockSize>>>(
+      d_sourceImg,
+      d_sourceImgR,
+      d_sourceImgG,
+      d_sourceImgB,
+      numRowsSource,
+      numColsSource);
+
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  separateChannelsKernel<<<gridSize, blockSize>>>(
+      d_destImg,
+      d_destImgR,
+      d_destImgG,
+      d_destImgB,
+      numRowsSource,
+      numColsSource);
+
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  /**
     4) Create two float(!) buffers for each color channel that will act as our
        guesses.  Initialize them to the respective color channel of the source
        image since that will act as our intial guess.
+  **/
+
+  // allocate floats
+  float *d_r0, *d_r1, *d_g0, *d_g1, *d_b0, *d_b1;
+  checkCudaErrors(cudaMalloc(&d_r0, floatSize));
+  checkCudaErrors(cudaMalloc(&d_r1, floatSize));
+  checkCudaErrors(cudaMalloc(&d_b0, floatSize));
+  checkCudaErrors(cudaMalloc(&d_b1, floatSize));
+  checkCudaErrors(cudaMalloc(&d_g0, floatSize));
+  checkCudaErrors(cudaMalloc(&d_g1, floatSize));
+
+  checkCudaErrors(cudaMemcpy(d_r0, d_sourceImgR, floatSize, cudaMemcpyDeviceToDevice));
+  checkCudaErrors(cudaMemcpy(d_g0, d_sourceImgG, floatSize, cudaMemcpyDeviceToDevice));
+  checkCudaErrors(cudaMemcpy(d_b0, d_sourceImgB, floatSize, cudaMemcpyDeviceToDevice));
+
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  /**
     5) For each color channel perform the Jacobi iteration described
        above 800 times.
-
+  **/
+  for(int i = 0; i < 800; i++) {
+    jacobiKernel<<<gridSize, blockSize>>>(
+        d_r0,
+        d_r1,
+        d_borderPredicate,
+        d_interiorPredicate,
+        d_sourceImgR,
+        d_destImgR,
+        numRowsSource,
+        numColsSource
+    );
+    std::swap(d_r0, d_r1);
+
+    jacobiKernel<<<gridSize, blockSize>>>(
+        d_g0,
+        d_g1,
+        d_borderPredicate,
+        d_interiorPredicate,
+        d_sourceImgG,
+        d_destImgG,
+        numRowsSource,
+        numColsSource
+    );
+    std::swap(d_g0, d_g1);
+
+    jacobiKernel<<<gridSize, blockSize>>>(
+        d_b0,
+        d_b1,
+        d_borderPredicate,
+        d_interiorPredicate,
+        d_sourceImgB,
+        d_destImgB,
+        numRowsSource,
+        numColsSource
+    );
+    std::swap(d_b0, d_b1);
+  }
+
+  /**
     6) Create the output image by replacing all the interior pixels
        in the destination image with the result of the Jacobi iterations.
        Just cast the floating point values to unsigned chars since we have
        already made sure to clamp them to the correct range.
-
-     Since this is final assignment we provide little boilerplate code to
-     help you.  Notice that all the input/output pointers are HOST pointers.
-
-     You will have to allocate all of your own GPU memory and perform your own
-     memcopies to get data in and out of the GPU memory.
-
-     Remember to wrap all of your calls with checkCudaErrors() to catch any
-     thing that might go wrong.  After each kernel call do:
-
-     cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
-
-     to catch any errors that happened while executing the kernel.
-  */
+  **/
+
+  // after the final swap, d_r0, d_g0, and d_b0 hold the most recent Jacobi results
+  recombineChannelsKernel<<<gridSize, blockSize>>>(
+      d_finalImg,
+      d_r0,
+      d_g0,
+      d_b0,
+      numRowsSource,
+      numColsSource);
+
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  // copy device final image to host
+  checkCudaErrors(cudaMemcpy(h_blendedImg, d_finalImg, imageSize, cudaMemcpyDeviceToHost));
+
+  // cleanup
+  checkCudaErrors(cudaFree(d_sourceImg));
+  checkCudaErrors(cudaFree(d_destImg));
+  checkCudaErrors(cudaFree(d_finalImg));
+
+  checkCudaErrors(cudaFree(d_sourceImgR));
+  checkCudaErrors(cudaFree(d_sourceImgG));
+  checkCudaErrors(cudaFree(d_sourceImgB));
+
+  checkCudaErrors(cudaFree(d_destImgR));
+  checkCudaErrors(cudaFree(d_destImgG));
+  checkCudaErrors(cudaFree(d_destImgB));
+
+  checkCudaErrors(cudaFree(d_r0));
+  checkCudaErrors(cudaFree(d_r1));
+  checkCudaErrors(cudaFree(d_g0));
+  checkCudaErrors(cudaFree(d_g1));
+  checkCudaErrors(cudaFree(d_b0));
+  checkCudaErrors(cudaFree(d_b1));
 }
diff --git a/README.md b/README.md
index a5e3c9bf..9d68d23a 100644
--- a/README.md
+++ b/README.md
@@ -1,24 +1 @@
-cs344
-=====
-
-Introduction to Parallel Programming class code
-
-# Building on OS X
-
-These instructions are for OS X 10.9 "Mavericks".
-
-* Step 1. Build and install OpenCV. The best way to do this is with
-Homebrew. However, you must slightly alter the Homebrew OpenCV
-installation; you must build it with libstdc++ (instead of the default
-libc++) so that it will properly link against the nVidia CUDA dev kit.
-[This entry in the Udacity discussion forums](http://forums.udacity.com/questions/100132476/cuda-55-opencv-247-os-x-maverick-it-doesnt-work) describes exactly how to build a compatible OpenCV.
-
-* Step 2. You can now create 10.9-compatible makefiles, which will allow you to
-build and run your homework on your own machine:
-```
-mkdir build
-cd build
-cmake ..
-make
-```
-
+My solutions to the Udacity cs344 Intro to Parallel Programming course. The course is taught using NVIDIA CUDA. Aside from the final project, my work is in each problem set folder under student_func.cu.