diff --git a/CMakeLists.txt b/CMakeLists.txt
index 18ec436af..c4404545d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -114,8 +114,11 @@ target_link_libraries(${CMAKE_PROJECT_NAME}
 set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES native)
-target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>")
-target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$<AND:$<CONFIG:RelWithDebInfo>,$<COMPILE_LANGUAGE:CUDA>>:-lineinfo;-src-in-ptx>")
+#target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>")
+#target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$<AND:$<CONFIG:RelWithDebInfo>,$<COMPILE_LANGUAGE:CUDA>>:-lineinfo;-src-in-ptx>")
+
+target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CUDA>>:-G>")
+target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$<AND:$<CONFIG:RelWithDebInfo>,$<COMPILE_LANGUAGE:CUDA>>:-lineinfo>")
 set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME})
diff --git a/README.md b/README.md
index 110697ce7..f9c0b319e 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,323 @@ CUDA Path Tracer

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**

-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Jefferson Koumba Moussadji Lu
+  * [LinkedIn](https://www.linkedin.com/in/-jeff-koumba-0b356721b/)
+* Tested on: Personal Laptop, Windows 11 Home, Intel(R) Core(TM) i9-14900HX @ 2.22GHz @ 24 Cores @ 32GB RAM, Nvidia GeForce RTX 4090 @ 16 GB @ SM 8.9

-### (TODO: Your README)

+ +

Where geometry meets gravity - a floating galaxy of light and glass

+

-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.

## Overview

This project implements a fully featured Monte‑Carlo path tracer that runs entirely on the GPU using CUDA. A path tracer shoots many randomly sampled rays through each pixel and simulates the way light bounces around a scene to produce images with soft shadows, reflections and refractions.

Leveraging CUDA for high throughput and OpenGL for real‑time display, the renderer simulates light transport through three‑dimensional scenes with diffuse, reflective, transmissive and emissive materials. Each pixel is computed by following many random light paths, accumulating contributions from direct and indirect illumination. The scene description is defined via a JSON format that lists materials, camera parameters and objects with transforms, which makes it straightforward to author new scenes or extend the format. Throughout development I captured renders at varying stages and settings; these images are woven throughout this README to tell the story of the path tracer’s evolution.

+ +

Fallen Minecraft Castle

### Building and Running

The codebase is organized as a CMake project. To build it on a machine with CUDA and OpenGL installed, run the following commands from the project root:

1) Configure the project:
```
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
```

2) Compile all sources:
```
cmake --build build -j
```

3) Run the renderer with a JSON scene file:
```
./build/bin/cis565_path_tracer scenes/cornell.json
```

During execution the program displays a live preview in an OpenGL window. Use Esc to save the current frame and exit, S to save without quitting, Space to reset the camera, Left mouse to orbit, Right mouse to zoom and Middle mouse to pan in the X-Z plane. Saved images include the iteration count and timestamp in the filename.

### Scene Format

The renderer reads scenes from JSON files. Each file defines sections for Materials, Camera and Objects. Materials specify a type (Diffuse, Specular, Refractive or Emitting) and parameters such as RGB color, emittance and index of refraction. The camera block sets the resolution, field of view, number of iterations, maximum path depth, output filename and basis vectors. Objects are instances of primitives, currently spheres and boxes, with translation, rotation and scale transforms and a material assignment. This clean separation allows complex arrangements like the Cornell boxes and custom scenes shown later.

### Core Features Implemented

* Physically based shading: A CUDA kernel evaluates a bidirectional scattering distribution function (BSDF) for each ray hit. Ideal diffuse surfaces use cosine‑weighted hemisphere sampling, matching the Lambertian BRDF. Specular materials reflect perfectly, while refractive materials refract according to Snell’s law and blend reflection and transmission using Schlick’s Fresnel approximation. Emissive materials add light into the path when hit.

* Path segment sorting: Before shading, rays are sorted by material type so that threads in the same warp follow similar execution paths. This reduces divergence and makes memory accesses more coherent, improving performance on mixed‑material scenes. The sorting pass can be toggled on or off for comparisons.

* Stochastic antialiasing: Primary rays are jittered within each pixel footprint to reduce aliasing. Over many samples the noise averages out, yielding smoother edges than a deterministic sample grid.

* Stream compaction: After each bounce, terminated rays are removed from the buffer. This saves work on subsequent bounces and has the largest impact after a few bounces, when many rays have escaped. In open scenes like the Cornell box the benefit is significant; closed scenes with many bounces see less improvement.

* Random number generation: Each thread constructs a seeded thrust::default_random_engine based on its pixel index, iteration and remaining bounces to ensure uncorrelated samples.

* Depth of field: A thin‑lens model samples primary rays across a circular aperture and focuses them at a specific distance. Varying the aperture radius changes the amount of blur: small apertures produce sharp images, while large apertures create dreamy bokeh.

* Direct lighting: When enabled, the integrator sends an additional ray directly toward a random point on an emissive object. This reduces noise in scenes where the light source subtends a small solid angle (for example, a point or small area light). Images below compare the same scene with and without direct lighting.
+ +* Refractive materials: Glass or water objects are implemented using an index of refraction parameter. Lower IOR values approximate thin bubbles while higher values produce realistic bending of light. + +* Scene and composition diversity: Numerous scenes were authored to test different aspects of the renderer: color checkers, complex assemblies, imported voxel models, multi‑object systems, FOV exploration and composition studies. Each one is described in the Results section with accompanying images. + +### Implementation Highlights + +The renderer is structured as a series of CUDA kernels. ```pathtraceInit``` allocates device memory and copies geometry and material data. Each iteration launches a ray generation kernel that computes initial rays from the camera, an intersection kernel that tests each ray against all geometry, a shade kernel that chooses a new direction based on the hit material and a stream compaction kernel that filters out terminated rays. Sorting by material type happens between intersection and shading. The final color buffer is accumulated in floating point and converted to 8‑bit or HDR when saving. + +I experimented with direct lighting and found that sampling the light source directly adds a high‑energy contribution early in the path, reducing speckle noise in bright areas. For depth of field, a random point on the lens aperture offsets the ray origin and the ray direction is adjusted to pass through the focus plane. The effect is subtle for small apertures and pronounced for large ones. + +## Detailed Implementation and Code Walkthrough + +
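To make this pipeline concrete, here is a simplified host-side sketch of one iteration. The kernel names mirror the stages described above but are illustrative, and the partition predicate is an assumption rather than the project's exact code; the device buffers (```dev_paths```, ```dev_isects```, ```dev_matKeys```, ```dev_geoms```, ```dev_mats```, ```dev_accum```) are the globals allocated by ```pathtraceInit```.

```
#include <thrust/execution_policy.h>
#include <thrust/partition.h>
#include <thrust/sort.h>

// Predicate for stream compaction: a path survives while it has bounces left.
struct IsAlive {
    __host__ __device__ bool operator()(const PathSegment& p) const {
        return p.remainingBounces > 0;
    }
};

void traceIteration(int iter, int traceDepth, int numPixels) {
    const int blockSize = 128;
    generateRayFromCamera<<<(numPixels + blockSize - 1) / blockSize, blockSize>>>(dev_paths, iter);

    int numPaths = numPixels;
    for (int depth = 0; depth < traceDepth && numPaths > 0; ++depth) {
        int blocks = (numPaths + blockSize - 1) / blockSize;
        computeIntersections<<<blocks, blockSize>>>(dev_paths, dev_geoms, dev_isects, numPaths);
        // Optional: sort paths by material ID (written into dev_matKeys during
        // intersection) so threads in a warp run the same BSDF branch; the
        // intersection records must be reordered with the same keys.
        thrust::sort_by_key(thrust::device, dev_matKeys, dev_matKeys + numPaths, dev_paths);
        shadeMaterial<<<blocks, blockSize>>>(iter, depth, dev_paths, dev_isects, dev_mats, numPaths);
        // Stream compaction: move terminated rays past the end of the buffer.
        PathSegment* aliveEnd = thrust::partition(
            thrust::device, dev_paths, dev_paths + numPaths, IsAlive());
        numPaths = int(aliveEnd - dev_paths);
    }
    finalGather<<<(numPixels + blockSize - 1) / blockSize, blockSize>>>(dev_accum, dev_paths, numPixels);
}
```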

+ +

Cornell Box

The heart of the path tracer resides in ```pathtrace.cu``` and ```interactions.cu```. Rays are generated from the camera and stored in an array of ```PathSegment``` structures, each carrying a direction, origin, color throughput and remaining bounces. The ```calculateRandomDirectionInHemisphere``` function returns a cosine‑weighted random direction around a surface normal. It uses polar coordinates and uniform random samples to favour directions near the normal and is essential for energy‑conserving diffuse scattering. In the ```scatterRay``` function, the type of the hit material determines whether the ray is reflected using ```glm::reflect```, refracted using ```glm::refract``` (with proper handling of entering and exiting indices of refraction) or absorbed and replaced with a new random direction. The function also updates the color throughput by multiplying by the material’s albedo or emissive color.

The device structures defined in ```sceneStructs.h```, such as ```Geom```, ```Material```, ```Camera```, ```PathSegment``` and ```ShadeableIntersection```, store the necessary data for each component of the scene:
- ```Geom``` holds the type of primitive and its transformation matrices.
- ```Material``` includes flags for diffuse, reflective and refractive behaviour along with colours and emittance.
- ```Camera``` contains basis vectors and pixel size.
- ```PathSegment``` tracks the state of each ray in flight.

These structures are copied to device memory at initialization and reused on subsequent iterations to avoid costly transfers.

Intersections are handled in ```intersections.cu```. Box intersection tests transform the incoming ray into object space and compute intersections with an axis‑aligned unit cube. The sphere tests compute the quadratic roots of the ray-sphere equation. The code carefully distinguishes between rays hitting from outside versus inside objects, which matters for refraction. Intersection results are written into a ```ShadeableIntersection``` array, storing the hit distance, normal and material ID. Because this process is parallel across all rays, avoiding warp divergence in this stage improves throughput.

Random number generation is central to Monte‑Carlo integration. Each thread constructs a ```thrust::default_random_engine``` seeded by a hash of its pixel index, the iteration count and the remaining bounces. This ensures that successive iterations and neighbouring pixels receive uncorrelated random sequences, which in turn reduces structured noise patterns. Uniform random numbers drive decisions such as Russian roulette termination, scatter direction selection and direct lighting sample positions.

### Material Algorithms and Sampling
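For reference, the cosine-weighted sampler at the heart of the diffuse BSDF maps two uniform random numbers onto the unit disk and lifts the point onto the hemisphere. This condensed sketch mirrors the version in ```interactions.cu``` and reuses its ```makeONB``` basis helper; ```TWO_PI``` comes from ```utilities.h```.

```
// Cosine-weighted hemisphere sample around normal n (pdf = cos(theta) / pi).
__host__ __device__ glm::vec3 sampleCosineHemisphere(
    const glm::vec3& n, thrust::default_random_engine& rng)
{
    thrust::uniform_real_distribution<float> u01(0.f, 1.f);
    float u = u01(rng), v = u01(rng);
    float r = sqrtf(u);                    // radius on the unit disk
    float phi = TWO_PI * v;                // angle on the unit disk
    float z = sqrtf(fmaxf(0.f, 1.f - u)); // lift to the hemisphere: z = cos(theta)

    glm::vec3 s, t;
    makeONB(n, s, t);                      // orthonormal basis around n
    return glm::normalize(r * cosf(phi) * s + r * sinf(phi) * t + z * n);
}
```

Because the sample density is proportional to cos(theta), the cosine factor in the rendering equation cancels against the pdf, which is why the diffuse branch of ```scatterRay``` only multiplies the throughput by the albedo.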

+ +

Different materials require different scattering strategies. Diffuse surfaces scatter light uniformly in all directions around the surface normal. In practice we importance‑sample the cosine lobe to favour directions that contribute more energy, producing faster convergence than uniform sampling. Perfectly specular surfaces are modelled by reflecting the incoming ray about the surface normal; no random sampling is needed because the reflection direction is deterministic. For refractive materials the ray may either reflect or transmit based on Schlick’s approximation of the Fresnel equations, and the transmission direction is computed via Snell’s law using ```glm::refract```. The probability of reflection versus refraction depends on the angle of incidence and the material’s index of refraction.

Blending multiple lobes requires probability splitting. In the ```scatterRay``` function, a random number decides whether a ray should follow the diffuse, specular or refractive branch based on the relative weights of each component in the material. For example, a glossy dielectric could combine a rough diffuse base with a mirror‑like specular highlight. The algorithm ensures that the sum of the probability weights equals one to maintain unbiasedness. When a refractive material also emits light, such as a glowing glass bulb, the emission is handled separately by adding the material’s emittance to the throughput when the ray hits the surface.

Finally, the integrator employs stochastic Russian roulette to terminate rays probabilistically when their throughput becomes small. After a few bounces many paths contribute little energy, so continuing to trace them wastes computation. Russian roulette chooses to kill or keep a ray based on its remaining throughput, scaling the surviving paths accordingly so that the expectation remains unbiased. This technique balances noise and performance and is particularly useful in scenes with deep path depths.

### Depth‑of‑Field Parameter Study
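Both parameters studied in this section enter through primary-ray generation. The following is a hedged sketch of the thin-lens camera, not the exact project code; the field names follow ```Camera``` in ```sceneStructs.h```, and the pixel jitter doubles as the stochastic antialiasing described earlier.

```
__device__ Ray thinLensRay(const Camera& cam, int x, int y,
                           float aperture, float focusDist,
                           thrust::default_random_engine& rng)
{
    thrust::uniform_real_distribution<float> u01(0.f, 1.f);
    // Jitter the sample position inside the pixel footprint (antialiasing).
    float px = (float)x + u01(rng);
    float py = (float)y + u01(rng);
    glm::vec3 dir = glm::normalize(cam.view
        - cam.right * cam.pixelLength.x * (px - cam.resolution.x * 0.5f)
        - cam.up * cam.pixelLength.y * (py - cam.resolution.y * 0.5f));

    // Thin lens: pick a point on the circular aperture, then bend the ray
    // so it still passes through the same point on the focal plane.
    float r = aperture * sqrtf(u01(rng));
    float theta = TWO_PI * u01(rng);
    glm::vec3 lensPt = cam.position + r * (cosf(theta) * cam.right + sinf(theta) * cam.up);
    glm::vec3 focalPt = cam.position + focusDist * dir;

    Ray ray;
    ray.origin = lensPt;
    ray.direction = glm::normalize(focalPt - lensPt);
    return ray;
}
```

With ```aperture``` near zero this degenerates to a pinhole camera; enlarging it widens the blur kernel, exactly as the renders below demonstrate.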

+ +

Depth-of-field = 1 vs Depth-of-field = 32

+

+ +The thin‑lens model introduces two tunable parameters: the aperture radius and the focal distance. In this render the aperture radius is small, producing a relatively deep depth of field. Most of the scene appears sharp because rays originate from points close together on the lens. The blur kernel is small and only objects far from the focal plane soften. Such settings mimic photographs taken with high f‑numbers and are well suited to technical visualizations where clarity is paramount. + +Increasing the aperture radius enlarges the blur kernel. With a radius of 32 units the depth of field becomes very shallow. Only objects precisely on the focus plane remain sharp, while foreground and background dissolve into smooth gradients. Bokeh shapes become pronounced and highlights smear into circles, producing a cinematic feel. Large apertures require careful tuning of sample counts because the variation in ray origins can increase noise, but when combined with appropriate sampling and sorting strategies they create striking images. + +## Results and Visual Studies + +### Baseline Cornell Box + + +

+ +

Cornell Box Baseline

The first successful render reproduced a classic Cornell box with purely diffuse materials. Even at low sample counts the box demonstrates global illumination: the red and green walls bleed onto the white surfaces and the overhead light casts soft shadows. Implementing cosine‑weighted sampling was essential; uniform hemisphere sampling produced far noisier results. This baseline served as a reference for all subsequent features and performance measurements.

### Color Checker with Nine Balls

+ +

The Dragon balls

+ +To verify color handling and energy conservation, I arranged a 3×3 grid of matte spheres with different colors under the Cornell light. Each sphere reflects a tinted version of the floor and walls, confirming that multiple bounces propagate spectral information. The even spacing and soft edges also test antialiasing. Jittered primary rays prevent stair‑step artifacts along sphere silhouettes. This scene became my go‑to sanity check when modifying scattering routines. + +### Depth‑of‑Field Exploration + +

+ +

Camera blur

+

+ +Depth of field introduces pleasing blur outside the focal plane. In the moderate‑aperture render above, only the central objects remain sharp while the walls and floor softly defocus. The effect is controlled by the aperture radius and focus distance. Here a small lens radius creates subtle blur. Such focus cues help draw the viewer’s attention to important parts of a scene and add realism for photographic renders. + +Increasing the aperture radius produces a strong blur. In this heavy‑aperture example the sharp zone collapses into a thin slice, with both foreground and background falling out of focus. High dynamic range is especially noticeable around the light source, where bokeh shapes mirror the shape of the aperture. This setting accentuates the three‑dimensionality of the scene and is particularly effective in macro shots. + +### Field of View and Composition + +

+ +

FOV = 30 vs FOV = 75

+

+ +The vertical field of view influences both perspective distortion and compositional framing. With a narrow 30° FOV the Cornell box appears compressed and the central object dominates the frame. Vertical lines converge slowly, creating an almost telephoto look that feels intimate. Camera settings in the JSON file allow FOV and resolution to be tuned independently. + +A 75° FOV exaggerates perspective and includes more of the environment. The walls now feel divergent and the light appears further away. Such wide‑angle shots are useful for showing context or creating dynamic compositions. Care must be taken to avoid excessive distortion of objects near the frame edges. + +### Material Studies - Glass versus Mirror + +
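What separates the two spheres is a single number per bounce: the Fresnel reflectance. A minimal sketch of the reflect-or-refract decision, mirroring ```fresnelSchlick``` in ```interactions.cu``` (the wrapper function and its signature are illustrative):

```
// Schlick's approximation: reflectance rises from r0 at normal incidence
// toward 1 at grazing angles.
__host__ __device__ float schlick(float cosTheta, float etaI, float etaT) {
    float r0 = (etaI - etaT) / (etaI + etaT);
    r0 *= r0;
    float m = 1.f - cosTheta;
    return r0 + (1.f - r0) * m * m * m * m * m;
}

// One-sample dielectric: reflect with probability Fr, otherwise refract.
__device__ glm::vec3 sampleDielectric(const glm::vec3& wo, const glm::vec3& n,
                                      float ior, float u /* uniform in [0,1) */)
{
    float Fr = schlick(fabsf(glm::dot(wo, n)), 1.f, ior);
    if (u < Fr) return glm::reflect(-wo, n);         // mirror branch
    glm::vec3 wt = glm::refract(-wo, n, 1.f / ior);  // entering the medium
    // glm::refract returns the zero vector on total internal reflection.
    return (glm::dot(wt, wt) < 1e-10f) ? glm::reflect(-wo, n) : wt;
}
```

A perfect mirror is simply the ```Fr = 1``` limit of this routine, which is why it preserves shapes and reflects the room without attenuation.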

+ +

Glass and mirror ball

+

+ +Specular and refractive materials interact with light in very different ways. In the paired image above, the left sphere uses a glass material while the right uses a perfect mirror. The glass sphere bends light as it enters and exits, causing objects behind it to shift and magnify. Reflection and refraction contributions are blended according to the Fresnel term, so the glass appears more reflective at grazing angles. The mirror sphere, by contrast, preserves shapes perfectly and reflects the room without attenuation. This test verified that my implementation of Snell’s law and Schlick’s approximation was correct. + +### Index of Refraction Comparison + +

+ +

IOR = 0 vs IOR = 1.5

+

+ +Changing the index of refraction alters how strongly light bends when entering a material. An IOR near 1.0 behaves like air, producing almost no refraction and making the sphere appear nearly invisible. Only faint reflections give away the presence of the object. This scene also highlights that the specular lobes become less intense as the refractive component dominates. + +At an IOR of 1.5, typical for glass, the sphere clearly distorts the background. Caustics and total internal reflection appear near the contact point with the floor, and the Fresnel term makes the edges highly reflective. These results demonstrate that the path tracer can handle heterogeneous materials simply by adjusting a single parameter in the scene description. + +### Direct Lighting versus Path Sampling Only + +
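The left/right difference in this comparison comes from one extra shadow ray per diffuse hit. Below is a hedged sketch of that next-event estimation step; ```samplePointOnLight```, ```lightNormalAt``` and ```occluded``` are assumed helpers, not the project's exact API.

```
__device__ glm::vec3 sampleDirectLight(const glm::vec3& p, const glm::vec3& n,
                                       const glm::vec3& albedo,
                                       const Geom& light, const Material& lightMat,
                                       thrust::default_random_engine& rng)
{
    float area;
    glm::vec3 q = samplePointOnLight(light, rng, area);  // uniform point on the emitter
    glm::vec3 toL = q - p;
    float dist2 = glm::dot(toL, toL);
    float dist = sqrtf(dist2);
    glm::vec3 wi = toL / dist;

    float cosSurf = glm::dot(n, wi);                          // cosine at the shaded point
    float cosLight = glm::dot(lightNormalAt(light, q), -wi);  // cosine at the emitter
    if (cosSurf <= 0.f || cosLight <= 0.f) return glm::vec3(0.f);
    if (occluded(p + 1e-4f * n, wi, dist)) return glm::vec3(0.f);  // shadow ray

    // Lambertian BRDF = albedo / pi; the area pdf (1 / area) is converted to a
    // solid-angle measure through the geometry term G.
    float G = cosSurf * cosLight / dist2;
    return (albedo / PI) * lightMat.color * lightMat.emittance * G * area;
}
```

Sampling the emitter by area and weighting by the geometry term is exactly what removes the fireflies visible in the path-sampling-only render.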

+ +

Direct Lighting VS Path Sampling Only

+

+ +Direct lighting dramatically reduces image variance. When a ray hits a diffuse surface, an extra sample is taken directly toward a random point on an emissive surface. The contribution is weighted by the light’s area and distance. In the left image the ceiling lamp is sampled directly, yielding clean illumination on the floor and walls with relatively few samples. This technique shortens noise tails and makes scenes with small or bright lights converge much faster. + +Without direct lighting the same scene exhibits a large amount of speckle noise, especially under the light source and on the floor. Rays eventually sample the lamp by chance through many bounces, but these rare events create fireflies that take thousands of iterations to smooth out. This comparison highlights the importance of next‑event estimation in Monte‑Carlo integrators and underscores why many production renderers treat direct lighting separately. + +### Emissive Objects + +
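Hitting an emitter is the simplest shading case: the path's accumulated throughput picks up the emitted radiance and the path stops. Conceptually it reduces to the few lines below; in this codebase the emittance multiply may live in the shade kernel, while ```scatterRay``` just terminates the path.

```
// Simplified emissive branch: accumulate radiance, then terminate the path.
__device__ void shadeEmissive(PathSegment& path, const Material& m) {
    path.color *= m.color * m.emittance;  // emitted radiance scales the throughput
    path.remainingBounces = 0;            // lights end a path
}
```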

+ +

Emissive cornell box

+

+ +Replacing the ceiling lamp with a glowing sphere produces a very different mood. The emissive sphere casts light in all directions, creating soft, radial falloff on the surrounding walls. Because the light occupies more pixels, variance is reduced even without direct lighting. Emissive geometry is defined in the materials section of the scene file by setting the EMITTANCE value. This flexibility makes it trivial to add area lights, mesh lights or glowing props. + +### Solar System Scene + +

+ + + +

CUDA Solar System

+

As a playful stress test I modelled a mini solar system inside the Cornell box. The central white sphere represents the sun while smaller colored spheres orbit around it. Reflective and refractive materials intermingle to create complex multi‑bounce interactions. For example, the green and red walls reflect off glass orbs, which then refract the colored light back onto the walls. This scene also validates that my random seed strategy produces uncorrelated samples when many rays originate close together.

A different viewpoint of the same solar system scene shows how composition and camera angle influence visual weight. Tilting the camera down emphasizes the foreground spheres and makes the ceiling lamp less prominent. Wide angles also exaggerate distances between objects. Both frames were rendered with identical materials and iterations, underscoring the power of cinematography even in synthetic images.

For the final solar system rendering I increased the iteration count and enabled depth of field. The result exhibits smooth gradients, crisp specular highlights and subtle bokeh. Post‑processing with a denoiser like Intel’s Open Image Denoise could further suppress residual noise without increasing the sample count.

### Imported Geometry - Voxel Models

+ +

Minecastle return

+

+ + +To test complex geometry I imported a Minecraft‑style castle composed of hundreds of cubes. With hierarchical spatial data structures disabled, the integrator must check every ray against every primitive, resulting in longer render times. When acceleration structures are enabled, rays traverse a BVH on the GPU, significantly reducing intersections. The castle demonstrates that the path tracer can handle large numbers of primitives and that path depth interacts with geometry complexity. + +### Imported Geometry - Eiffel Tower + +

+ +

MineTower

+

+ +Another imported model, a voxelized Eiffel Tower, highlights the renderer’s ability to handle slender shapes and intricate silhouettes. Light leaks through the lattice work and casts tiny shadows on the floor. Because the tower is tall, many rays miss it entirely, emphasizing the importance of effective stream compaction to avoid wasting cycles on terminated paths. This scene also reveals that the default importance sampling may struggle with thin structures. Thus, a shadow‑catcher or direct lighting can help. + +### Additional Scenes and Experiments + +

+ +

Cornell box and shadow pole

+

This render places a refractive sphere in the corner of a high‑contrast Cornell box. The dark ceiling and floor accentuate the highlights and caustics, while the contrasting wall colors make it easy to see light transport through the sphere. By experimenting with different IOR values, roughness and wall colors I developed an intuition for how materials interact in confined spaces.

### Performance Analysis

I profiled the renderer using Nsight Compute and found that ray sorting and stream compaction are essential for good occupancy. In mixed‑material scenes like the glass‑and‑mirror configuration, sorting improved shading throughput by roughly 25 % by reducing warp divergence. Compaction, meanwhile, removed between 40 and 80 % of rays after three bounces in open scenes but provided only marginal benefit in closed scenes, where paths bounce until the depth limit is exhausted. Optional wavefront path tracing could push performance further by assigning each material its own kernel, as suggested by NVIDIA’s research comparing wavefront scheduling with megakernels.

Although this implementation runs entirely on the GPU, an interesting thought experiment is to consider a CPU version. Because rays can be processed independently, the algorithm exhibits massive parallelism. A CPU would need thousands of cores to match even a mid‑range GPU. Memory latency would also become a bottleneck without careful tiling and cache‑aware data structures. Therefore implementing the tracer on the GPU is a significant advantage over a hypothetical CPU‑only version.

For further improvements, I would like to experiment with hierarchical spatial data structures to accelerate ray-primitive intersections, importance sampling for glossy surfaces based on microfacet models and denoising with auxiliary buffers. These extensions promise to increase realism and decrease render times while remaining faithful to physically based rendering principles.

### 🔧 Detailed Performance Metrics

| Feature | Configuration | Avg ms/frame |
|----------------------------------------|------------------------------------|--------------:|
| **Refraction** | IOR = 1.5 | 27.284 |
| | IOR = 0.0 | 26.584 |
| **Depth of Field** | Depth = 32 | 42.687 |
| | Depth = 1 | 16.684 |
| **Field of View (FOV)** | 30° | 33.784 |
| | 75° | 28.546 |
| **Russian Roulette** | Enabled | 32.347 |
| | Disabled | 25.398 |
| **Stream Compaction** | Enabled | 31.126 |
| | Disabled | 25.942 |
| **Material Sorting (BSDF)** | Enabled | 30.726 |
| | Disabled | 16.824 |
| **Next-Event Estimation (Direct Light)** | Enabled | 27.480 |
| | Disabled | 28.400 |
| **Post FX (Tone Mapping / Bloom)** | Enabled | 16.247 |
| | Disabled | 16.247 |

> **Notes:**
> - Performance was measured in **average milliseconds per frame**; lower is faster.
> - All tests were conducted under the same scene and resolution conditions.
> - Russian roulette, stream compaction, and material sorting all added measurable overhead in this test scene; their payoff depends on scene structure, as discussed below.
> - Next-event estimation reduced noise and even gave a small speedup here, though the effect depends on light-sampling variance.
> - Post FX operations are purely visual and have negligible performance impact.

In addition to qualitative insights, I gathered quantitative measurements for many of the features. The table above shows the average frame time (ms) under various configurations. Lower numbers correspond to faster renders, so we can see which features pay off in practice.
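As an aside on methodology, per-frame timings like those above are commonly captured with CUDA events around the ```pathtrace``` call. This is a minimal sketch of such instrumentation, not the project's exact harness:

```
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
pathtrace(pbo_dptr, frame, iteration);   // the work being measured
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed milliseconds for this frame
// Average ms over many frames to obtain the table's avg ms/frame.

cudaEventDestroy(start);
cudaEventDestroy(stop);
```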
Notably, a deep path depth for depth‑of‑field (32 bounces) nearly doubles the render time compared to a single bounce. Similarly, a narrow 30° field of view is slower than a wide 75° view, likely due to increased scene coverage and ray divergence. Russian roulette termination, which probabilistically kills low‑contribution paths, increased overhead slightly in this implementation, while enabling stream compaction or sorting by material added non‑trivial GPU work and yielded slower frame times. Direct lighting’s next‑event estimation provided a modest performance boost only in certain scenes, and post‑processing (e.g., tone mapping) did not alter frame time at all. + +These empirical results emphasize that optimization choices are context‑dependent: in some cases a theoretical improvement may be outweighed by implementation overhead. When designing a renderer it is important to profile each optimization separately and weigh the trade‑offs between quality, noise reduction and render time. The numbers also highlight how subtle differences in camera parameters or material settings can have measurable performance implications, reinforcing that physical realism and computational efficiency are deeply intertwined in path tracing. + +### Development Process and Early Iterations + +

+ +

+ +Developing this path tracer began with a very simple diffuse Cornell box rendered at low iteration counts. The initial images were dark, noisy and lacked any material richness because the shading kernel simply accumulated a single lambertian bounce before terminating. At this stage I verified that camera rays were correctly generated according to the JSON specification and that intersections with spheres and boxes produced expected normals and hit distances. After sorting rays by material type and adding stream compaction, the early renders converged much faster and the output buffer no longer filled with terminated paths. These improvements laid the foundation for the more sophisticated scenes and features described throughout this README. + +### Noise and Convergence Study + +Monte‑Carlo path tracing is inherently noisy at low sample counts because each pixel accumulates only a handful of random light paths. The image above illustrates the same solar‑system composition rendered with a very small number of iterations. The coloured orbs and walls are barely discernible behind a sea of bright speckles and banding. As more samples are accumulated the variance decreases proportionally to the inverse square root of the iteration count, so doubling the sample count cuts noise by roughly 30 %. This study convinced me of the importance of antialiasing, stratification and direct lighting. Without these techniques the noise floor remains unacceptably high for scenes containing small light sources or high dynamic range. In production workflows a denoiser such as Intel’s Open Image Denoise can be applied to intermediate passes to accelerate convergence even further. + +### Depth‑of‑Field Extremes + +

+ +

One of the strengths of the thin‑lens implementation is the ability to dial in creative blur by varying the lens aperture radius. Setting a relatively large aperture produces a pronounced bokeh effect: foreground and background objects melt into soft blobs while the focal plane stays sharp. This extreme depth of field helps isolate a subject and adds cinematic character, but it also requires more samples because each jittered ray sees a slightly different scene. In practice I found that balancing aperture size and iteration count is essential. Too much blur with too few samples leaves the frame blotchy, whereas a moderate blur at higher sample counts preserves detail and smooths noise.

Pushing the aperture radius even further leads to surreal images where only a thin slice of space remains in focus. The ceiling lamp and walls dissolve into gradients while the spheres become ghostly halos. Although physically plausible, such extreme blur is seldom used outside of artistic experimentation because it sacrifices spatial context and increases render time. This final experiment underscored the flexibility of the lens model: by simply adjusting one parameter in the scene file, the renderer can mimic everything from a pinhole camera to a large‑format lens.

### Shadow and Occlusion Exploration

Understanding shadows is crucial to appreciating global illumination. In this corner scene a tall dark box acts as an occluder, preventing light from the ceiling panel from reaching the blue sphere. As a result the floor behind the box remains in shadow while the foreground receives a soft penumbra. The path tracer computes these effects automatically by following rays through multiple bounces: rays blocked by geometry contribute no light, whereas those that skirt the occluder pick up reflected colour from the walls and floor. Adding direct lighting accelerates convergence in such scenes because rays are explicitly sampled toward the light, but even without it the algorithm faithfully reproduces subtle contact shadows and indirect bounce lighting.

### Creative Variants and FOV Exploration

Beyond technical correctness, rendering is about storytelling. I explored numerous compositions and lens settings to see how they influenced mood and readability. In some frames I positioned the camera at unconventional angles to create tension or dynamism. In others I used a telephoto FOV to compress space and emphasize scale differences between objects. Colour plays a major role as well: by swapping wall hues or adding emissive props, the atmosphere shifts from clinical to playful. These artistic experiments are made possible by the flexible JSON scene format: adjusting a handful of fields in the file changes the entire look of the render without modifying any C++ code.

## Conclusion

This CUDA path tracer has grown from a simple diffuse Cornell box into a flexible rendering system capable of producing a variety of visually rich scenes. By adding core features like BSDF evaluation, path sorting and stochastic sampling and layering on optional effects such as depth of field, direct lighting and refraction, the renderer achieves realistic global illumination and cinematic control. Throughout the project I learned how light behaves in complex environments, how GPU parallelism can be harnessed to simulate that behaviour efficiently and how careful design of scene descriptions and sampling strategies can make or break the visual result.
The accompanying images showcase both the technical achievements and the artistic possibilities unlocked by this work. + +## Bloopers + +

+ + +

diff --git a/src/.project b/src/.project
new file mode 100644
index 000000000..f7a36d9ba
--- /dev/null
+++ b/src/.project
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>Project3-CUDA-Path-Tracer</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>clean,full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+	</natures>
+</projectDescription>
diff --git a/src/GNUmakefile b/src/GNUmakefile
new file mode 100644
index 000000000..8d548aed6
--- /dev/null
+++ b/src/GNUmakefile
@@ -0,0 +1,34 @@
+CMAKE_ALT1 := /usr/local/bin/cmake
+CMAKE_ALT2 := /Applications/CMake.app/Contents/bin/cmake
+CMAKE := $(shell \
+	which cmake 2>/dev/null || \
+	([ -e ${CMAKE_ALT1} ] && echo "${CMAKE_ALT1}") || \
+	([ -e ${CMAKE_ALT2} ] && echo "${CMAKE_ALT2}") \
+	)
+
+all: Release
+
+
+Debug: build
+	(cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make)
+
+MinSizeRel: build
+	(cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make)
+
+Release: build
+	(cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make)
+
+RelWithDebugInfo: build
+	(cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make)
+
+
+run:
+	build/cis565_path_tracer scenes/sphere.txt
+
+build:
+	mkdir -p build
+
+clean:
+	((cd build && make clean) 2>&- || true)
+
+.PHONY: all Debug MinSizeRel Release RelWithDebugInfo clean
diff --git a/src/glslUtility.cpp b/src/glslUtility.cpp
index 04c6da0a8..40e4fbe22 100644
--- a/src/glslUtility.cpp
+++ b/src/glslUtility.cpp
@@ -63,8 +63,6 @@ char* loadFile(const char *fname, GLint &fSize)
 }
 
 // printShaderInfoLog
-// From OpenGL Shading Language 3rd Edition, p215-216
-// Display (hopefully) useful error messages if shader fails to compile
 void printShaderInfoLog(GLint shader)
 {
     int infoLogLen = 0;
@@ -76,7 +74,6 @@ void printShaderInfoLog(GLint shader)
     if (infoLogLen > 1)
     {
         infoLog = new GLchar[infoLogLen];
-        // error check for fail to allocate memory omitted
         glGetShaderInfoLog(shader, infoLogLen, &charsWritten, infoLog);
         std::cout << "InfoLog:" << std::endl << infoLog << std::endl;
         delete [] infoLog;
@@ -94,7 +91,6 @@ void printLinkInfoLog(GLint prog)
     if (infoLogLen > 1)
     {
         infoLog = new GLchar[infoLogLen];
-        // error check for fail to allocate memory omitted
        glGetProgramInfoLog(prog, infoLogLen, &charsWritten, infoLog);
        std::cout << "InfoLog:" << std::endl << infoLog << std::endl;
        delete [] infoLog;
@@ -137,33 +133,6 @@ shaders_t loadDefaultShaders()
     return out;
 }
 
-shaders_t loadShaders(const char * vert_path, const char * frag_path, const char * geom_path = 0)
-{
-    shaders_t out;
-
-    // load shaders & get length of each
-    GLint vlen, flen, glen;
-    char *vertexSource, *fragmentSource, *geometrySource;
-    const char *vv, *ff, *gg;
-
-    vertexSource = loadFile(vert_path, vlen);
-    vv = vertexSource;
-    compileShader("Vertex", vv, GL_VERTEX_SHADER, (GLint&)out.vertex);
-
-    fragmentSource = loadFile(frag_path, flen);
-    ff = fragmentSource;
-    compileShader("Fragment", ff, GL_FRAGMENT_SHADER, (GLint&)out.fragment);
-
-    if (geom_path)
-    {
-        geometrySource = loadFile(geom_path, glen);
-        gg = geometrySource;
-        compileShader("Geometry", gg, GL_GEOMETRY_SHADER, (GLint&)out.geometry);
-    }
-
-    return out;
-}
-
 void attachAndLinkProgram( GLuint program, shaders_t shaders)
 {
     glAttachShader(program, shaders.vertex);
@@ -201,17 +170,7 @@ GLuint createProgram(
     const char *attributeLocations[],
     GLuint numberOfLocations)
 {
-    glslUtility::shaders_t shaders = glslUtility::loadShaders(vertexShaderPath, fragmentShaderPath);
-
-    GLuint program = glCreateProgram();
-
-    for (GLuint i = 0; i < numberOfLocations; ++i)
-    {
-        glBindAttribLocation(program, i, attributeLocations[i]);
-    }
-
-    glslUtility::attachAndLinkProgram(program, shaders);
-
-    return program;
+    // optional path-based loader; not used in this project
+    return 0;
 }
 } // namespace glslUtility
diff --git a/src/interactions.cu b/src/interactions.cu
index 676d8f814..f4afb7b66 100644
--- a/src/interactions.cu
+++ b/src/interactions.cu
@@ -1,57 +1,123 @@
-#include "interactions.h"
-
+#include "interactions.h"
 #include "utilities.h"
+#include <glm/gtx/norm.hpp>
-#include <thrust/random.h>
+// Build an orthonormal basis given normal n
+__host__ __device__ static inline void makeONB(const glm::vec3& n, glm::vec3& s, glm::vec3& t) {
+    if (fabsf(n.x) > fabsf(n.z)) s = glm::normalize(glm::vec3(-n.y, n.x, 0));
+    else s = glm::normalize(glm::vec3(0, -n.z, n.y));
+    t = glm::cross(n, s);
+}
+// Cosine-weighted random direction in hemisphere
 __host__ __device__ glm::vec3 calculateRandomDirectionInHemisphere(
-    glm::vec3 normal,
-    thrust::default_random_engine &rng)
+    glm::vec3 normal, thrust::default_random_engine& rng)
 {
-    thrust::uniform_real_distribution<float> u01(0, 1);
-
-    float up = sqrt(u01(rng)); // cos(theta)
-    float over = sqrt(1 - up * up); // sin(theta)
-    float around = u01(rng) * TWO_PI;
-
-    // Find a direction that is not the normal based off of whether or not the
-    // normal's components are all equal to sqrt(1/3) or whether or not at
-    // least one component is less than sqrt(1/3). Learned this trick from
-    // Peter Kutz.
+    // Cosine-weighted sample on the unit hemisphere (PBRT Diffuse).
+    thrust::uniform_real_distribution<float> u01(0.f, 1.f);
+    float u = u01(rng);
+    float v = u01(rng);
+    float r = sqrtf(u);
+    float phi = TWO_PI * v;
 
-    glm::vec3 directionNotNormal;
-    if (abs(normal.x) < SQRT_OF_ONE_THIRD)
-    {
-        directionNotNormal = glm::vec3(1, 0, 0);
-    }
-    else if (abs(normal.y) < SQRT_OF_ONE_THIRD)
-    {
-        directionNotNormal = glm::vec3(0, 1, 0);
-    }
-    else
-    {
-        directionNotNormal = glm::vec3(0, 0, 1);
-    }
+    // local
+    float x = r * cosf(phi);
+    float y = r * sinf(phi);
+    float z = sqrtf(fmaxf(0.f, 1.f - u));
 
-    // Use not-normal direction to generate two perpendicular directions
-    glm::vec3 perpendicularDirection1 =
-        glm::normalize(glm::cross(normal, directionNotNormal));
-    glm::vec3 perpendicularDirection2 =
-        glm::normalize(glm::cross(normal, perpendicularDirection1));
+    // world
+    glm::vec3 n = glm::normalize(normal), s, t;
+    makeONB(n, s, t);
+    return glm::normalize(x * s + y * t + z * n);
+}
 
-    return up * normal
-        + cos(around) * over * perpendicularDirection1
-        + sin(around) * over * perpendicularDirection2;
+// Schlick Fresnel for dielectrics (n_i -> n_t)
+// Returns the reflection coefficient
+__host__ __device__ static inline float fresnelSchlick(float cosTheta, float etaI, float etaT) {
+    float r0 = (etaI - etaT) / (etaI + etaT);
+    r0 *= r0;
+    float m = 1.f - cosTheta;
+    return r0 + (1.f - r0) * m * m * m * m * m;
 }
 
+// Scatter ray upon intersection
 __host__ __device__ void scatterRay(
-    PathSegment & pathSegment,
+    PathSegment& pathSegment,
     glm::vec3 intersect,
     glm::vec3 normal,
-    const Material &m,
-    thrust::default_random_engine &rng)
-{
-    // TODO: implement this.
-    // A basic implementation of pure-diffuse shading will just call the
-    // calculateRandomDirectionInHemisphere defined above.
+    const Material& m,
+    thrust::default_random_engine& rng)
+{
+    // normal must be normalized
+    glm::vec3 n = glm::normalize(normal);
+    glm::vec3 wo = -glm::normalize(pathSegment.ray.direction);
+
+    if (m.emittance > 0.f) { // light: terminate
+        pathSegment.remainingBounces = 0;
+        return;
+    }
+
+    // rng
+    thrust::uniform_real_distribution<float> u01(0.f, 1.f);
+
+    // Dielectric (refraction)
+    // Handle reflection and refraction with Fresnel
+    if (m.hasRefractive > 0.f) {
+        float etaI = 1.f;
+        float etaT = fmaxf(1e-6f, m.indexOfRefraction);
+        float cosI = glm::dot(wo, n);
+        bool entering = (cosI > 0.f);
+        glm::vec3 nl = entering ? n : -n;
+        float eta = entering ?
(etaI / etaT) : (etaT / etaI); + float Fr = fresnelSchlick(fabsf(cosI), etaI, etaT); // Schlick + + // Russian roulette + // reflect with probability Fr, else transmit + // Note that we don't need to divide by pdf here + // because we multiply the throughput by the albedo (m.color) + if (u01(rng) < Fr) { + // reflect + glm::vec3 wr = glm::reflect(-wo, nl); + pathSegment.ray.origin = intersect + nl * 1e-4f; + pathSegment.ray.direction = glm::normalize(wr); + pathSegment.color *= m.color; + } + + else { + // transmit + glm::vec3 wt = glm::refract(-wo, nl, eta); + if (glm::length2(wt) < 1e-10f) { + // TIR -> reflect + glm::vec3 wr = glm::reflect(-wo, nl); + pathSegment.ray.origin = intersect + nl * 1e-4f; + pathSegment.ray.direction = glm::normalize(wr); + } + + // no TIR -> refract + else { + pathSegment.ray.origin = intersect - nl * 1e-4f; + pathSegment.ray.direction = glm::normalize(wt); + } + pathSegment.color *= m.color; // No Fresnel division since we use Russian roulette + } + pathSegment.remainingBounces--; + return; + } + + // Perfect mirror + if (m.hasReflective > 0.f) { + glm::vec3 wr = glm::reflect(-wo, n); + pathSegment.ray.origin = intersect + n * 1e-4f; + pathSegment.ray.direction = glm::normalize(wr); + pathSegment.color *= m.color; + pathSegment.remainingBounces--; + return; + } + + // Diffuse (Lambert) + glm::vec3 wi = calculateRandomDirectionInHemisphere(n, rng); + pathSegment.ray.origin = intersect + n * 1e-4f; + pathSegment.ray.direction = glm::normalize(wi); + pathSegment.color *= m.color; // Cos-weighted BRDF/PDF cancel + pathSegment.remainingBounces--; } diff --git a/src/interactions.h b/src/interactions.h index a8b5f77ab..782fe4801 100644 --- a/src/interactions.h +++ b/src/interactions.h @@ -40,9 +40,12 @@ __host__ __device__ glm::vec3 calculateRandomDirectionInHemisphere( * * You may need to change the parameter list for your purposes! */ + + // pass iteration & depth so scatter can use low-discrepancy sequences __host__ __device__ void scatterRay( PathSegment& pathSegment, glm::vec3 intersect, glm::vec3 normal, const Material& m, thrust::default_random_engine& rng); + diff --git a/src/intersections.cu b/src/intersections.cu index 0f9cd3d15..1a3b6eab1 100644 --- a/src/intersections.cu +++ b/src/intersections.cu @@ -1,113 +1,83 @@ +// src/intersections.cu #include "intersections.h" +#include -__host__ __device__ float boxIntersectionTest( - Geom box, - Ray r, - glm::vec3 &intersectionPoint, - glm::vec3 &normal, - bool &outside) -{ - Ray q; - q.origin = multiplyMV(box.inverseTransform, glm::vec4(r.origin , 1.0f)); - q.direction = glm::normalize(multiplyMV(box.inverseTransform, glm::vec4(r.direction, 0.0f))); - - float tmin = -1e38f; - float tmax = 1e38f; - glm::vec3 tmin_n; - glm::vec3 tmax_n; - for (int xyz = 0; xyz < 3; ++xyz) - { - float qdxyz = q.direction[xyz]; - /*if (glm::abs(qdxyz) > 0.00001f)*/ - { - float t1 = (-0.5f - q.origin[xyz]) / qdxyz; - float t2 = (+0.5f - q.origin[xyz]) / qdxyz; - float ta = glm::min(t1, t2); - float tb = glm::max(t1, t2); - glm::vec3 n; - n[xyz] = t2 < t1 ? 
+1 : -1; - if (ta > 0 && ta > tmin) - { - tmin = ta; - tmin_n = n; - } - if (tb < tmax) - { - tmax = tb; - tmax_n = n; - } - } - } - - if (tmax >= tmin && tmax > 0) - { - outside = true; - if (tmin <= 0) - { - tmin = tmax; - tmin_n = tmax_n; - outside = false; - } - intersectionPoint = multiplyMV(box.transform, glm::vec4(getPointOnRay(q, tmin), 1.0f)); - normal = glm::normalize(multiplyMV(box.invTranspose, glm::vec4(tmin_n, 0.0f))); - return glm::length(r.origin - intersectionPoint); - } - - return -1; +// Transformations +__host__ __device__ static inline glm::vec3 xformPoint(const glm::mat4& m, const glm::vec3& p) { + return glm::vec3(m * glm::vec4(p, 1.f)); } +__host__ __device__ static inline glm::vec3 xformVector(const glm::mat4& m, const glm::vec3& v) { + return glm::vec3(m * glm::vec4(v, 0.f)); +} + +// Sphere intersection test __host__ __device__ float sphereIntersectionTest( - Geom sphere, - Ray r, - glm::vec3 &intersectionPoint, - glm::vec3 &normal, - bool &outside) + Geom sphere, Ray r, glm::vec3& outP, glm::vec3& outN, bool& outside) { - float radius = .5; - - glm::vec3 ro = multiplyMV(sphere.inverseTransform, glm::vec4(r.origin, 1.0f)); - glm::vec3 rd = glm::normalize(multiplyMV(sphere.inverseTransform, glm::vec4(r.direction, 0.0f))); - - Ray rt; - rt.origin = ro; - rt.direction = rd; - - float vDotDirection = glm::dot(rt.origin, rt.direction); - float radicand = vDotDirection * vDotDirection - (glm::dot(rt.origin, rt.origin) - powf(radius, 2)); - if (radicand < 0) - { - return -1; - } - - float squareRoot = sqrt(radicand); - float firstTerm = -vDotDirection; - float t1 = firstTerm + squareRoot; - float t2 = firstTerm - squareRoot; - - float t = 0; - if (t1 < 0 && t2 < 0) - { - return -1; - } - else if (t1 > 0 && t2 > 0) - { - t = min(t1, t2); - outside = true; - } - else - { - t = max(t1, t2); - outside = false; - } - - glm::vec3 objspaceIntersection = getPointOnRay(rt, t); - - intersectionPoint = multiplyMV(sphere.transform, glm::vec4(objspaceIntersection, 1.f)); - normal = glm::normalize(multiplyMV(sphere.invTranspose, glm::vec4(objspaceIntersection, 0.f))); - if (!outside) - { - normal = -normal; - } - - return glm::length(r.origin - intersectionPoint); + // Ray -> object space + glm::vec3 ro = xformPoint(sphere.inverseTransform, r.origin); + glm::vec3 rd = glm::normalize(xformVector(sphere.inverseTransform, r.direction)); + + // unit sphere radius 0.5 at origin + float a = glm::dot(rd, rd); + float b = 2.f * glm::dot(rd, ro); + float c = glm::dot(ro, ro) - 0.25f; + + float disc = b * b - 4.f * a * c; + if (disc < 0.f) return -1.f; + + float sdisc = sqrtf(disc); + float t0 = (-b - sdisc) / (2.f * a); + float t1 = (-b + sdisc) / (2.f * a); + float t = (t0 > 0.f) ? t0 : ((t1 > 0.f) ? 
t1 : -1.f); + if (t <= 0.f) return -1.f; + + glm::vec3 pObj = ro + t * rd; + glm::vec3 nObj = glm::normalize(pObj); + + glm::vec3 pW = xformPoint(sphere.transform, pObj); + glm::vec3 nW = glm::normalize(xformVector(sphere.invTranspose, nObj)); + + outP = pW; + outN = nW; + outside = glm::dot(glm::normalize(r.direction), nW) < 0.f; + return glm::length(pW - r.origin); } + +__host__ __device__ float boxIntersectionTest( + Geom box, Ray r, glm::vec3& outP, glm::vec3& outN, bool& outside) +{ + // Ray -> object space + glm::vec3 ro = xformPoint(box.inverseTransform, r.origin); + glm::vec3 rd = glm::normalize(xformVector(box.inverseTransform, r.direction)); + + // slabs [-0.5, 0.5]^3 + glm::vec3 t1 = (-0.5f - ro) / rd; + glm::vec3 t2 = (0.5f - ro) / rd; + + glm::vec3 tminv = glm::min(t1, t2); + glm::vec3 tmaxv = glm::max(t1, t2); + float tmin = fmaxf(fmaxf(tminv.x, tminv.y), tminv.z); + float tmax = fminf(fminf(tmaxv.x, tmaxv.y), tmaxv.z); + + if (tmax < 0.f || tmin > tmax) return -1.f; + + float t = (tmin > 0.f) ? tmin : tmax; + glm::vec3 pObj = ro + t * rd; + + // compute normal in object space + glm::vec3 nObj(0); + glm::vec3 ap = glm::abs(pObj); + if (ap.x > ap.y && ap.x > ap.z) nObj = glm::vec3((pObj.x > 0.f) ? 1.f : -1.f, 0, 0); + else if (ap.y > ap.z) nObj = glm::vec3(0, (pObj.y > 0.f) ? 1.f : -1.f, 0); + else nObj = glm::vec3(0, 0, (pObj.z > 0.f) ? 1.f : -1.f); + + glm::vec3 pW = xformPoint(box.transform, pObj); + glm::vec3 nW = glm::normalize(xformVector(box.invTranspose, nObj)); + + outP = pW; + outN = nW; + outside = glm::dot(glm::normalize(r.direction), nW) < 0.f; + return glm::length(pW - r.origin); +} \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 330725251..115547477 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -59,6 +59,33 @@ GuiDataContainer* imguiData = NULL; ImGuiIO* io = nullptr; bool mouseOverImGuiWinow = false; +// Restartable rendering +static std::string checkpointPath; +static void saveCheckpoint() { + if (!renderState) return; + std::ofstream out(checkpointPath, std::ios::binary); + if (!out) { std::cerr << "Checkpoint save failed\n"; return; } + out.write((const char*)&width, sizeof(int)); + out.write((const char*)&height, sizeof(int)); + out.write((const char*)&iteration, sizeof(int)); + out.write((const char*)renderState->image.data(), sizeof(glm::vec3) * width * height); + std::cout << "Saved checkpoint: " << checkpointPath << " (" << iteration << " spp)\n"; +} +static void tryLoadCheckpoint() { + std::ifstream in(checkpointPath, std::ios::binary); + if (!in) return; + int w,h,iter; + in.read((char*)&w, sizeof(int)); + in.read((char*)&h, sizeof(int)); + in.read((char*)&iter, sizeof(int)); + if (w==width && h==height) { + renderState->image.resize(w*h); + in.read((char*)renderState->image.data(), sizeof(glm::vec3) * w * h); + iteration = iter; + std::cout << "Loaded checkpoint: " << checkpointPath << " (" << iteration << " spp)\n"; + } +} + // Forward declarations for window loop and interactivity void runCuda(); void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods); @@ -74,12 +101,14 @@ std::string currentTimeString() return std::string(buf); } -//------------------------------- -//----------SETUP STUFF---------- -//------------------------------- +// +// SETUP STUFF +// +// Initialize texture that will display the image void initTextures() -{ +{ + // Create texture for display glGenTextures(1, &displayImage); glBindTexture(GL_TEXTURE_2D, displayImage); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, 
GL_NEAREST); @@ -87,6 +116,7 @@ void initTextures() glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL); } +// Initialize vertex array object void initVAO(void) { GLfloat vertices[] = { @@ -96,6 +126,7 @@ void initVAO(void) -1.0f, 1.0f, }; + // Texture coordinates are upside down because images are flipped in y GLfloat texcoords[] = { 1.0f, 1.0f, 0.0f, 1.0f, @@ -103,11 +134,14 @@ void initVAO(void) 1.0f, 0.0f }; + // Counter-clockwise ordering GLushort indices[] = { 0, 1, 3, 3, 1, 2 }; + // Create and bind the vertex array object GLuint vertexBufferObjID[3]; glGenBuffers(3, vertexBufferObjID); + // Set up vertex arrays glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[0]); glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW); glVertexAttribPointer((GLuint)positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0); @@ -122,13 +156,15 @@ void initVAO(void) glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW); } +// Create the shader program GLuint initShader() -{ +{ + // Create shader const char* attribLocations[] = { "Position", "Texcoords" }; GLuint program = glslUtility::createDefaultProgram(attribLocations, 2); GLint location; - //glUseProgram(program); + // Bind the texture to texture unit 0 if ((location = glGetUniformLocation(program, "u_image")) != -1) { glUniform1i(location, 0); @@ -139,18 +175,17 @@ GLuint initShader() void deletePBO(GLuint* pbo) { + // unregister this buffer object with CUDA if (pbo) { - // unregister this buffer object with CUDA cudaGLUnregisterBufferObject(*pbo); - glBindBuffer(GL_ARRAY_BUFFER, *pbo); glDeleteBuffers(1, pbo); - *pbo = (GLuint)NULL; } } +// Delete texture void deleteTexture(GLuint* tex) { glDeleteTextures(1, tex); @@ -159,47 +194,36 @@ void deleteTexture(GLuint* tex) void cleanupCuda() { - if (pbo) - { - deletePBO(&pbo); - } - if (displayImage) - { - deleteTexture(&displayImage); - } + if (pbo) { deletePBO(&pbo); } + if (displayImage) { deleteTexture(&displayImage); } } +// Initialize CUDA with OpenGL interop void initCuda() { cudaGLSetGLDevice(0); - - // Clean up on program exit atexit(cleanupCuda); } +// Create pixel buffer object for display void initPBO() { - // set up vertex data parameter int num_texels = width * height; int num_values = num_texels * 4; int size_tex_data = sizeof(GLubyte) * num_values; - - // Generate a buffer ID called a PBO (Pixel Buffer Object) glGenBuffers(1, &pbo); - - // Make this the current UNPACK buffer (OpenGL is state-based) glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo); - - // Allocate data for the buffer. 
4-channel 8-bit image glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY); cudaGLRegisterBufferObject(pbo); } +// GLFW callback functions void errorCallback(int error, const char* description) { fprintf(stderr, "%s\n", description); } +// Initialize OpenGL, CUDA, and GLFW bool init() { glfwSetErrorCallback(errorCallback); @@ -220,14 +244,12 @@ bool init() glfwSetCursorPosCallback(window, mousePositionCallback); glfwSetMouseButtonCallback(window, mouseButtonCallback); - // Set up GL context glewExperimental = GL_TRUE; if (glewInit() != GLEW_OK) { return false; } printf("Opengl Version:%s\n", glGetString(GL_VERSION)); - //Set up ImGui IMGUI_CHECKVERSION(); ImGui::CreateContext(); @@ -236,7 +258,6 @@ bool init() ImGui_ImplGlfw_InitForOpenGL(window, true); ImGui_ImplOpenGL3_Init("#version 120"); - // Initialize other stuff initVAO(); initTextures(); initCuda(); @@ -249,13 +270,9 @@ bool init() return true; } -void InitImguiData(GuiDataContainer* guiData) -{ - imguiData = guiData; -} - +// IMGUI STUFF +void InitImguiData(GuiDataContainer* guiData) { imguiData = guiData; } -// LOOK: Un-Comment to check ImGui Usage void RenderImGui() { mouseOverImGuiWinow = io->WantCaptureMouse; @@ -264,43 +281,22 @@ void RenderImGui() ImGui_ImplGlfw_NewFrame(); ImGui::NewFrame(); - bool show_demo_window = true; - bool show_another_window = false; - ImVec4 clear_color = ImVec4(0.45f, 0.55f, 0.60f, 1.00f); - static float f = 0.0f; - static int counter = 0; - - ImGui::Begin("Path Tracer Analytics"); // Create a window called "Hello, world!" and append into it. - - // LOOK: Un-Comment to check the output window and usage - //ImGui::Text("This is some useful text."); // Display some text (you can use a format strings too) - //ImGui::Checkbox("Demo Window", &show_demo_window); // Edit bools storing our window open/close state - //ImGui::Checkbox("Another Window", &show_another_window); - - //ImGui::SliderFloat("float", &f, 0.0f, 1.0f); // Edit 1 float using a slider from 0.0f to 1.0f - //ImGui::ColorEdit3("clear color", (float*)&clear_color); // Edit 3 floats representing a color - - //if (ImGui::Button("Button")) // Buttons return true when clicked (most widgets return true when edited/activated) - // counter++; - //ImGui::SameLine(); - //ImGui::Text("counter = %d", counter); + ImGui::Begin("Path Tracer Analytics"); ImGui::Text("Traced Depth %d", imguiData->TracedDepth); - ImGui::Text("Application average %.3f ms/frame (%.1f FPS)", 1000.0f / ImGui::GetIO().Framerate, ImGui::GetIO().Framerate); + ImGui::Text("Application average %.3f ms/frame (%.1f FPS)", + 1000.0f / ImGui::GetIO().Framerate, ImGui::GetIO().Framerate); ImGui::End(); - ImGui::Render(); ImGui_ImplOpenGL3_RenderDrawData(ImGui::GetDrawData()); - } -bool MouseOverImGuiWindow() -{ - return mouseOverImGuiWinow; -} +// Returns whether the mouse is currently over an ImGui window +bool MouseOverImGuiWindow() { return mouseOverImGuiWinow; } void mainLoop() { + // Main loop while (!glfwWindowShouldClose(window)) { glfwPollEvents(); @@ -314,13 +310,9 @@ void mainLoop() glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, NULL); glClear(GL_COLOR_BUFFER_BIT); - // Binding GL_PIXEL_UNPACK_BUFFER back to default glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - // VAO, shader program, and texture already bound glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, 0); - // Render ImGui Stuff RenderImGui(); glfwSwapBuffers(window); @@ -334,10 +326,6 @@ void mainLoop() glfwTerminate(); } -//------------------------------- 
-//-------------MAIN-------------- -//------------------------------- - int main(int argc, char** argv) { startTimeString = currentTimeString(); @@ -350,13 +338,9 @@ int main(int argc, char** argv) const char* sceneFile = argv[1]; - // Load scene file scene = new Scene(sceneFile); - - //Create Instance for ImGUIData guiData = new GuiDataContainer(); - // Set up camera stuff from loaded path tracer settings iteration = 0; renderState = &scene->state; Camera& cam = renderState->camera; @@ -370,8 +354,6 @@ int main(int argc, char** argv) cameraPosition = cam.position; - // compute phi (horizontal) and theta (vertical) relative 3D axis - // so, (0 0 1) is forward, (0 1 0) is up glm::vec3 viewXZ = glm::vec3(view.x, 0.0f, view.z); glm::vec3 viewZY = glm::vec3(0.0f, view.y, view.z); phi = glm::acos(glm::dot(glm::normalize(viewXZ), glm::vec3(0, 0, -1))); @@ -379,23 +361,22 @@ int main(int argc, char** argv) ogLookAt = cam.lookAt; zoom = glm::length(cam.position - ogLookAt); - // Initialize CUDA and GL components - init(); + if (!init()) return 1; - // Initialize ImGui Data InitImguiData(guiData); InitDataContainer(guiData); - // GLFW main loop - mainLoop(); + // Checkpoint + checkpointPath = std::string(renderState->imageName) + ".ckpt"; + tryLoadCheckpoint(); + mainLoop(); return 0; } void saveImage() { float samples = iteration; - // output image file Image img(width, height); for (int x = 0; x < width; x++) @@ -413,9 +394,7 @@ void saveImage() ss << filename << "." << startTimeString << "." << samples << "samp"; filename = ss.str(); - // CHECKITOUT img.savePNG(filename); - //img.saveHDR(filename); // Save a Radiance HDR file } void runCuda() @@ -423,71 +402,76 @@ void runCuda() if (camchanged) { iteration = 0; + // Clear image so the next pathtraceInit starts from a blank buffer + std::fill(renderState->image.begin(), renderState->image.end(), glm::vec3(0)); + Camera& cam = renderState->camera; cameraPosition.x = zoom * sin(phi) * sin(theta); cameraPosition.y = zoom * cos(theta); cameraPosition.z = zoom * cos(phi) * sin(theta); + // Set camera coordinate system cam.view = -glm::normalize(cameraPosition); glm::vec3 v = cam.view; - glm::vec3 u = glm::vec3(0, 1, 0);//glm::normalize(cam.up); + glm::vec3 u = glm::vec3(0, 1, 0); glm::vec3 r = glm::cross(v, u); cam.up = glm::cross(r, v); cam.right = r; + // Set camera position cam.position = cameraPosition; cameraPosition += cam.lookAt; cam.position = cameraPosition; camchanged = false; } - // Map OpenGL buffer object for writing from CUDA on a single GPU - // No data is moved (Win & Linux). 
When mapped to CUDA, OpenGL should not use this buffer - + // Map OpenGL buffer object for writing from CUDA if (iteration == 0) { pathtraceFree(); pathtraceInit(scene); } + // Only iterate if we haven't reached the max number of iterations if (iteration < renderState->iterations) { uchar4* pbo_dptr = NULL; iteration++; cudaGLMapBufferObject((void**)&pbo_dptr, pbo); - // execute the kernel int frame = 0; pathtrace(pbo_dptr, frame, iteration); - // unmap buffer object cudaGLUnmapBufferObject(pbo); } + + // Save image and clean up after the max number of iterations is reached else { saveImage(); + saveCheckpoint(); pathtraceFree(); cudaDeviceReset(); exit(EXIT_SUCCESS); } } -//------------------------------- -//------INTERACTIVITY SETUP------ -//------------------------------- - +// INPUT CALLBACKS void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods) { if (action == GLFW_PRESS) - { + { + // Don't do anything if the user is typing in an ImGui window switch (key) { case GLFW_KEY_ESCAPE: saveImage(); + saveCheckpoint(); glfwSetWindowShouldClose(window, GL_TRUE); break; case GLFW_KEY_S: saveImage(); + saveCheckpoint(); break; case GLFW_KEY_SPACE: camchanged = true; @@ -499,39 +483,39 @@ void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods } } +// Mouse button callback void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods) { - if (MouseOverImGuiWindow()) - { - return; - } + if (MouseOverImGuiWindow()) return; leftMousePressed = (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS); rightMousePressed = (button == GLFW_MOUSE_BUTTON_RIGHT && action == GLFW_PRESS); middleMousePressed = (button == GLFW_MOUSE_BUTTON_MIDDLE && action == GLFW_PRESS); } +// Mouse position callback void mousePositionCallback(GLFWwindow* window, double xpos, double ypos) { - if (xpos == lastX || ypos == lastY) - { - return; // otherwise, clicking back into window causes re-start - } + if (xpos == lastX || ypos == lastY) return; + // Don't do anything if the user is typing in an ImGui window if (leftMousePressed) { - // compute new camera parameters phi -= (xpos - lastX) / width; theta -= (ypos - lastY) / height; theta = std::fmax(0.001f, std::fmin(theta, PI)); camchanged = true; } + + // Zoom in or out else if (rightMousePressed) { zoom += (ypos - lastY) / height; zoom = std::fmax(0.1f, zoom); camchanged = true; } + + // Pan the camera else if (middleMousePressed) { renderState = &scene->state; diff --git a/src/pathtrace.cu b/src/pathtrace.cu index 709c231ba..f92db27ca 100644 --- a/src/pathtrace.cu +++ b/src/pathtrace.cu @@ -1,413 +1,468 @@ #include "pathtrace.h" - -#include -#include -#include -#include -#include -#include - #include "sceneStructs.h" #include "scene.h" -#include "glm/glm.hpp" -#include "glm/gtx/norm.hpp" -#include "utilities.h" #include "intersections.h" #include "interactions.h" +#include "utilities.h" -#define ERRORCHECK 1 +#include +#include +#include -#define FILENAME (strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__) -#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) -void checkCUDAErrorFn(const char* msg, const char* file, int line) +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +// Feature toggles & params (no struct changes) +#define ENABLE_DOF 1 // Depth of Field () +#define DOF_APERTURE 0.12f +#define DOF_FOCUS_DIST -1.f // < 0 -> auto: |lookAt-position| - + +#define ENABLE_CAMERA_MBLUR 1 // Camera position lerp +#define ENABLE_NEE 1 // Next-Event Estimation (direct lighting) +#define ENABLE_ACES_TONEMAP 1 // Post FX +#define ENABLE_SORT_BY_MATERIAL 1 // Group by BSDF to reduce divergence (wavefront-lite) + +// Globals +static int gW = 0, gH = 0; +static int gTraceDepth = 5; +static Geom* dev_geoms = nullptr; +static Material* dev_mats = nullptr; +static glm::vec3* dev_accum = nullptr; +static PathSegment* dev_paths = nullptr; +static ShadeableIntersection* dev_isects = nullptr; +static int* dev_matKeys = nullptr; // material sort keys + +static int hNumGeoms = 0, hNumMats = 0; + +// emissive object list (indices into geoms) +static int* dev_lightIdx = nullptr; +static int hNumLights = 0; + +static RenderState* hState = nullptr; +static glm::vec3* hImage = nullptr; +static GuiDataContainer* hGui = nullptr; + +// Utilities +__host__ __device__ inline unsigned int utilhash_local(unsigned int a) { -#if ERRORCHECK - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) - { - return; - } - - fprintf(stderr, "CUDA error"); - if (file) - { - fprintf(stderr, " (%s:%d)", file, line); - } - fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); -#ifdef _WIN32 - getchar(); -#endif // _WIN32 - exit(EXIT_FAILURE); -#endif // ERRORCHECK + // tiny integer hash; good enough for scrambling/QMC CP-rot + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) ^ (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a + 0xd3a2646c) ^ (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) ^ (a >> 16); + return a; } -__host__ __device__ -thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) -{ - int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index); - return thrust::default_random_engine(h); +__device__ inline thrust::default_random_engine makeRNG(int iter, int pixel, int depth) { + return thrust::default_random_engine(utilhash_local(iter * 9781u + pixel * 9176u + depth * 26699u)); } -//Kernel that writes the image to the OpenGL PBO directly. 
-__global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution, int iter, glm::vec3* image) -{ - int x = (blockIdx.x * blockDim.x) + threadIdx.x; - int y = (blockIdx.y * blockDim.y) + threadIdx.y; - - if (x < resolution.x && y < resolution.y) - { - int index = x + (y * resolution.x); - glm::vec3 pix = image[index]; - - glm::ivec3 color; - color.x = glm::clamp((int)(pix.x / iter * 255.0), 0, 255); - color.y = glm::clamp((int)(pix.y / iter * 255.0), 0, 255); - color.z = glm::clamp((int)(pix.z / iter * 255.0), 0, 255); - - // Each thread writes one pixel location in the texture (textel) - pbo[index].w = 0; - pbo[index].x = color.x; - pbo[index].y = color.y; - pbo[index].z = color.z; - } +// Radical inverse base-2 (Van der Corput) + Hammersley 2D +__device__ inline float radicalInverse_VdC(unsigned int bits) { + bits = __brev(bits); + return (float)bits * 2.3283064365386963e-10f; // / 2^32 } - -static Scene* hst_scene = NULL; -static GuiDataContainer* guiData = NULL; -static glm::vec3* dev_image = NULL; -static Geom* dev_geoms = NULL; -static Material* dev_materials = NULL; -static PathSegment* dev_paths = NULL; -static ShadeableIntersection* dev_intersections = NULL; -// TODO: static variables for device memory, any extra info you need, etc -// ... - -void InitDataContainer(GuiDataContainer* imGuiData) -{ - guiData = imGuiData; +__device__ inline glm::vec2 hammersley2D(unsigned int i, unsigned int N) { + return glm::vec2((float)i / (float)N, radicalInverse_VdC(i)); } -void pathtraceInit(Scene* scene) -{ - hst_scene = scene; - - const Camera& cam = hst_scene->state.camera; - const int pixelcount = cam.resolution.x * cam.resolution.y; - - cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3)); - cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3)); +__device__ inline float fractf(float x) { return x - floorf(x); } +__device__ inline glm::vec2 fract2(glm::vec2 v) { return glm::vec2(fractf(v.x), fractf(v.y)); } - cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment)); +// Per-pixel Cranley–Patterson rotation for LDS (scrambles but stays in [0,1)) +__device__ inline glm::vec2 cpRotation2D(int pixel) { + unsigned int h0 = utilhash_local((unsigned int)pixel * 0x9E3779B9u + 0x85ebca6bu); + unsigned int h1 = utilhash_local((unsigned int)pixel * 0x85ebca6bu + 0xc2b2ae35u); + return glm::vec2(radicalInverse_VdC(h0), radicalInverse_VdC(h1)); +} - cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom)); - cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice); +// Concentric disk (for thin lens) +__device__ inline void concentricSampleDisk(float u1, float u2, float& dx, float& dy) { + float sx = 2.f * u1 - 1.f; + float sy = 2.f * u2 - 1.f; + if (sx == 0 && sy == 0) { dx = dy = 0; return; } + float r, theta; + if (fabsf(sx) > fabsf(sy)) { r = sx; theta = (PI / 4.f) * (sy / sx); } + else { r = sy; theta = (PI / 2.f) - (PI / 4.f) * (sx / sy); } + dx = r * cosf(theta); + dy = r * sinf(theta); +} - cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material)); - cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice); +// ACES filmic (Narkowicz 2016 fit) +__device__ inline glm::vec3 ACESFilm(glm::vec3 x) { + const float a = 2.51f, b = 0.03f, c = 2.43f, d = 0.59f, e = 0.14f; + return glm::clamp((x * (a * x + b)) / (x * (c * x + d) + e), 0.f, 1.f); +} - cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection)); - cudaMemset(dev_intersections, 0, pixelcount * 
sizeof(ShadeableIntersection)); +// Device helpers +__device__ inline void samplePointOnUnitSphere(float u1, float u2, glm::vec3& p, glm::vec3& n) { + float z = 1.f - 2.f * u1; + float r = sqrtf(fmaxf(0.f, 1.f - z * z)); + float phi = TWO_PI * u2; + p = 0.5f * glm::vec3(r * cosf(phi), r * sinf(phi), z); + n = glm::normalize(p * 2.f); +} - // TODO: initialize any extra device memeory you need +__device__ inline void samplePointOnUnitCube(float u1, float u2, glm::vec3& p, glm::vec3& n) { + int face = (int)floorf(u1 * 6.f); + float su = 2.f * (u2)-1.f; + + // simple second dim variation (not area-correct; OK for NEE point picking) + float sv = 2.f * (radicalInverse_VdC((unsigned)(u1 * 65535u) + 7u)) - 1.f; + switch (face) { + case 0: p = glm::vec3(0.5f, su * 0.5f, sv * 0.5f); n = glm::vec3(1, 0, 0); break; + case 1: p = glm::vec3(-0.5f, su * 0.5f, sv * 0.5f); n = glm::vec3(-1, 0, 0); break; + case 2: p = glm::vec3(su * 0.5f, 0.5f, sv * 0.5f); n = glm::vec3(0, 1, 0); break; + case 3: p = glm::vec3(su * 0.5f, -0.5f, sv * 0.5f); n = glm::vec3(0, -1, 0); break; + case 4: p = glm::vec3(su * 0.5f, sv * 0.5f, 0.5f); n = glm::vec3(0, 0, 1); break; + default:p = glm::vec3(su * 0.5f, sv * 0.5f, -0.5f); n = glm::vec3(0, 0, -1); break; + } +} - checkCUDAError("pathtraceInit"); +__device__ inline glm::vec3 multiplyMV_dev(const glm::mat4& m, const glm::vec4& v) { + return glm::vec3(m * v); +} +__device__ inline glm::vec3 xformPoint(const Geom& g, const glm::vec3& p) { + return multiplyMV_dev(g.transform, glm::vec4(p, 1.f)); +} +__device__ inline glm::vec3 xformNormal(const Geom& g, const glm::vec3& n) { + return glm::normalize(multiplyMV_dev(g.invTranspose, glm::vec4(n, 0.f))); } -void pathtraceFree() -{ - cudaFree(dev_image); // no-op if dev_image is null - cudaFree(dev_paths); - cudaFree(dev_geoms); - cudaFree(dev_materials); - cudaFree(dev_intersections); - // TODO: clean up any extra device memory you created - - checkCUDAError("pathtraceFree"); +__device__ inline bool occluded(const Ray& r, float tMax, const Geom* geoms, int nGeoms) { + glm::vec3 tmpP, tmpN; bool tmpOut = true; + for (int i = 0; i < nGeoms; ++i) { + float t = (geoms[i].type == SPHERE) + ? sphereIntersectionTest(geoms[i], r, tmpP, tmpN, tmpOut) + : boxIntersectionTest(geoms[i], r, tmpP, tmpN, tmpOut); + if (t > 0.f && t < tMax) return true; + } + return false; } -/** -* Generate PathSegments with rays from the camera through the screen into the -* scene, which is the first bounce of rays. -* -* Antialiasing - add rays for sub-pixel sampling -* motion blur - jitter rays "in time" -* lens effect - jitter ray origin positions based on a lens -*/ -__global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments) +// Kernels +__global__ void kernGenerateCameraRays( + int w, int h, Camera cam, int iter, PathSegment* paths) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; + if (x >= w || y >= h) return; + int idx = x + y * w; + + // Low-discrepancy jitter + CP rotation per pixel (better convergence). 
+ glm::vec2 jitter = fract2(hammersley2D((unsigned)(iter - 1), 1024u) + cpRotation2D(idx)); + float u = (x + jitter.x) * cam.pixelLength.x - 1.f; + float v = (y + jitter.y) * cam.pixelLength.y - 1.f; + + // Camera shutter blur between two positions + glm::vec3 camPos = cam.position; + +#if ENABLE_CAMERA_MBLUR + thrust::default_random_engine rng = makeRNG(iter, idx, 0); + thrust::uniform_real_distribution<float> u01(0.f, 1.f); + float t = u01(rng); + glm::vec3 p0 = cam.position + glm::vec3(-0.02f, 0.0f, 0.00f); + glm::vec3 p1 = cam.position + glm::vec3(0.02f, 0.0f, 0.00f); + camPos = (1.f - t) * p0 + t * p1; +#endif + + glm::vec3 dir = glm::normalize(u * cam.right + v * cam.up + cam.view); + glm::vec3 rayOrig = camPos; + +#if ENABLE_DOF + if (DOF_APERTURE > 0.f) { + float focusDist = (DOF_FOCUS_DIST > 0.f) + ? DOF_FOCUS_DIST + : glm::length(cam.lookAt - cam.position); + float lensRadius = 0.5f * DOF_APERTURE; + glm::vec2 lxi = fract2(hammersley2D((unsigned)(iter + 131), 1024u) + cpRotation2D(idx)); + float dx, dy; + concentricSampleDisk(lxi.x, lxi.y, dx, dy); + glm::vec3 pLens = camPos + dx * lensRadius * cam.right + dy * lensRadius * cam.up; + glm::vec3 pFocus = camPos + dir * focusDist; + dir = glm::normalize(pFocus - pLens); + rayOrig = pLens; + } +#endif + + PathSegment ps; + ps.ray.origin = rayOrig; + ps.ray.direction = dir; + ps.color = glm::vec3(1.f); + ps.pixelIndex = idx; + ps.remainingBounces = INT_MAX; // will clamp next + paths[idx] = ps; +} - if (x < cam.resolution.x && y < cam.resolution.y) { - int index = x + (y * cam.resolution.x); - PathSegment& segment = pathSegments[index]; +__global__ void kernClampDepth(PathSegment* paths, int N, int depth) { + int i = (blockIdx.x * blockDim.x) + threadIdx.x; + if (i >= N) return; + paths[i].remainingBounces = depth; +} - segment.ray.origin = cam.position; - segment.color = glm::vec3(1.0f, 1.0f, 1.0f); +__global__ void kernComputeIntersections( + int N, const PathSegment* paths, const Geom* geoms, int nGeoms, + ShadeableIntersection* isects) { + int i = (blockIdx.x * blockDim.x) + threadIdx.x; + if (i >= N) return; + + const Ray r = paths[i].ray; + float tMin = 1e20f; + int hitMat = -1; + glm::vec3 hitN(0), hitP(0); + bool outside = true; + + for (int g = 0; g < nGeoms; ++g) { + glm::vec3 p, n; bool out = true; + float t = (geoms[g].type == SPHERE) + ? sphereIntersectionTest(geoms[g], r, p, n, out) + : boxIntersectionTest(geoms[g], r, p, n, out); + if (t > 0.f && t < tMin) { + tMin = t; hitN = n; hitP = p; hitMat = geoms[g].materialid; outside = out; + } + } - // TODO: implement antialiasing by jittering the ray - segment.ray.direction = glm::normalize(cam.view - - cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f) - - cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f) - ); + ShadeableIntersection si; + si.t = (hitMat >= 0) ? tMin : -1.f; + si.surfaceNormal = hitN; + si.materialId = hitMat; + isects[i] = si; +} - segment.pixelIndex = index; - segment.remainingBounces = traceDepth; - } +__global__ void kernMakeMaterialKeys(int N, const ShadeableIntersection* isects, int* keys) { + int i = (blockIdx.x * blockDim.x) + threadIdx.x; + if (i >= N) return; + int id = isects[i].materialId; + // Misses (id < 0) go to the end to keep real hits tightly grouped + keys[i] = (id >= 0) ? id : INT_MAX; +} -// TODO: -// computeIntersections handles generating ray intersections ONLY. -// Generating new rays is handled in your shader(s). -// Feel free to modify the code below.
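The jitter above swaps plain uniform AA jitter for a Hammersley low-discrepancy sequence, decorrelated across pixels by a Cranley–Patterson rotation (add a hashed per-pixel offset, keep only the fractional part). A minimal host-side sketch of the same construction, mirroring `radicalInverse_VdC` and `cpRotation2D`; the rotation constants and the standalone `main` are illustrative stand-ins, not renderer code:

```
// CPU stand-in for the device-side sampler: Hammersley points plus a
// Cranley-Patterson rotation. __brev is a CUDA intrinsic, so the bit
// reversal is done with shift/mask swaps here.
#include <cstdio>
#include <cstdint>
#include <cmath>

static float radicalInverseVdC(uint32_t bits) {
    bits = (bits << 16) | (bits >> 16);
    bits = ((bits & 0x55555555u) << 1) | ((bits & 0xAAAAAAAAu) >> 1);
    bits = ((bits & 0x33333333u) << 2) | ((bits & 0xCCCCCCCCu) >> 2);
    bits = ((bits & 0x0F0F0F0Fu) << 4) | ((bits & 0xF0F0F0F0u) >> 4);
    bits = ((bits & 0x00FF00FFu) << 8) | ((bits & 0xFF00FF00u) >> 8);
    return (float)bits * 2.3283064365386963e-10f; // bit-reversed index / 2^32
}

int main() {
    // Arbitrary per-pixel rotation, standing in for cpRotation2D(idx).
    const float rotX = 0.37f, rotY = 0.81f;
    for (uint32_t i = 0; i < 8; ++i) {
        float u = (float)i / 8.0f + rotX;       // Hammersley x: i/N
        float v = radicalInverseVdC(i) + rotY;  // Hammersley y: radical inverse
        u -= floorf(u); v -= floorf(v);         // fract() keeps samples in [0,1)
        printf("sample %u: (%.3f, %.3f)\n", (unsigned)i, u, v);
    }
    return 0;
}
```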
-__global__ void computeIntersections( - int depth, - int num_paths, - PathSegment* pathSegments, - Geom* geoms, - int geoms_size, - ShadeableIntersection* intersections) +__global__ void kernShadeScatterAndNEE( + int iter, int depth, int N, PathSegment* paths, const ShadeableIntersection* isects, + const Material* mats, const Geom* geoms, int nGeoms, + const int* lightIdx, int nLights, glm::vec3* accum) { - int path_index = blockIdx.x * blockDim.x + threadIdx.x; + int i = (blockIdx.x * blockDim.x) + threadIdx.x; + if (i >= N) return; - if (path_index < num_paths) - { - PathSegment pathSegment = pathSegments[path_index]; + PathSegment& ps = paths[i]; + if (ps.remainingBounces <= 0) return; - float t; - glm::vec3 intersect_point; - glm::vec3 normal; - float t_min = FLT_MAX; - int hit_geom_index = -1; - bool outside = true; + const ShadeableIntersection& si = isects[i]; + int pixel = ps.pixelIndex; - glm::vec3 tmp_intersect; - glm::vec3 tmp_normal; + thrust::default_random_engine rng = makeRNG(iter, pixel, depth); - // naive parse through global geoms + // Miss => background + if (si.t < 0.f) { ps.remainingBounces = 0; return; } - for (int i = 0; i < geoms_size; i++) - { - Geom& geom = geoms[i]; + const Material m = mats[si.materialId]; + glm::vec3 x = ps.ray.origin + glm::normalize(ps.ray.direction) * si.t; + glm::vec3 n = glm::normalize(si.surfaceNormal); - if (geom.type == CUBE) - { - t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside); - } - else if (geom.type == SPHERE) - { - t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside); - } - // TODO: add more intersection tests here... triangle? metaball? CSG? - - // Compute the minimum t from the intersection tests to determine what - // scene geometry object was hit first. - if (t > 0.0f && t_min > t) - { - t_min = t; - hit_geom_index = i; - intersect_point = tmp_intersect; - normal = tmp_normal; - } - } + // Hit an emissive: add light and terminate + if (m.emittance > 0.f) { + accum[pixel] += ps.color * (m.color * m.emittance); + ps.remainingBounces = 0; + return; + } - if (hit_geom_index == -1) - { - intersections[path_index].t = -1.0f; - } - else - { - // The ray hits something - intersections[path_index].t = t_min; - intersections[path_index].materialId = geoms[hit_geom_index].materialid; - intersections[path_index].surfaceNormal = normal; - } + // Russian roulette after a few bounces + if (depth >= 3) { + thrust::uniform_real_distribution<float> u01(0.f, 1.f); + // survival prob based on brightness of the path throughput + float p = fminf(0.95f, fmaxf(0.05f, fmax(fmax(ps.color.r, ps.color.g), ps.color.b))); + if (u01(rng) > p) { ps.remainingBounces = 0; return; } + ps.color /= p; // keep estimator unbiased } -} -// LOOK: "fake" shader demonstrating what you might do with the info in -// a ShadeableIntersection, as well as how to use thrust's random number -// generator. Observe that since the thrust random number generator basically -// adds "noise" to the iteration, the image should start off noisy and get -// cleaner as more iterations are computed. -// -// Note that this shader does NOT do a BSDF evaluation! -// Your shaders should handle that - this can allow techniques such as -// bump mapping.
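The `ps.color /= p` reweighting is what keeps Russian roulette unbiased: a path that survives with probability p is scaled by 1/p, so its expected contribution is p * (x/p) = x, exactly the untruncated value. A tiny standalone check of that identity (arbitrary constants, plain C++ rather than renderer code):

```
// Numerical check that the Russian-roulette estimator is unbiased:
// terminate with probability 1-p, divide survivors by p.
#include <cstdio>
#include <random>

int main() {
    std::mt19937 rng(42);
    std::uniform_real_distribution<float> u01(0.f, 1.f);
    const float value = 0.7f;   // stands in for the path contribution
    const float p = 0.6f;       // survival probability
    double sum = 0.0;
    const int N = 1000000;
    for (int i = 0; i < N; ++i) {
        if (u01(rng) <= p) sum += value / p;  // survivor, reweighted by 1/p
        // else: path killed, contributes 0
    }
    printf("mean = %f (expected %f)\n", sum / N, value);
    return 0;
}
```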
-__global__ void shadeFakeMaterial( - int iter, - int num_paths, - ShadeableIntersection* shadeableIntersections, - PathSegment* pathSegments, - Material* materials) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_paths) - { - ShadeableIntersection intersection = shadeableIntersections[idx]; - if (intersection.t > 0.0f) // if the intersection exists... - { - // Set up the RNG - // LOOK: this is how you use thrust's RNG! Please look at - // makeSeededRandomEngine as well. - thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0); - thrust::uniform_real_distribution u01(0, 1); - - Material material = materials[intersection.materialId]; - glm::vec3 materialColor = material.color; - - // If the material indicates that the object was a light, "light" the ray - if (material.emittance > 0.0f) { - pathSegments[idx].color *= (materialColor * material.emittance); +#if ENABLE_NEE + if (nLights > 0) { + // Use hashed Van der Corput for (u1,u2) instead of RNG to reduce noise + float u1 = radicalInverse_VdC(utilhash_local(pixel * 1973u + depth * 9277u + iter * 26699u)); + float u2 = radicalInverse_VdC(utilhash_local(pixel * 1259u + depth * 7411u + iter * 31847u + 1u)); + + int Lpick = (int)floorf(u1 * (float)nLights); + Lpick = min(Lpick, nLights - 1); + const Geom L = geoms[lightIdx[Lpick]]; + const Material mL = mats[L.materialid]; + + glm::vec3 pL, nL; + if (L.type == SPHERE) samplePointOnUnitSphere(u1, u2, pL, nL); + else samplePointOnUnitCube(u1, u2, pL, nL); + pL = xformPoint(L, pL); + nL = xformNormal(L, nL); + + glm::vec3 wi = glm::normalize(pL - x); + float dist = glm::length(pL - x); + float cosNo = fmaxf(0.f, glm::dot(n, wi)); + float cosLi = fmaxf(0.f, glm::dot(nL, -wi)); + if (cosNo > 0.f && cosLi > 0.f) { + Ray shadow; shadow.origin = x + n * 1e-4f; shadow.direction = wi; + bool blocked = occluded(shadow, dist * 0.999f, geoms, nGeoms); + if (!blocked) { + glm::vec3 f = m.color * (1.f / PI); // Lambertian BRDF + glm::vec3 Le = mL.color * mL.emittance; + glm::vec3 contrib = Le * f * cosNo * (float)nLights; // balance heuristic (1/pSel) + accum[pixel] += ps.color * contrib; } - // Otherwise, do some pseudo-lighting computation. This is actually more - // like what you would expect from shading in a rasterizer like OpenGL. - // TODO: replace this! you should be able to start with basically a one-liner - else { - float lightTerm = glm::dot(intersection.surfaceNormal, glm::vec3(0.0f, 1.0f, 0.0f)); - pathSegments[idx].color *= (materialColor * lightTerm) * 0.3f + ((1.0f - intersection.t * 0.02f) * materialColor) * 0.7f; - pathSegments[idx].color *= u01(rng); // apply some noise because why not - } - // If there was no intersection, color the ray black. - // Lots of renderers use 4 channel color, RGBA, where A = alpha, often - // used for opacity, in which case they can indicate "no opacity". - // This can be useful for post-processing and image compositing. 
- } - else { - pathSegments[idx].color = glm::vec3(0.0f); - } - } +#endif + + // Scatter next direction (Lambert/mirror/dielectric) + scatterRay(ps, x, n, m, rng); } -// Add the current iteration's output to the overall image -__global__ void finalGather(int nPaths, glm::vec3* image, PathSegment* iterationPaths) +struct IsTerminated { + __host__ __device__ bool operator()(const PathSegment& p) const { return p.remainingBounces <= 0; } +}; + +__global__ void kernToPBO(uchar4* pbo, int w, int h, const glm::vec3* accum, int iter) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + if (x >= w || y >= h) return; + int idx = x + y * w; + + glm::vec3 c = accum[idx] / fmaxf(1.f, (float)iter); +#if ENABLE_ACES_TONEMAP + c = ACESFilm(c); // ACES filmic fit +#endif + c = glm::pow(glm::clamp(c, glm::vec3(0), glm::vec3(1)), glm::vec3(1.f / 2.2f)); + pbo[idx] = make_uchar4( + (unsigned char)(255.f * c.r), + (unsigned char)(255.f * c.g), + (unsigned char)(255.f * c.b), 255); +} - if (index < nPaths) - { - PathSegment iterationPath = iterationPaths[index]; - image[iterationPath.pixelIndex] += iterationPath.color; +// Host API +void InitDataContainer(GuiDataContainer* guiData) { hGui = guiData; } + +void pathtraceInit(Scene* scene) +{ + hState = &scene->state; + hImage = scene->state.image.data(); + + const Camera& cam = hState->camera; + gW = cam.resolution.x; + gH = cam.resolution.y; + gTraceDepth = hState->traceDepth; + + hNumGeoms = (int)scene->geoms.size(); + hNumMats = (int)scene->materials.size(); + + const int N = gW * gH; + + cudaMalloc(&dev_geoms, sizeof(Geom) * hNumGeoms); + cudaMalloc(&dev_mats, sizeof(Material) * hNumMats); + cudaMalloc(&dev_accum, sizeof(glm::vec3) * N); + cudaMalloc(&dev_paths, sizeof(PathSegment) * N); + cudaMalloc(&dev_isects, sizeof(ShadeableIntersection) * N); + cudaMalloc(&dev_matKeys, sizeof(int) * N); + + cudaMemcpy(dev_geoms, scene->geoms.data(), sizeof(Geom) * hNumGeoms, cudaMemcpyHostToDevice); + cudaMemcpy(dev_mats, scene->materials.data(), sizeof(Material) * hNumMats, cudaMemcpyHostToDevice); + + // Collect emissive geometry indices (host side) + std::vector<int> lightIdx; + for (int i = 0; i < (int)scene->geoms.size(); ++i) { + const Material& m = scene->materials[scene->geoms[i].materialid]; + if (m.emittance > 0.f) lightIdx.push_back(i); + } + hNumLights = (int)lightIdx.size(); + if (hNumLights > 0) { + cudaMalloc(&dev_lightIdx, sizeof(int) * hNumLights); + cudaMemcpy(dev_lightIdx, lightIdx.data(), sizeof(int) * hNumLights, cudaMemcpyHostToDevice); + } + else { + dev_lightIdx = nullptr; + } + + // Start from whatever image host already has (for checkpoint resume) + cudaMemcpy(dev_accum, hImage, sizeof(glm::vec3) * N, cudaMemcpyHostToDevice); +} + +void pathtraceFree() +{ + if (dev_geoms) cudaFree(dev_geoms), dev_geoms = nullptr; + if (dev_mats) cudaFree(dev_mats), dev_mats = nullptr; + if (dev_accum) cudaFree(dev_accum), dev_accum = nullptr; + if (dev_paths) cudaFree(dev_paths), dev_paths = nullptr; + if (dev_isects) cudaFree(dev_isects), dev_isects = nullptr; + if (dev_matKeys) cudaFree(dev_matKeys), dev_matKeys = nullptr; + if (dev_lightIdx) cudaFree(dev_lightIdx), dev_lightIdx = nullptr; + + hState = nullptr; + hImage = nullptr; } -/** - * Wrapper for the __global__ call that sets up the kernel calls and does a ton - * of memory management - */ -void pathtrace(uchar4* pbo, int frame, int iter) +void pathtrace(uchar4* pbo, int /*frame*/, int iteration) { - const
int traceDepth = hst_scene->state.traceDepth; - const Camera& cam = hst_scene->state.camera; - const int pixelcount = cam.resolution.x * cam.resolution.y; - - // 2D block for generating ray from camera - const dim3 blockSize2d(8, 8); - const dim3 blocksPerGrid2d( - (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x, - (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); - - // 1D block for path tracing - const int blockSize1d = 128; - - /////////////////////////////////////////////////////////////////////////// - - // Recap: - // * Initialize array of path rays (using rays that come out of the camera) - // * You can pass the Camera object to that kernel. - // * Each path ray must carry at minimum a (ray, color) pair, - // * where color starts as the multiplicative identity, white = (1, 1, 1). - // * This has already been done for you. - // * For each depth: - // * Compute an intersection in the scene for each path ray. - // A very naive version of this has been implemented for you, but feel - // free to add more primitives and/or a better algorithm. - // Currently, intersection distance is recorded as a parametric distance, - // t, or a "distance along the ray." t = -1.0 indicates no intersection. - // * Color is attenuated (multiplied) by reflections off of any object - // * TODO: Stream compact away all of the terminated paths. - // You may use either your implementation or `thrust::remove_if` or its - // cousins. - // * Note that you can't really use a 2D kernel launch any more - switch - // to 1D. - // * TODO: Shade the rays that intersected something or didn't bottom out. - // That is, color the ray by performing a color computation according - // to the shader, then generate a new ray to continue the ray path. - // We recommend just updating the ray's PathSegment in place. - // Note that this step may come before or after stream compaction, - // since some shaders you write may also cause a path to terminate. - // * Finally, add this iteration's results to the image. This has been done - // for you. 
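The recap above leaves path termination to a stream-compaction TODO; the rewritten loop further down fills it in with `thrust::remove_if`, which moves the surviving elements to the front of the range and returns the new logical end. A self-contained sketch of just that primitive, with a hypothetical `IsNegative` predicate standing in for the renderer's `IsTerminated`:

```
// Standalone .cu sketch of stream compaction via thrust::remove_if.
#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/remove.h>

struct IsNegative {
    __host__ __device__ bool operator()(int x) const { return x < 0; }
};

int main() {
    int h[8] = { 3, -1, 4, -1, 5, -9, 2, 6 };
    thrust::device_vector<int> d(h, h + 8);
    // Partition away "dead" elements; newEnd marks the compacted range.
    auto newEnd = thrust::remove_if(d.begin(), d.end(), IsNegative());
    int n = (int)(newEnd - d.begin()); // live count, like nPaths in the loop
    for (int i = 0; i < n; ++i) printf("%d ", (int)d[i]);
    printf("\n(%d kept)\n", n);
    return 0;
}
```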
- - // TODO: perform one iteration of path tracing - - generateRayFromCamera<<<blocksPerGrid2d, blockSize2d>>>(cam, iter, traceDepth, dev_paths); - checkCUDAError("generate camera ray"); + const Camera cam = hState->camera; + const int W = gW, H = gH, N = W * H; + dim3 block2D(8, 8); + dim3 grid2D((W + block2D.x - 1) / block2D.x, + (H + block2D.y - 1) / block2D.y); + dim3 block1D(256); + dim3 grid1D((N + block1D.x - 1) / block1D.x); + + // Generate primary rays (AA, DoF, camera MB) + kernGenerateCameraRays<<<grid2D, block2D>>>(W, H, cam, iteration, dev_paths); + kernClampDepth<<<grid1D, block1D>>>(dev_paths, N, gTraceDepth); + + // Trace wave (simple megakernel loop) + int nPaths = N; int depth = 0; - PathSegment* dev_path_end = dev_paths + pixelcount; - int num_paths = dev_path_end - dev_paths; - - // --- PathSegment Tracing Stage --- - // Shoot ray into scene, bounce between objects, push shading chunks - - bool iterationComplete = false; - while (!iterationComplete) - { - // clean shading chunks - cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); - - // tracing - dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d; - computeIntersections<<<numblocksPathSegmentTracing, blockSize1d>>> ( - depth, - num_paths, - dev_paths, - dev_geoms, - hst_scene->geoms.size(), - dev_intersections - ); - checkCUDAError("trace one bounce"); - cudaDeviceSynchronize(); - depth++; + while (nPaths > 0 && depth < gTraceDepth) { + int blocks = (nPaths + block1D.x - 1) / block1D.x; - // TODO: - // --- Shading Stage --- - // Shade path segments based on intersections and generate new rays by - // evaluating the BSDF. - // Start off with just a big kernel that handles all the different - // materials you have in the scenefile. - // TODO: compare between directly shading the path segments and shading - // path segments that have been reshuffled to be contiguous in memory. - - shadeFakeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>>( - iter, - num_paths, - dev_intersections, - dev_paths, - dev_materials - ); - iterationComplete = true; // TODO: should be based off stream compaction results.
- - if (guiData != NULL) - { - guiData->TracedDepth = depth; - } - } + kernComputeIntersections<<<blocks, block1D>>>(nPaths, dev_paths, dev_geoms, hNumGeoms, dev_isects); - // Assemble this iteration and apply it to the image - dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d; - finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths); +#if ENABLE_SORT_BY_MATERIAL + // Sort by material to reduce divergence in shading + kernMakeMaterialKeys<<<blocks, block1D>>>(nPaths, dev_isects, dev_matKeys); - /////////////////////////////////////////////////////////////////////////// + thrust::device_ptr<int> kBeg(dev_matKeys); + thrust::device_ptr<int> kEnd = kBeg + nPaths; + auto zBeg = thrust::make_zip_iterator( + thrust::make_tuple(thrust::device_pointer_cast(dev_isects), + thrust::device_pointer_cast(dev_paths))); + auto zEnd = zBeg + nPaths; - // Send results to OpenGL buffer for rendering - sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image); + thrust::sort_by_key(kBeg, kEnd, zBeg); +#endif + + kernShadeScatterAndNEE<<<blocks, block1D>>>( + iteration, depth, nPaths, dev_paths, dev_isects, + dev_mats, dev_geoms, hNumGeoms, + dev_lightIdx, hNumLights, dev_accum); + + // Stream compact dead paths (Thrust) + thrust::device_ptr<PathSegment> pBeg(dev_paths); + thrust::device_ptr<PathSegment> pEnd = pBeg + nPaths; + pEnd = thrust::remove_if(pBeg, pEnd, IsTerminated()); + nPaths = (int)(pEnd - pBeg); + + depth++; + if (hGui) hGui->TracedDepth = depth; + } - // Retrieve image from GPU - cudaMemcpy(hst_scene->state.image.data(), dev_image, - pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost); + // Display + kernToPBO<<<grid2D, block2D>>>(pbo, W, H, dev_accum, iteration); - checkCUDAError("pathtrace"); + // Keep accumulator updated on host (for saves/checkpoints) + cudaMemcpy(hImage, dev_accum, sizeof(glm::vec3) * N, cudaMemcpyDeviceToHost); } diff --git a/src/scene.cpp b/src/scene.cpp index 07ef304f8..2ad308fff 100644 --- a/src/scene.cpp +++ b/src/scene.cpp @@ -14,6 +14,8 @@ using namespace std; using json = nlohmann::json; + +// scene loading Scene::Scene(string filename) { cout << "Reading scene from " << filename << " ..."
<< endl; @@ -31,25 +33,33 @@ Scene::Scene(string filename) } } +// Load scene description from a JSON file void Scene::loadFromJSON(const std::string& jsonName) { std::ifstream f(jsonName); json data = json::parse(f); const auto& materialsData = data["Materials"]; std::unordered_map<std::string, uint32_t> MatNameToID; + + // load materials for (const auto& item : materialsData.items()) { const auto& name = item.key(); const auto& p = item.value(); Material newMaterial{}; - // TODO: handle materials loading differently - if (p["TYPE"] == "Diffuse") - { + const std::string type = p["TYPE"]; + + const auto& col = p["RGB"]; + newMaterial.color = glm::vec3(col[0], col[1], col[2]); + + // diffuse materials + if (p["TYPE"] == "Diffuse") { const auto& col = p["RGB"]; newMaterial.color = glm::vec3(col[0], col[1], col[2]); } - else if (p["TYPE"] == "Emitting") - { + + // emissive materials + else if (p["TYPE"] == "Emitting") { const auto& col = p["RGB"]; newMaterial.color = glm::vec3(col[0], col[1], col[2]); newMaterial.emittance = p["EMITTANCE"]; @@ -58,8 +68,20 @@ void Scene::loadFromJSON(const std::string& jsonName) { const auto& col = p["RGB"]; newMaterial.color = glm::vec3(col[0], col[1], col[2]); + newMaterial.hasReflective = 1.f; + if (p.contains("IOR")) { + newMaterial.indexOfRefraction = (float)p["IOR"]; + newMaterial.hasRefractive = 1.f; + } } - MatNameToID[name] = materials.size(); + else if (p["TYPE"] == "Refractive") { + const auto& col = p["RGB"]; + newMaterial.color = glm::vec3(col[0], col[1], col[2]); + newMaterial.hasRefractive = 1.0f; + newMaterial.indexOfRefraction = p.contains("IOR") ? (float)p["IOR"] : 1.5f; + } + + MatNameToID[name] = (uint32_t)materials.size(); materials.emplace_back(newMaterial); } const auto& objectsData = data["Objects"]; @@ -67,14 +89,9 @@ void Scene::loadFromJSON(const std::string& jsonName) { const auto& type = p["TYPE"]; Geom newGeom; - if (type == "cube") - { - newGeom.type = CUBE; - } - else - { - newGeom.type = SPHERE; - } + if (type == "cube") newGeom.type = CUBE; + else newGeom.type = SPHERE; + newGeom.materialid = MatNameToID[p["MATERIAL"]]; const auto& trans = p["TRANS"]; const auto& rotat = p["ROTAT"]; @@ -121,4 +138,4 @@ void Scene::loadFromJSON(const std::string& jsonName) int arraylen = camera.resolution.x * camera.resolution.y; state.image.resize(arraylen); std::fill(state.image.begin(), state.image.end(), glm::vec3()); -} +} \ No newline at end of file diff --git a/src/utilities.cpp b/src/utilities.cpp index 140814790..03c1192d6 100644 --- a/src/utilities.cpp +++ b/src/utilities.cpp @@ -105,12 +105,6 @@ std::istream& utilityCore::safeGetline(std::istream& is, std::string& t) { t.clear(); - // The characters in the stream are read one-by-one using a std::streambuf. - // That is faster than reading them one-by-one using the std::istream. - // Code that uses streambuf this way must be guarded by a sentry object. - // The sentry object performs various tasks, - // such as thread synchronization and updating the stream state.
- std::istream::sentry se(is, true); std::streambuf* sb = is.rdbuf(); @@ -128,7 +122,6 @@ std::istream& utilityCore::safeGetline(std::istream& is, std::string& t) } return is; case EOF: - // Also handle the case when the last line has no line ending if (t.empty()) { is.setstate(std::ios::eofbit); diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 0f9147d0c..9ff58cf03 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -1,7 +1,17 @@ set(headers - ) + "common.h" + "cpu.h" + "naive.h" + "efficient.h" + "thrust.h" + ) set(sources + "common.cu" + "cpu.cu" + "naive.cu" + "efficient.cu" + "thrust.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu new file mode 100644 index 000000000..0ce27a17d --- /dev/null +++ b/stream_compaction/common.cu @@ -0,0 +1,52 @@ +#include "common.h" + +void checkCUDAErrorFn(const char *msg, const char *file, int line) { + cudaError_t err = cudaGetLastError(); + if (cudaSuccess == err) { + return; + } + + fprintf(stderr, "CUDA error"); + if (file) { + fprintf(stderr, " (%s:%d)", file, line); + } + fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); +} + + +namespace StreamCompaction { + namespace Common { + + /** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ + __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { + // TODO + + int idx = threadIdx.x + (blockIdx.x * blockDim.x); // map from threadIdx/blockIdx to element idx + if (idx >= n) return; // if idx is out of bounds, return + bools[idx] = (idata[idx] != 0) ? 1 : 0; // map to 1 if idata[idx] is non-zero, else map to 0 + + } + + /** + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. + */ + __global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices) { + // TODO + + int idx = threadIdx.x + (blockIdx.x * blockDim.x); // map from threadIdx/blockIdx to element idx + if (idx >= n) return; // if idx is out of bounds, return + + // if bools[idx] is 1, copy idata[idx] to odata[indices[idx]] + if (bools[idx] == 1) { + odata[indices[idx]] = idata[idx]; // scatter + } + } + + } +} diff --git a/stream_compaction/common.h b/stream_compaction/common.h new file mode 100644 index 000000000..d2c1fed99 --- /dev/null +++ b/stream_compaction/common.h @@ -0,0 +1,132 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) +#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) + +/** + * Check for CUDA errors; print and exit if there was a problem. + */ +void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); + +inline int ilog2(int x) { + int lg = 0; + while (x >>= 1) { + ++lg; + } + return lg; +} + +inline int ilog2ceil(int x) { + return x == 1 ? 
0 : ilog2(x - 1) + 1; +} + +namespace StreamCompaction { + namespace Common { + __global__ void kernMapToBoolean(int n, int *bools, const int *idata); + + __global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices); + + /** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from WindyDarian(https://github.com/WindyDarian) + */ + class PerformanceTimer + { + public: + PerformanceTimer() + { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() + { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() + { + if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); + } + + void endCpuTimer() + { + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } + + std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = + static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count()); + + cpu_timer_started = false; + } + + void startGpuTimer() + { + if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } + gpu_timer_started = true; + + cudaEventRecord(event_start); + } + + void endGpuTimer() + { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + + if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } + + cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); + gpu_timer_started = false; + } + + float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 + { + return prev_elapsed_time_cpu_milliseconds; + } + + float getGpuElapsedTimeForPreviousOperation() //noexcept + { + return prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + + private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; + float prev_elapsed_time_gpu_milliseconds = 0.f; + }; + } +} diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu new file mode 100644 index 000000000..67caeed5e --- /dev/null +++ b/stream_compaction/cpu.cu @@ -0,0 +1,120 @@ +#include +#include "cpu.h" + +#include "common.h" + +namespace StreamCompaction { + namespace CPU { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + /** + * CPU scan (prefix sum). + * For performance analysis, this is supposed to be a simple for loop. + * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
+ */ + void scan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + // TODO + + // exclusive scan + if (n <= 0) { + + // Empty input, nothing to do + timer().endCpuTimer(); + return; + } + + odata[0] = 0; // first element is always 0 for exclusive scan + + // compute the rest of the elements + for (int i = 1; i < n; ++i) { + odata[i] = odata[i - 1] + idata[i - 1]; // exclusive scan + } + + timer().endCpuTimer(); + } + + /** + * CPU stream compaction without using the scan function. + * + * @returns the number of elements remaining after compaction. + */ + int compactWithoutScan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + // TODO + + int count = 0; // number of non-zero elements + + // iterate through input array + for (int i = 0; i < n; ++i) { + + if (idata[i] != 0) { // keep only non-zero elements + odata[count] = idata[i]; // write to output array + count++; // increment count + } + } + + timer().endCpuTimer(); + + return count; // return number of non-zero elements + //return -1; + } + + + + /** + * CPU stream compaction using scan and scatter, like the parallel version. + * + * @returns the number of elements remaining after compaction. + */ + int compactWithScan(int n, int *odata, const int *idata) { + timer().startCpuTimer(); + // TODO + + // Handle edge case of empty input + if (n <= 0) { + timer().endCpuTimer(); + return 0; + } + + // Map to boolean array (1 for non-zero, 0 for zero) + int* bools = new int[n]; + for (int i = 0; i < n; ++i) { + bools[i] = (idata[i] != 0) ? 1 : 0; + } + + // Exclusive prefix sum (scan) on the boolean array + int* indices = new int[n]; // to hold the scanned indices + indices[0] = 0; // first element is always 0 for exclusive scan + + // Compute the rest of the elements + for (int i = 1; i < n; ++i) { + indices[i] = indices[i - 1] + bools[i - 1]; // exclusive scan + } + + // Compute total count of non-zero elements + int count = indices[n - 1] + bools[n - 1]; + + // Scatter - Write all non-zero elements to odata at computed indices + for (int i = 0; i < n; ++i) { + + // If the element is non-zero, write it to the output array at the scanned index + if (bools[i] == 1) { + odata[indices[i]] = idata[i]; // scatter + } + } + + delete[] bools; // free temporary boolean array + delete[] indices; // free temporary indices array + + timer().endCpuTimer(); + return count; // return number of non-zero elements + //return -1; + } + } +} diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h new file mode 100644 index 000000000..873c04769 --- /dev/null +++ b/stream_compaction/cpu.h @@ -0,0 +1,15 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace CPU { + StreamCompaction::Common::PerformanceTimer& timer(); + + void scan(int n, int *odata, const int *idata); + + int compactWithoutScan(int n, int *odata, const int *idata); + + int compactWithScan(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu new file mode 100644 index 000000000..23703d4bc --- /dev/null +++ b/stream_compaction/efficient.cu @@ -0,0 +1,531 @@ +#include +#include +#include "common.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace Efficient { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + // __global__ kernels and device code here + + // Up-Sweep (reduce) kernel + __global__ void kernUpSweep(int n, int step, int* data) { + int index = 
(threadIdx.x + blockIdx.x * blockDim.x) * step + (step - 1); // get the index of the last element in the current segment + + // make sure we don't go out of bounds + if (index < n) { + data[index] += data[index - step / 2]; // add the value of the left child to the parent + } + } + + // Down-Sweep kernel + __global__ void kernDownSweep(int n, int step, int* data) { + + int index = (threadIdx.x + blockIdx.x * blockDim.x) * step + (step - 1); // get the index of the last element in the current segment + + // make sure we don't go out of bounds + if (index < n) { + int left = index - step / 2; // get the index of the left child + int t = data[left]; // store the left child's value + data[left] = data[index]; // set the left child's value to the parent's value + data[index] += t; // set the parent's value to the sum of the left child's value and the parent's value + } + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int *odata, const int *idata) { + + // TODO + // Edge case: if n <= 0, return + if (n <= 0) { + return; + } + + // Allocate space rounded up to next power of two + int pow2 = 1 << ilog2ceil(n); // next power of 2 + int* dev_data = nullptr; // device array + cudaMalloc(&dev_data, pow2 * sizeof(int)); // allocate device memory + checkCUDAError("cudaMalloc dev_data for efficient scan"); // check for errors + + // Copy input data to device and pad the rest with 0s + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data to device + + // Pad the rest with 0s + if (pow2 > n) { + cudaMemset(dev_data + n, 0, (pow2 - n) * sizeof(int)); // set the rest to 0 + } + checkCUDAError("cudaMemcpy H2D and padding for efficient scan"); // check for errors + + timer().startGpuTimer(); + // TODO + + const int blockSize = 128; // number of threads per block + + // Up-sweep (reduce) phase: build sum in place + for (int step = 2; step <= pow2; step *= 2) { + int threads = pow2 / step; // number of add operations at this level + int fullBlocks = (threads + blockSize - 1) / blockSize; // number of blocks needed + kernUpSweep<<<fullBlocks, blockSize>>>(pow2, step, dev_data); // launch kernel + checkCUDAError("kernUpSweep kernel"); // check for errors + } + + // Set the last element (total sum) to 0 for exclusive scan + cudaMemset(dev_data + (pow2 - 1), 0, sizeof(int)); // set the last element to 0 + checkCUDAError("cudaMemset root (exclusive scan)"); // check for errors + + // Down-sweep phase: distribute the prefix sums + for (int step = pow2; step >= 2; step /= 2) { + + // launch kernel + int threads = pow2 / step; // number of add operations at this level + int fullBlocks = (threads + blockSize - 1) / blockSize; // number of blocks needed + kernDownSweep<<<fullBlocks, blockSize>>>(pow2, step, dev_data); // launch kernel + checkCUDAError("kernDownSweep kernel"); // check for errors + } + + timer().endGpuTimer(); + + // Read back the scanned result (first n elements) + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); // copy result back to host + checkCUDAError("cudaMemcpy D2H for efficient scan result"); // check for errors + + cudaFree(dev_data); // free device memory + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction.
+ */ + int compact(int n, int *odata, const int *idata) { + timer().startGpuTimer(); + // TODO + + // Edge case: if n <= 0, return 0 + if (n <= 0) { + timer().endGpuTimer(); + return 0; + } + + // Device memory allocation + int* dev_idata = nullptr; // device input array + int* dev_bools = nullptr; // device boolean array + int* dev_indices = nullptr; // device indices array + int* dev_odata = nullptr; // device output array + + cudaMalloc(&dev_idata, n * sizeof(int)); // allocate device memory for input + cudaMalloc(&dev_bools, n * sizeof(int)); // allocate device memory for boolean array + cudaMalloc(&dev_odata, n * sizeof(int)); // allocate device memory for output + checkCUDAError("cudaMalloc failed for compaction arrays"); + + // We will allocate dev_indices with padding for scan + int pow2 = 1 << ilog2ceil(n); // next power of 2 + cudaMalloc(&dev_indices, pow2 * sizeof(int)); // allocate device memory for indices with padding + checkCUDAError("cudaMalloc failed for indices array"); // check for errors + + // Copy input to device + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data to device + checkCUDAError("cudaMemcpy H2D for compaction input"); // check for errors + + // Map input to booleans (1 = keep, 0 = discard) + int blockSize = 128; // number of threads per block + int fullBlocks = (n + blockSize - 1) / blockSize; // number of blocks needed + StreamCompaction::Common::kernMapToBoolean<<<fullBlocks, blockSize>>>(n, dev_bools, dev_idata); // launch kernel + checkCUDAError("kernMapToBoolean kernel"); // check for errors + + // Scan on dev_bools -> dev_indices (inclusive of padding) + // Copy bools to indices array (and pad remaining space with 0) + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); // copy bools to indices + + if (pow2 > n) { + cudaMemset(dev_indices + n, 0, (pow2 - n) * sizeof(int)); // pad remaining space with 0 + } + checkCUDAError("copy + pad boolean array for scan"); + + // Up-sweep phase on indices array + for (int step = 2; step <= pow2; step *= 2) { + + int threads = pow2 / step; // number of add operations at this level + fullBlocks = (threads + blockSize - 1) / blockSize; // number of blocks needed + kernUpSweep<<<fullBlocks, blockSize>>>(pow2, step, dev_indices); // launch kernel + checkCUDAError("kernUpSweep (compaction) kernel"); + } + + // Set last element to 0 (prepare for exclusive scan) + cudaMemset(dev_indices + (pow2 - 1), 0, sizeof(int)); // set last element to 0 + checkCUDAError("cudaMemset root for compaction scan"); + + // Down-sweep phase + for (int step = pow2; step >= 2; step /= 2) { + + int threads = pow2 / step; // number of add operations at this level + fullBlocks = (threads + blockSize - 1) / blockSize; // number of blocks needed + kernDownSweep<<<fullBlocks, blockSize>>>(pow2, step, dev_indices); // launch kernel + checkCUDAError("kernDownSweep (compaction) kernel"); + } + + // Scatter non-zero elements to output array using computed indices + fullBlocks = (n + blockSize - 1) / blockSize; // number of blocks needed + StreamCompaction::Common::kernScatter<<<fullBlocks, blockSize>>>( + n, dev_odata, dev_idata, dev_bools, dev_indices); // launch kernel + checkCUDAError("kernScatter kernel"); + + + timer().endGpuTimer(); + + // Copy compacted data back to host + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy D2H for compaction output"); + + // Compute and return count of non-zero elements + int count = 0; + int lastBool, lastIndex; + cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int),
cudaMemcpyDeviceToHost); + cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy D2H for compaction count"); + if (n > 0) { + count = lastIndex + lastBool; + } + + cudaFree(dev_idata); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_odata); + return count; + } + } + + /////////////////////////////////////////////////////////////////////////////// + //Extra Credit + + + // Radix Sort + namespace Radix { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() { + static PerformanceTimer timer; + return timer; + } + + // Kernel to flip the sign bit of each integer (convert to "unsigned" key by XOR with 0x80000000) + __global__ void kernFlipBits(int n, int* data) { + int i = threadIdx.x + blockIdx.x * blockDim.x; // global index + if (i < n) { + data[i] ^= 0x80000000; // Flip MSB to handle signed values + } + } + + // Kernel to map each element's specific bit to a boolean (0 or 1). + __global__ void kernMapToBit(int n, int bit, int* bools, const int* idata) { + int i = threadIdx.x + blockIdx.x * blockDim.x; // global index + if (i < n) { + int mask = 1 << bit; + // If the bit is set, mark 1; otherwise 0 + bools[i] = (idata[i] & mask) ? 1 : 0; + } + } + + // Kernel to scatter elements into output array based on computed indices for radix sort. + __global__ void kernScatterRadix(int n, const int* idata, int* odata, + const int* bools, const int* indices, int totalFalse) { + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < n) { + if (bools[i] == 0) { + // Element has 0 in current bit, write to index = i - number of 1s before i + int pos = i - indices[i]; + odata[pos] = idata[i]; + } + else { + // Element has 1 in current bit, write to index = totalFalse + (number of 1s before i) + int pos = totalFalse + indices[i]; + odata[pos] = idata[i]; + } + } + } + + /** + * Performs radix sort on idata, storing the sorted result into odata. + * Handles signed integers by using a bit flip (XOR 0x80000000) so that negative numbers are sorted correctly. + */ + void sort(int n, int* odata, const int* idata) { + if (n <= 0) { + return; + } + // Device memory allocation + int* dev_in = nullptr; // input array + int* dev_out = nullptr; // output array + int* dev_bools = nullptr; // boolean array for current bit + int* dev_indices = nullptr; // scanned indices array + + cudaMalloc(&dev_in, n * sizeof(int)); // allocate device memory for input + cudaMalloc(&dev_out, n * sizeof(int)); // allocate device memory for output + cudaMalloc(&dev_bools, n * sizeof(int)); // allocate device memory for boolean array + + int pow2Length = 1 << ilog2ceil(n); // next power of 2 for scan + cudaMalloc(&dev_indices, pow2Length * sizeof(int)); // allocate device memory for indices with padding + checkCUDAError("cudaMalloc failed for radix sort arrays"); + + // Copy input data to device + cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy H2D for radix sort input"); + + // Transform all numbers by flipping MSB (to handle signed ints) + const int blockSize = 128; + int fullBlocks = (n + blockSize - 1) / blockSize; // number of blocks needed + kernFlipBits<<<fullBlocks, blockSize>>>(n, dev_in); // flip sign bits + checkCUDAError("kernFlipBits kernel"); + + timer().startGpuTimer(); + // Loop over each bit (0 to 31) + for (int bit = 0; bit < 32; ++bit) { + + // Map each element to a boolean (is bit == 1?)
+ fullBlocks = (n + blockSize - 1) / blockSize; + kernMapToBit<<<fullBlocks, blockSize>>>(n, bit, dev_bools, dev_in); // map to booleans + checkCUDAError("kernMapToBit kernel"); + + // Copy boolean array to dev_indices (pad to next power of two length) + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); // copy bools to indices + if (pow2Length > n) { + cudaMemset(dev_indices + n, 0, (pow2Length - n) * sizeof(int)); + } + checkCUDAError("cudaMemcpy + cudaMemset for boolean array"); + + // Exclusive scan (prefix sum) on dev_indices (boolean array) using Blelloch scan (upsweep and downsweep) + // Upsweep phase + for (int stride = 2; stride <= pow2Length; stride *= 2) { + int threads = pow2Length / stride; + int blocks = (threads + blockSize - 1) / blockSize; + StreamCompaction::Efficient::kernUpSweep<<<blocks, blockSize>>>(pow2Length, stride, dev_indices); + checkCUDAError("kernUpSweep (radix) kernel"); + } + // Set last element to 0 (prepare for exclusive scan) + cudaMemset(dev_indices + (pow2Length - 1), 0, sizeof(int)); + checkCUDAError("cudaMemset root for radix scan"); + // Downsweep phase + for (int stride = pow2Length; stride >= 2; stride /= 2) { + int threads = pow2Length / stride; + int blocks = (threads + blockSize - 1) / blockSize; + StreamCompaction::Efficient::kernDownSweep<<<blocks, blockSize>>>(pow2Length, stride, dev_indices); + checkCUDAError("kernDownSweep (radix) kernel"); + } + + // Calculate total number of 0s (false) for this bit to determine scatter offsets + int lastBool, lastIndex; + cudaMemcpy(&lastBool, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastIndex, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy D2H for radix scan totals"); + int totalOnes = lastIndex + lastBool; + int totalZeros = n - totalOnes; + + // Scatter: stable partition into dev_out based on bit + fullBlocks = (n + blockSize - 1) / blockSize; + kernScatterRadix<<<fullBlocks, blockSize>>>(n, dev_in, dev_out, dev_bools, dev_indices, totalZeros); + checkCUDAError("kernScatterRadix kernel"); + + // Swap dev_in and dev_out for next iteration + int* temp = dev_in; + dev_in = dev_out; + dev_out = temp; + } + timer().endGpuTimer(); + + // After 32 iterations, dev_in now holds the fully sorted values (with MSB flipped). + // Flip bits back to restore original sign bits + fullBlocks = (n + blockSize - 1) / blockSize; + kernFlipBits<<<fullBlocks, blockSize>>>(n, dev_in); + checkCUDAError("kernFlipBits kernel (restore)"); + + // Copy sorted data back to host + cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy D2H for radix sort output"); + + // Free allocated memory + cudaFree(dev_in); + cudaFree(dev_out); + cudaFree(dev_bools); + cudaFree(dev_indices); + } + } + + /////////////////////////////////////////////////////////////////////////////// + namespace Shared { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() { + static PerformanceTimer timer; + return timer; + } + + // Macro for conflict-free indexing offset (to avoid shared memory bank conflicts) + #define LOG_NUM_BANKS 5 // 32 banks + #define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS) + + // Kernel to perform scan (exclusive prefix sum) on each block using shared memory (Blelloch algorithm). + // Also computes the sum of each block's elements and stores it in blockSums[blockIdx.x].
+        __global__ void kernScanBlock(int n, const int* idata, int* odata, int* blockSums) {
+            extern __shared__ int temp[]; // shared memory for the scan (padded against bank conflicts)
+            int index = threadIdx.x;
+            int globalIndex = blockIdx.x * blockDim.x + index;
+            int blockSize = blockDim.x;
+            // Load data into shared memory (with conflict-free padding)
+            if (globalIndex < n) {
+                temp[index + CONFLICT_FREE_OFFSET(index)] = idata[globalIndex];
+            }
+            else {
+                temp[index + CONFLICT_FREE_OFFSET(index)] = 0;
+            }
+            __syncthreads();
+
+            int paddedLength = blockSize; // each block scans a blockSize-element segment (zero-padded at the tail)
+            // Up-sweep (reduce) phase
+            for (int offset = 1; offset < paddedLength; offset *= 2) {
+                int idx = (index + 1) * offset * 2 - 1;
+                if (idx < paddedLength) {
+                    int ai = idx - offset;
+                    int bi = idx;
+                    // Add the value at ai into bi, with conflict-free indexing
+                    int offAi = ai + CONFLICT_FREE_OFFSET(ai);
+                    int offBi = bi + CONFLICT_FREE_OFFSET(bi);
+                    temp[offBi] += temp[offAi];
+                }
+                __syncthreads();
+            }
+
+            // Save the total sum of this block and clear the last element for the down-sweep
+            if (index == 0) {
+                int lastIdx = paddedLength - 1;
+                int offLast = lastIdx + CONFLICT_FREE_OFFSET(lastIdx);
+                blockSums[blockIdx.x] = temp[offLast]; // total sum of the block
+                temp[offLast] = 0; // set the root to 0 for an exclusive scan
+            }
+            __syncthreads();
+
+            // Down-sweep phase (starting at paddedLength / 2; larger offsets do no work)
+            for (int offset = paddedLength / 2; offset >= 1; offset /= 2) {
+                int idx = (index + 1) * offset * 2 - 1;
+                if (idx < paddedLength) {
+                    int ai = idx - offset;
+                    int bi = idx;
+                    int offAi = ai + CONFLICT_FREE_OFFSET(ai);
+                    int offBi = bi + CONFLICT_FREE_OFFSET(bi);
+                    int t = temp[offAi];
+                    temp[offAi] = temp[offBi];
+                    temp[offBi] += t;
+                }
+                __syncthreads();
+            }
+
+            // Write this block's exclusive scan results to global memory
+            if (globalIndex < n) {
+                odata[globalIndex] = temp[index + CONFLICT_FREE_OFFSET(index)];
+            }
+        }
+
+        // Kernel to add each block's offset to every element of the scanned blocks.
+        __global__ void kernAddBlockOffsets(int n, const int* blockOffsets, int* odata) {
+            int i = threadIdx.x + blockIdx.x * blockDim.x;
+            if (i < n) {
+                // Determine which block this index belongs to
+                // (assumes the same block size as kernScanBlock)
+                int blockId = i / blockDim.x;
+                if (blockId > 0) {
+                    // Add the sum of all preceding blocks to this element
+                    odata[i] += blockOffsets[blockId];
+                }
+            }
+        }
+
+        /**
+         * Performs an exclusive prefix sum (scan) on idata, storing the result into odata.
+         * Uses a work-efficient per-block scan in shared memory, then a second pass
+         * that scans the block sums and adds each block's offset to its elements.
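+         *
+         * Illustrative two-pass example (hypothetical input, blockSize = 4):
+         *   input     = [1, 2, 3, 4, 5, 6, 7, 8]
+         *   per-block = [0, 1, 3, 6 | 0, 5, 11, 18]   (exclusive scan of each block)
+         *   blockSums = [10, 26] -> exclusive scan -> [0, 10]
+         *   output    = [0, 1, 3, 6, 10, 15, 21, 28]  (block 1 shifted by 10)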
+         */
+        void scan(int n, int* odata, const int* idata) {
+            if (n <= 0) {
+                return;
+            }
+            // Allocate device memory for input and output
+            int* dev_in = nullptr;
+            int* dev_out = nullptr;
+            int* dev_blockSums = nullptr;
+            int* dev_blockScan = nullptr;
+            cudaMalloc(&dev_in, n * sizeof(int));
+            cudaMalloc(&dev_out, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_in/dev_out for shared scan");
+            // Determine the number of blocks needed
+            const int blockSize = 128;
+            int numBlocks = (n + blockSize - 1) / blockSize;
+            cudaMalloc(&dev_blockSums, numBlocks * sizeof(int));
+            // Allocate the scanned-block-sums array (padded to the next power of two)
+            int pow2Blocks = 1 << ilog2ceil(numBlocks);
+            cudaMalloc(&dev_blockScan, pow2Blocks * sizeof(int));
+            checkCUDAError("cudaMalloc blockSums/blockScan for shared scan");
+
+            // Copy input to device
+            cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy H2D for shared scan input");
+
+            timer().startGpuTimer();
+            // Scan each block's data into dev_out and write the per-block sums.
+            // Dynamic shared memory: (blockSize + blockSize/32) ints per block for the
+            // conflict-free padding, passed as the third launch parameter.
+            size_t sharedMemSize = (blockSize + (blockSize >> 5)) * sizeof(int);
+            kernScanBlock<<<numBlocks, blockSize, sharedMemSize>>>(n, dev_in, dev_out, dev_blockSums);
+            checkCUDAError("kernScanBlock kernel");
+
+            // Scan the array of block sums to get the offset for each block.
+            // Copy the block sums into the padded array first.
+            cudaMemcpy(dev_blockScan, dev_blockSums, numBlocks * sizeof(int), cudaMemcpyDeviceToDevice);
+            if (pow2Blocks > numBlocks) {
+                cudaMemset(dev_blockScan + numBlocks, 0, (pow2Blocks - numBlocks) * sizeof(int));
+            }
+            checkCUDAError("cudaMemcpy + cudaMemset for block sums");
+            // Up-sweep on the block sums
+            for (int stride = 2; stride <= pow2Blocks; stride *= 2) {
+                int threads = pow2Blocks / stride;
+                int blocks = (threads + blockSize - 1) / blockSize;
+                StreamCompaction::Efficient::kernUpSweep<<<blocks, blockSize>>>(pow2Blocks, stride, dev_blockScan);
+                checkCUDAError("kernUpSweep (blockSums) kernel");
+            }
+            // Set the last element to 0 for an exclusive scan
+            cudaMemset(dev_blockScan + (pow2Blocks - 1), 0, sizeof(int));
+            checkCUDAError("cudaMemset root for block sums scan");
+            // Down-sweep on the block sums
+            for (int stride = pow2Blocks; stride >= 2; stride /= 2) {
+                int threads = pow2Blocks / stride;
+                int blocks = (threads + blockSize - 1) / blockSize;
+                StreamCompaction::Efficient::kernDownSweep<<<blocks, blockSize>>>(pow2Blocks, stride, dev_blockScan);
+                checkCUDAError("kernDownSweep (blockSums) kernel");
+            }
+
+            // Add each block's offset to every element of dev_out for the final result
+            int fullBlocks = (n + blockSize - 1) / blockSize;
+            kernAddBlockOffsets<<<fullBlocks, blockSize>>>(n, dev_blockScan, dev_out);
+            checkCUDAError("kernAddBlockOffsets kernel");
+            timer().endGpuTimer();
+
+            // Copy the result back to host
+            cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy D2H for shared scan output");
+
+            // Free device memory
+            cudaFree(dev_in);
+            cudaFree(dev_out);
+            cudaFree(dev_blockSums);
+            cudaFree(dev_blockScan);
+        }
+    }
+}
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
new file mode 100644
index 000000000..1178b4ea4
--- /dev/null
+++ b/stream_compaction/efficient.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace Efficient {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+        int compact(int n, int *odata, const int *idata);
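+
+        // Example usage (illustrative sketch; assumes compact keeps the
+        // non-zero elements, as in the project's compaction criterion):
+        //   int in[8] = {0, 1, 0, 3, 0, 5, 6, 0};
+        //   int out[8];
+        //   scan(8, out, in);                // out = {0, 0, 1, 1, 4, 4, 9, 15}
+        //   int count = compact(8, out, in); // out = {1, 3, 5, 6}, count = 4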
+    }
+
+    // Extra credit
+    namespace Radix {
+        StreamCompaction::Common::PerformanceTimer& timer();
+        // Stable LSD radix sort, from the least significant bit to the most significant
+        void sort(int n, int* odata, const int* idata);
+    }
+    namespace Shared {
+        StreamCompaction::Common::PerformanceTimer& timer();
+        // Scan using per-block shared-memory kernels
+        void scan(int n, int* odata, const int* idata);
+    }
+}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
new file mode 100644
index 000000000..ab792d108
--- /dev/null
+++ b/stream_compaction/naive.cu
@@ -0,0 +1,89 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "naive.h"
+
+namespace StreamCompaction {
+    namespace Naive {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+
+        // Performs one step of the scan, with the given offset
+        __global__ void kernScanStep(int n, int offset, const int* idata, int* odata) {
+
+            int k = threadIdx.x + blockIdx.x * blockDim.x; // map from threadIdx to array index
+
+            if (k >= n) return; // check array bounds
+
+            if (k >= offset) {
+                // Add the element at [k - offset] (the previous partial sum)
+                odata[k] = idata[k] + idata[k - offset];
+            }
+            else {
+                // Copy the element as is (no element to add from behind)
+                odata[k] = idata[k];
+            }
+        }
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+
+            if (n <= 0) {
+                return; // no elements to scan
+            }
+
+            // Allocate device memory
+            int* dev_in = nullptr;  // input array
+            int* dev_out = nullptr; // output array
+            cudaMalloc(&dev_in, n * sizeof(int));
+            cudaMalloc(&dev_out, n * sizeof(int));
+            checkCUDAError("cudaMalloc failed for scan buffers");
+
+            // Exclusive scan initial state: dev_in[0] = 0, dev_in[1..n-1] = idata[0..n-2]
+            cudaMemset(dev_in, 0, sizeof(int)); // set the first element to 0
+            if (n > 1) {
+                cudaMemcpy(dev_in + 1, idata, (n - 1) * sizeof(int), cudaMemcpyHostToDevice); // shifted copy
+            }
+            checkCUDAError("cudaMemcpy H2D for input data");
+
+            timer().startGpuTimer();
+
+            // Iterative Hillis-Steele scan: offset = 1, 2, 4, ..., < n
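+            // Illustrative trace (hypothetical idata = [1, 2, 3, 4], so the
+            // shifted buffer starts as [0, 1, 2, 3]):
+            //   offset 1: [0, 1, 3, 5]
+            //   offset 2: [0, 1, 3, 6]   (= exclusive scan of idata)
+            // Each pass adds the element `offset` positions back, doubling the
+            // length of the prefix already summed into each slot.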
+            const int blockSize = 128; // number of threads per block
+
+            // Ping-pong buffers: each iteration reads from dev_in, writes to dev_out, then swaps
+            for (int offset = 1; offset < n; offset *= 2) {
+
+                // Launch the kernel for this pass
+                int fullBlocks = (n + blockSize - 1) / blockSize;
+                kernScanStep<<<fullBlocks, blockSize>>>(n, offset, dev_in, dev_out);
+                checkCUDAError("kernScanStep kernel");
+
+                cudaDeviceSynchronize(); // not needed for correctness (same stream); kept for timing
+                // Swap the ping-pong buffers for the next iteration
+                int* temp = dev_in;
+                dev_in = dev_out;
+                dev_out = temp;
+            }
+
+            timer().endGpuTimer();
+
+            // Because of the final swap, dev_in holds the scanned output
+            cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy D2H for scan result");
+
+            cudaFree(dev_in);
+            cudaFree(dev_out);
+        }
+    }
+}
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
new file mode 100644
index 000000000..37dcb0643
--- /dev/null
+++ b/stream_compaction/naive.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace Naive {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+    }
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
new file mode 100644
index 000000000..7ab620361
--- /dev/null
+++ b/stream_compaction/thrust.cu
@@ -0,0 +1,51 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/scan.h>
+#include "common.h"
+#include "thrust.h"
+
+namespace StreamCompaction {
+    namespace Thrust {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer;
+            return timer;
+        }
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+
+            if (n <= 0) {
+                return;
+            }
+
+            // Allocate device_vectors and copy the input data
+            // (thrust::device_vector manages device memory and cleans up automatically)
+            thrust::device_vector<int> dv_in(n);
+            thrust::device_vector<int> dv_out(n);
+            cudaMemcpy(thrust::raw_pointer_cast(dv_in.data()), idata,
+                n * sizeof(int), cudaMemcpyHostToDevice); // copy input array to device
+            checkCUDAError("cudaMemcpy H2D for thrust scan input");
+
+            timer().startGpuTimer();
+            // Use the Thrust library's exclusive_scan on the device vector
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            timer().endGpuTimer();
+
+            // Copy the result back to host
+            cudaMemcpy(odata, thrust::raw_pointer_cast(dv_out.data()),
+                n * sizeof(int), cudaMemcpyDeviceToHost); // copy output array to host
+            checkCUDAError("cudaMemcpy D2H for thrust scan output");
+        }
+    }
+}
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
new file mode 100644
index 000000000..fe98206b3
--- /dev/null
+++ b/stream_compaction/thrust.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+    namespace Thrust {
+        StreamCompaction::Common::PerformanceTimer& timer();
+
+        void scan(int n, int *odata, const int *idata);
+    }
+}