Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zero 2W 64bit port #261

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ set(DEFAULT_TO_SINGLE_CORE_BOARD OFF)
set(DEFAULT_TO_ARMV6Z OFF)
set(DEFAULT_TO_ARMV7A OFF)
set(DEFAULT_TO_ARMV8A OFF)
set(DEFAULT_USE_VCSM_CMA OFF)

# http://ozzmaker.com/check-raspberry-software-hardware-version-command-line/
if (BOARD_REVISION MATCHES "(0002)|(0003)|(0004)|(0005)|(0006)|(0007)|(0008)|(0009)" OR BOARD_REVISION MATCHES "(000d)|(000e)|(000f)|(0010)|(0011)|(0012)" OR BOARD_REVISION MATCHES "(900092)|(900093)|(9000c1)")
Expand All @@ -46,7 +47,21 @@ if (SINGLE_CORE_BOARD)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSINGLE_CORE_BOARD=1")
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -mabi=aapcs-linux -mhard-float -mfloat-abi=hard -mlittle-endian -mtls-dialect=gnu2 -funsafe-math-optimizations")
# AARCH64: opt-in switch (off by default) for building a 64-bit aarch64 binary.
option(AARCH64 "Target a Raspberry Pi with aarch64 architecture" NO)
if (AARCH64)
message(STATUS "Enable AARCH64 build")
# Changes the default value that the USE_VCSM_CMA option below is declared with to ON.
set(DEFAULT_USE_VCSM_CMA ON)
# Compared to the else() branch, the -marm/-mabi/-mhard-float/-mfloat-abi/-mtls-dialect switches are left out
# and -DTIMER_32BIT is defined in addition.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlittle-endian -funsafe-math-optimizations -DTIMER_32BIT")
else()
# Default (non-AARCH64) build: ARM-mode, hard-float AAPCS flag set.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -mabi=aapcs-linux -mhard-float -mfloat-abi=hard -mlittle-endian -mtls-dialect=gnu2 -funsafe-math-optimizations")
endif()

# USE_VCSM_CMA: when enabled, -DUSE_VCSM_CMA=1 is appended to the C++ compiler flags.
# The option's default is taken from DEFAULT_USE_VCSM_CMA; a value already stored in the CMake cache
# (e.g. passed with -DUSE_VCSM_CMA=...) takes precedence over that default.
option(USE_VCSM_CMA "Map Memory from CPU instead of GPU" ${DEFAULT_USE_VCSM_CMA})
if (USE_VCSM_CMA)
message(STATUS "Enabling Map Memory from CPU instead of GPU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_VCSM_CMA=1")
endif()


option(ARMV6Z "Target a Raspberry Pi with ARMv6Z instruction set (Pi 1A, 1A+, 1B, 1B+, Zero, Zero W)" ${DEFAULT_TO_ARMV6Z})
if (ARMV6Z)
Expand Down
63 changes: 63 additions & 0 deletions cma.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#ifdef USE_VCSM_CMA

#include "config.h"
#include "cma.h"
#include "util.h"
#include <sys/ioctl.h>
#include <fcntl.h>
#include <syslog.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int cma_fd = -1;
#define PAGE_SIZE 4096

// Opens the /dev/vcsm-cma device node for read/write access and stores the
// descriptor in the file-level cma_fd. Raises FATAL_ERROR when the open fails.
void OpenVCSM(void) {
  const int openFlags = O_RDWR | O_SYNC;
  cma_fd = open("/dev/vcsm-cma", openFlags);
  if (!(cma_fd >= 0)) {
    FATAL_ERROR("can't open /dev/vcsm-cma");
  }
}

// Closes the descriptor opened by OpenVCSM(), if any.
// The descriptor is reset to -1 afterwards so that calling this function a second
// time is a no-op instead of closing a stale (and possibly reused) descriptor number.
void CloseVCSM(void) {
  if (cma_fd >= 0) {
    close(cma_fd);
    cma_fd = -1;
  }
}

// Capacity in bytes of the fixed-size Allocate::name field.
const int NAME_LENGTH = 32;

// Argument block handed to the allocation ioctl that AllocateCMA() issues on cma_fd.
// NOTE(review): the field order and widths presumably have to mirror the kernel driver's
// allocation struct exactly - confirm against the kernel's vc-sm-cma ioctl header before changing.
struct Allocate {
/* user -> kernel */
uint32_t size; // requested size in bytes; AllocateCMA() rounds it up to a multiple of PAGE_SIZE
uint32_t num; // AllocateCMA() always sets this to 1
uint32_t flags; // AllocateCMA() passes 0, which it labels "NO cache"
uint32_t pad; // left zeroed by the memset in AllocateCMA()
char name[NAME_LENGTH]; // label for the allocation, copied from the caller's 'reason' string

/* kernel -> user */
int32_t fd; // a negative value is treated as failure by AllocateCMA(); otherwise reported as CMAInfo::fd
uint32_t vcHandle; // reported to the caller as CMAInfo::vcHandle
uint64_t dmaAddr; // reported to the caller as CMAInfo::dmaAddr
};

// Requests a block of memory from the vcsm-cma device opened by OpenVCSM().
//   reason: label stored in the request's name field (truncated to NAME_LENGTH-1 characters);
//           may be NULL, in which case the name is left empty.
//   req:    number of bytes wanted; rounded up to a multiple of PAGE_SIZE.
//   res:    receives the rounded size, the descriptor, the handle and the DMA address on success.
// Returns 0 on success, or -1 if res is NULL, the request does not fit the 32-bit size field,
// or the ioctl fails / yields a negative descriptor.
int AllocateCMA(const char* reason, size_t req, CMAInfo* res) {
  if (res == NULL) {
    return -1;
  }
  // Allocate::size is only 32 bits wide: reject requests that would wrap or be truncated
  // once rounded up to the next page boundary, instead of silently allocating less.
  if (req > (size_t)0xFFFFFFFFu - (size_t)(PAGE_SIZE - 1)) {
    return -1;
  }

  Allocate ctx;
  memset(&ctx, 0, sizeof(ctx));
  ctx.size = (uint32_t)ALIGN_UP(req, PAGE_SIZE);
  ctx.num = 1;
  ctx.flags = 0; // NO cache
  if (reason != NULL) {
    // At most NAME_LENGTH-1 bytes are copied; the memset above supplies the terminating NUL.
    strncpy(ctx.name, reason, NAME_LENGTH - 1);
  }

  if (ioctl(cma_fd, _IOR('J', 0x5A, struct Allocate), &ctx) < 0 || ctx.fd < 0) { // allocate cmd
    return -1;
  }

  res->size = ctx.size;
  res->vcHandle = ctx.vcHandle;
  res->dmaAddr = ctx.dmaAddr;
  res->fd = ctx.fd;
  return 0;
}

#endif
16 changes: 16 additions & 0 deletions cma.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#pragma once
#ifdef USE_VCSM_CMA

#include <memory.h>
#include <inttypes.h>
// Result of a successful AllocateCMA() call.
struct CMAInfo {
size_t size; // size of the allocation in bytes, after rounding up to the page size
uintptr_t dmaAddr; // DMA address reported by the driver for the allocation
uint32_t fd; // descriptor returned by the driver; dma.cpp mmap()s it and close()s it to free the memory
uint32_t vcHandle; // handle value reported by the driver
};

// Opens /dev/vcsm-cma; raises FATAL_ERROR if the device cannot be opened.
void OpenVCSM(void);
// Closes the descriptor opened by OpenVCSM(), if one is open.
void CloseVCSM(void);
// Allocates at least 'req' bytes through the opened device and fills '*res'.
// 'reason' is a label copied into the request. Returns 0 on success and -1 on failure.
int AllocateCMA(const char* reason, size_t req, CMAInfo* res);
#endif
37 changes: 35 additions & 2 deletions dma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
#include "util.h"
#include "mailbox.h"

#ifdef USE_VCSM_CMA
#include "cma.h"
#endif

#ifdef USE_DMA_TRANSFERS

#define BCM2835_PERI_BASE 0x3F000000
Expand All @@ -36,6 +40,9 @@ struct GpuMemory
void *virtualAddr;
uintptr_t busAddress;
uint32_t sizeBytes;
#ifdef USE_VCSM_CMA
uint32_t vcHandle;
#endif
};

#define NUM_DMA_CBS 1024
Expand Down Expand Up @@ -127,7 +134,33 @@ void FreeDMAChannel(int channel)
#define VIRT_TO_BUS(block, x) ((uintptr_t)(x) - (uintptr_t)((block).virtualAddr) + (block).busAddress)

uint64_t totalGpuMemoryUsed = 0;
#ifdef USE_VCSM_CMA

// Releases a block obtained from AllocateUncachedGpuMemory(): removes the CPU mapping
// and closes the descriptor that backs the allocation.
void FreeUncachedGpuMemory(GpuMemory mem) {
  // The allocator calls this on its mmap() failure path, where virtualAddr is MAP_FAILED;
  // only unmap when a real mapping exists.
  if (mem.virtualAddr != MAP_FAILED && mem.virtualAddr != NULL) {
    munmap(mem.virtualAddr, mem.sizeBytes);
  }
  close(mem.allocationHandle);
}

// Allocates numBytes (rounded up by AllocateCMA) through the vcsm-cma device and maps the
// block into this process with a shared read/write mapping.
//   numBytes: requested size in bytes.
//   reason:   label for the allocation, also printed in the log line.
// Returns a GpuMemory describing the mapping, bus address, descriptor and handle.
// Raises FATAL_ERROR if the allocation or the mapping fails.
GpuMemory AllocateUncachedGpuMemory(uint32_t numBytes, const char *reason) {
  GpuMemory mem;
  CMAInfo ctx;
  if (AllocateCMA(reason, numBytes, &ctx) != 0) {
    FATAL_ERROR("alloc cma failed");
  }
  mem.sizeBytes = ctx.size;
  mem.busAddress = ctx.dmaAddr;
  mem.allocationHandle = ctx.fd;
  mem.vcHandle = ctx.vcHandle;
  mem.virtualAddr = mmap(0, mem.sizeBytes, PROT_READ | PROT_WRITE, MAP_SHARED, ctx.fd, 0);
  if (mem.virtualAddr == MAP_FAILED) {
    // Nothing was mapped, so only the descriptor has to be released.
    close(mem.allocationHandle);
    FATAL_ERROR("Failed to mmap CMA memory!");
  }
  // Account for the block only once it is known to be usable.
  totalGpuMemoryUsed += mem.sizeBytes;
  // uint64_t is 'unsigned long' on aarch64, so cast explicitly to match the %llu conversion.
  printf("Allocated %u bytes of GPU memory for %s (bus address=%p). Total GPU memory used: %llu bytes\n", mem.sizeBytes, reason, (void*)mem.busAddress, (unsigned long long)totalGpuMemoryUsed);
  return mem;
}
#else
// Allocates the given number of bytes in GPU side memory, and returns the virtual address and physical bus address of the allocated memory block.
// The virtual address holds an uncached view to the allocated memory, so writes and reads to that memory address bypass the L1 and L2 caches. Use
// this kind of memory to pass data blocks over to the DMA controller to process.
Expand All @@ -154,6 +187,7 @@ void FreeUncachedGpuMemory(GpuMemory mem)
Mailbox(MEM_UNLOCK_MESSAGE, mem.allocationHandle);
Mailbox(MEM_FREE_MESSAGE, mem.allocationHandle);
}
#endif

volatile DMAChannelRegisterFile *GetDMAChannel(int channelNumber)
{
Expand Down Expand Up @@ -720,8 +754,7 @@ void SPIDMATransfer(SPITask *task)
while((dmaTx->cs & BCM2835_DMA_CS_ACTIVE))
{
CheckSPIDMAChannelsNotStolen();
if (tick() - dmaTaskStart > 5000000)
FATAL_ERROR("DMA TX channel has stalled!");
if (tick() - dmaTaskStart > 5000000) FATAL_ERROR("DMA TX channel has stalled!");
}
while((dmaRx->cs & BCM2835_DMA_CS_ACTIVE))
{
Expand Down
27 changes: 23 additions & 4 deletions spi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ void ChipSelectHigh();
#define TOGGLE_CHIP_SELECT_LINE() ((void)0)
#endif

#ifdef USE_VCSM_CMA
#include "cma.h"
#endif

static uint32_t writeCounter = 0;

#define WRITE_FIFO(word) do { \
Expand All @@ -49,7 +53,11 @@ volatile SPIRegisterFile *spi = 0;

// Points to the system timer register. N.B. spec sheet says this is two low and high parts, in an 32-bit aligned (but not 64-bit aligned) address. Profiling shows
// that Pi 3 Model B does allow reading this as a u64 load, and even when unaligned, it is around 30% faster to do so compared to loading in parts "lo | (hi << 32)".
#ifdef TIMER_32BIT
// TIMER_32BIT builds: pointer to the timer register block declared as struct systemTimer in tick.h,
// so the counter is read as two separate 32-bit words.
volatile systemTimer *systemTimerRegister = 0;
#else
// Default: pointer through which the counter is read with a single 64-bit load.
volatile uint64_t *systemTimerRegister = 0;
#endif

void DumpSPICS(uint32_t reg)
{
Expand Down Expand Up @@ -510,13 +518,20 @@ int InitSPI()
// Memory map GPIO and SPI peripherals for direct access
mem_fd = open("/dev/mem", O_RDWR|O_SYNC);
if (mem_fd < 0) FATAL_ERROR("can't open /dev/mem (run as sudo)");
#ifdef USE_VCSM_CMA
OpenVCSM();
#endif
printf("bcm_host_get_peripheral_address: %p, bcm_host_get_peripheral_size: %u, bcm_host_get_sdram_address: %p\n", bcm_host_get_peripheral_address(), bcm_host_get_peripheral_size(), bcm_host_get_sdram_address());
bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address());
if (bcm2835 == MAP_FAILED) FATAL_ERROR("mapping /dev/mem failed");
spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE);
gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835 + BCM2835_GPIO_BASE);
systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
// TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
#ifdef TIMER_32BIT
systemTimerRegister = (volatile TIMER_TYPE*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE);
#else
systemTimerRegister = (volatile TIMER_TYPE*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
#endif
// TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
#endif

uint32_t currentBcmCoreSpeed = MailboxRet2(0x00030002/*Get Clock Rate*/, 0x4/*CORE*/);
Expand Down Expand Up @@ -553,10 +568,11 @@ int InitSPI()
// Set the SPI 0 pin explicitly to output, and enable chip select on the line by setting it to low.
// fbcp-ili9341 assumes exclusive access to the SPI0 bus, and exclusive presence of only one device on the bus,
// which is (permanently) activated here.
SET_GPIO_MODE(GPIO_SPI0_CE0, 0x01);
CLEAR_GPIO(GPIO_SPI0_CE0);
SET_GPIO_MODE(GPIO_SPI0_CE1, 0x01);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably didn't mean to permanently change this to CS1 outside of the #ifdef DISPLAY_USES_CS1 below?

CLEAR_GPIO(GPIO_SPI0_CE1);
#ifdef DISPLAY_USES_CS1
SET_GPIO_MODE(GPIO_SPI0_CE1, 0x01);
CLEAR_GPIO(GPIO_SPI0_CE1);
#endif
#endif

Expand Down Expand Up @@ -658,6 +674,9 @@ void DeinitSPI()
close(mem_fd);
mem_fd = -1;
}
#ifdef USE_VCSM_CMA
CloseVCSM();
#endif

#ifndef KERNEL_MODULE_CLIENT

Expand Down
13 changes: 13 additions & 0 deletions tick.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,21 @@
#include <unistd.h>

// Initialized in spi.cpp along with the rest of the BCM2835 peripheral:
#ifdef TIMER_32BIT
// Register block of the system timer, accessed as individual 32-bit words.
// clo/chi hold the low and high halves of the 64-bit counter.
// NOTE(review): cs and c[] are presumably the control/status and compare registers - confirm against the datasheet.
struct __attribute__((packed, aligned(4))) systemTimer {
  volatile uint32_t cs;
  volatile uint32_t clo;
  volatile uint32_t chi;
  volatile uint32_t c[4];
};
#define TIMER_TYPE systemTimer
extern volatile systemTimer* systemTimerRegister;

// Reads the 64-bit counter from its two 32-bit halves.
// The halves cannot be read atomically: if the low word rolls over between the two loads, combining
// them yields a value that is off by 2^32. The high word is therefore sampled before and after the
// low word, and the read is retried until both samples agree.
static inline uint64_t ReadSystemTimer64(void)
{
  uint32_t hi;
  uint32_t lo;
  do
  {
    hi = systemTimerRegister->chi;
    lo = systemTimerRegister->clo;
  } while (systemTimerRegister->chi != hi);
  return ((uint64_t)hi << 32) | (uint64_t)lo;
}
#define tick() ReadSystemTimer64()
#else
// The counter is read with a single 64-bit load through systemTimerRegister.
#define TIMER_TYPE uint64_t
extern volatile uint64_t *systemTimerRegister;
#define tick() (*systemTimerRegister)
#endif

#endif

Expand Down