Diffstat (limited to 'Notes')
-rw-r--r-- | Notes/BugStories.txt            |  48
-rw-r--r-- | Notes/CodegenCEmbeddedTags.diff | 168
-rw-r--r-- | Notes/MIR-Optimisations.md      |  67
-rw-r--r-- | Notes/MIR-PackedLValue.txt      |  65
-rw-r--r-- | Notes/NOTES.txt                 | 177
-rw-r--r-- | Notes/SMiri-GenericFfi.md       |  46
-rw-r--r-- | Notes/TopLevelTypecheck.md      |  17
-rw-r--r-- | Notes/Typeck.txt                |  12
-rw-r--r-- | Notes/todo.txt                  |  55
9 files changed, 649 insertions, 6 deletions
diff --git a/Notes/BugStories.txt b/Notes/BugStories.txt
new file mode 100644
index 00000000..b7437fb6
--- /dev/null
+++ b/Notes/BugStories.txt
@@ -0,0 +1,48 @@
+2019-07-13: Sizeof mismatch
+===========================
+
+librustc takes ~20mins to recompile, and that's just one crate out of ~80 in rustc
+- rustc failing on a consistency check
+- Turn on debug logging, add more to find where things become inconsistent
+- Add an assertion that a hashmap insert worked
+- Started crashing earlier (but same place)
+- Print the map, item is there
+- Start dumping hashes, they don't match
+- Check equality of pre-hash values, they match
+- Check input to hasher, don't see anything too odd
+- Add more debug to make it easier to see the instance hashing
+- Notice the difference, a pointer difference?
+- Match the hash inputs to fields, find an odd pair with the `ty::Slice` type
+- Start chasing its Hash impl down, it's supposed to just hash the pointer
+  - Actual impl hashes two words, not just one
+  - The source code checks sizeof to pick between one/two word hashing
+  - But post-optimisation it's always picking two
+  - Turn off that optimisation and rebuild librustc, no change?
+- Check libcore's metadata, it has the bug already (in the generic version?)
+- Enable optimisation debug and rebuild libcore
+- Oh look, `sizeof<*const T>` where `T: ?Sized` is returning 16 instead of returning "I don't know yet"
+
+
+2019-07-20: Leaking MPSC handles
+================================
+
+- rustc just stopping after codegen
+- gdb backtrace shows only one thread, waiting on an MPSC receiver
+- Add debugging for MPSC Shared handles for that specific type, LOTS of them being made (testing on a 14 core machine)
+- Turn down codegen units to 1, now a total of 6 handles (much easier)
+- Check all allocation paths, looks like all of them should call the destructor on the returned handle...
+  - One path hands a handle to a thread, let's chase that down
+  - Nope... that destructor does get called... hmm...
+- Break down (after two weekends) and hack in handle indexes to mpsc::Shared
+  (on clone, allocate a unique ID and store that in the handle).
+- Re-run, printing the handle indexes - Notice that one code path (two
+  handles) leaks its handles
+- Re-check, the destructor is called... but I can't break on it?
+- Chase down the call chain, reach a Box drop impl (wait... that exists, I
+  thought the compiler made a custom one)
+  - It does absolutely nothing. No free, no destructor... oops
+- Turns out that 1.29 added a no-op Drop impl for Box (1.19 didn't have one), which
+  caused mrustc's magic Drop impl to not be created.
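+
+A minimal Rust sketch (an illustration, not the actual rustc source) of the `size_of`
+behaviour at the heart of the first story: inside a generic function the size of
+`*const T` depends on whether `T` ends up being sized, so it cannot be folded to a
+constant before monomorphisation.
+
+```rust
+use std::mem::size_of;
+
+// Number of pointer-sized words in `*const T`: 1 for thin pointers,
+// 2 for fat (slice / trait-object) pointers.
+fn ptr_words<T: ?Sized>() -> usize {
+    size_of::<*const T>() / size_of::<usize>()
+}
+
+fn main() {
+    assert_eq!(ptr_words::<u32>(), 1);   // thin pointer
+    assert_eq!(ptr_words::<[u32]>(), 2); // fat pointer: data + length
+}
+```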
+
+<!-- vim: ft=markdown
+-->
diff --git a/Notes/CodegenCEmbeddedTags.diff b/Notes/CodegenCEmbeddedTags.diff
new file mode 100644
index 00000000..62fea1cb
--- /dev/null
+++ b/Notes/CodegenCEmbeddedTags.diff
@@ -0,0 +1,168 @@
+diff --git a/src/trans/codegen_c.cpp b/src/trans/codegen_c.cpp
+index 9af87d47..bcf296db 100644
+--- a/src/trans/codegen_c.cpp
++++ b/src/trans/codegen_c.cpp
+@@ -1385,12 +1385,22 @@ namespace {
+ 
+     void emit_enum_path(const TypeRepr* repr, const TypeRepr::FieldPath& path)
+     {
+-        if( TU_TEST1(repr->variants, Values, .field.index == path.index) )
++        if( TU_TEST1(repr->variants, Values, .field.index == path.index) || path.index == ~0u )
+         {
+-            m_of << ".TAG";
++            // TODO: The tag can be within the union
++            if( repr->fields.size() >= 2 )
++            {
++                m_of << ".DATA.var_tag.TAG";
++            }
++            else
++            {
++                m_of << ".TAG";
++            }
++            return ;
+         }
+         else
+         {
++            // TODO: Support for non-zero offsets?
+             m_of << ".DATA.var_" << path.index;
+         }
+         const auto* ty = &repr->fields[path.index].ty;
+@@ -1458,55 +1468,44 @@ namespace {
+             assert(1 + union_fields.size() + 1 >= repr->fields.size());
+             // Make the union!
+             // NOTE: The way the structure generation works is that enum variants are always first, so the field index = the variant index
+-            // TODO:
+-            if( !this->type_is_bad_zst(repr->fields[0].ty) || ::std::any_of(union_fields.begin(), union_fields.end(), [this,repr](auto x){ return !this->type_is_bad_zst(repr->fields[x].ty); }) )
++            m_of << "\tunion {\n";
++            bool is_tagged = (1 + union_fields.size() < repr->fields.size());
++            // > First field
++            for(size_t idx = 0; idx < repr->fields.size()-(is_tagged ? 1 : 0); idx ++)
+             {
+-                m_of << "\tunion {\n";
+-                // > First field
+-                {
+-                    m_of << "\t\t";
+-                    const auto& ty = repr->fields[0].ty;
+-                    if( this->type_is_bad_zst(ty) ) {
+-                        m_of << "// ZST: " << ty << "\n";
+-                    }
+-                    else {
+-                        emit_ctype( ty, FMT_CB(ss, ss << "var_0") );
+-                        m_of << ";\n";
+-                        //sized_fields ++;
+-                    }
++                m_of << "\t\tstruct {";
++                const auto& ty = repr->fields[idx].ty;
++                if( this->type_is_bad_zst(ty) ) {
++                    m_of << "/* ZST: " << ty << "*/";
+                 }
+-                // > All others
+-                for(auto idx : union_fields)
++                else {
++                    emit_ctype( ty, FMT_CB(os, os << "d") );
++                    m_of << ";";
++                    //sized_fields ++;
++                }
++                if( is_tagged )
+                 {
+-                    m_of << "\t\t";
+-
+-                    const auto& ty = repr->fields[idx].ty;
+-                    if( this->type_is_bad_zst(ty) ) {
+-                        m_of << "// ZST: " << ty << "\n";
+-                    }
+-                    else {
+-                        emit_ctype( ty, FMT_CB(ss, ss << "var_" << idx) );
+-                        m_of << ";\n";
+-                        //sized_fields ++;
++                    auto padbytes = repr->fields.back().offset - Target_GetSizeOf(ty);
++                    if( padbytes > 0 )
++                    {
++                        m_of << "char pad[" << padbytes << "];";
+                     }
++                    emit_ctype(repr->fields.back().ty, FMT_CB(os, os << "TAG")); m_of << ";";
+                 }
+-                m_of << "\t} DATA;\n";
++                m_of << "} var_" << idx << "\n";
+             }
+-
+-            if( repr->fields.size() == 1 + union_fields.size() )
++            // NOTE: The tag can be placed within variants (overlapping with padding bytes)
++            if( !is_tagged )
+             {
+                 // No tag, the tag is in one of the fields.
+                 DEBUG("Untagged, nonzero or other");
+             }
+             else
+             {
+-                //assert(repr->fields.back().offset != repr->fields.front().offset);
+                 DEBUG("Tag present at offset " << repr->fields.back().offset << " - " << repr->fields.back().ty);
+-
+-                m_of << "\t";
+-                emit_ctype(repr->fields.back().ty, FMT_CB(os, os << "TAG"));
+-                m_of << ";\n";
++                m_of << "\t\tstruct { char pad[" << repr->fields.back().offset << "]; "; emit_ctype(repr->fields.back().ty, FMT_CB(os, os << "TAG")); m_of << "; } var_tag;\n";
+             }
++            m_of << "\t} DATA;\n";
+         }
+         else if( repr->fields.size() == 1 )
+         {
+@@ -1591,7 +1590,7 @@ namespace {
+         {
+             auto var_lv =::MIR::LValue::new_Downcast(mv$(self), 0);
+ 
+-            m_of << "\tswitch(rv->TAG) {\n";
++            m_of << "\tswitch((*rv)";emit_enum_path(repr, {~0u}); m_of << ") {\n";
+             for(unsigned int var_idx = 0; var_idx < e->values.size(); var_idx ++)
+             {
+                 m_of << "\tcase " << e->values[var_idx] << ":\n";
+@@ -1652,7 +1651,7 @@ namespace {
+             {
+             case TypeRepr::VariantMode::TAGDEAD: throw "";
+             TU_ARM(repr->variants, Values, ve) {
+-                m_of << " .TAG = "; emit_enum_variant_val(repr, var_idx); m_of << ",";
++                m_of << " .DATA = { .var_tag = { .TAG = "; emit_enum_variant_val(repr, var_idx); m_of << " }},";
+                 } break;
+             TU_ARM(repr->variants, NonZero, ve) {
+                 } break;
+@@ -1949,7 +1948,7 @@ namespace {
+                     emit_literal(ity, *e.val, params);
+                     m_of << " }, ";
+                 }
+-                m_of << ".TAG = "; emit_enum_variant_val(repr, e.idx);
++                emit_enum_path(repr, {~0u}); m_of << " = "; emit_enum_variant_val(repr, e.idx);
+                 m_of << "}";
+                 }
+             ),
+@@ -3292,11 +3291,11 @@ namespace {
+                 }
+                 else if( enm_p->is_value() )
+                 {
+-                    emit_lvalue(e.dst); m_of << ".TAG = "; emit_enum_variant_val(repr, ve.index);
++                    emit_lvalue(e.dst); emit_enum_path(repr, {~0u}); m_of << " = "; emit_enum_variant_val(repr, ve.index);
+                 }
+                 else
+                 {
+-                    emit_lvalue(e.dst); m_of << ".TAG = "; emit_enum_variant_val(repr, ve.index);
++                    emit_lvalue(e.dst); emit_enum_path(repr, {~0u}); m_of << " = "; emit_enum_variant_val(repr, ve.index);
+ 
+                     ::HIR::TypeRef tmp;
+                     const auto& vty = mir_res.get_param_type(tmp, ve.val);
+@@ -3542,7 +3541,7 @@ namespace {
+                 if (ve.type.m_data.is_Primitive() && ty.m_data.is_Path() && ty.m_data.as_Path().binding.is_Enum())
+                 {
+                     emit_lvalue(ve.val);
+-                    m_of << ".TAG";
++                    m_of << ".TAG"; // TODO: Fix for tag locations? (may not be needed for value-only enums)
+                     special = true;
+                 }
+                 if (!special)
+@@ -3605,7 +3604,7 @@ namespace {
+                 case ::HIR::CoreType::Str:
+                     MIR_BUG(mir_res, "Unsized tag?!");
+                 }
+-                m_of << indent << "switch("; emit_lvalue(val); m_of << ".TAG) {\n";
++                m_of << indent << "switch("; emit_lvalue(val); emit_enum_path(repr, {~0u}); m_of << ") {\n";
+                 for(size_t j = 0; j < n_arms; j ++)
+                 {
+                     // TODO: Get type of this field and check if it's signed.
diff --git a/Notes/MIR-Optimisations.md b/Notes/MIR-Optimisations.md
index 57f29c88..df7d8239 100644
--- a/Notes/MIR-Optimisations.md
+++ b/Notes/MIR-Optimisations.md
@@ -18,6 +18,19 @@ If a recorded local is used via field access, record the field and location
 At the end of the block, remove original tuple assignment and propagate the values to destinations.
 
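+A rough before/after sketch of the rewrite described above (the MIR shown in the
+comments is illustrative shorthand, not mrustc's actual lowering):
+
+```rust
+// Before the pass:              After the pass:
+//   _1 = (_a, _b);                <tuple assignment removed>
+//   _2 = (_1).0;                  _2 = _a;
+//   _3 = (_1).1;                  _3 = _b;
+fn sum(a: u32, b: u32) -> u32 {
+    let t = (a, b);
+    t.0 + t.1
+}
+
+fn main() { assert_eq!(sum(2, 3), 5); }
+```
+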
+Simple De-Temporary
+===================
+
+Purpose: Remove single-use temporaries
+`_1 = ...; _2 = _1` -> `_2 = ...`
+
+Algorithm
+---------
+
+- Locate locals that are only ever read/written once
+- If the use is an assignment AND the destination isn't invalidated between the first assign and second
+  - Replace the original assignment with the second destination
+
 De-Temporary (version 1)
 ========================
@@ -186,3 +199,57 @@ CFG Simplification
 ==================
 Purpose: Remove gotos to blocks that only have a single use
+
+Borrow Elimination
+==================
+Purpose: Remove borrows generated by method calls that have been inlined
+
+Overview
+--------
+- Find locals that are only assigned once, and only ever used via a deref
+  operation.
+  - Allow drops to use the destination by value
+- If the assignment was a Borrow RValue, continue
+- Iterate forward, replacing usage of the value with the original source while:
+  - The source isn't invalidated
+  - The number of replacements is less than the number of known usage sites
+- If the original borrow itself contained a deref of a local, update that
+  local's usage count.
+- If all usage sites were updated, erase the original borrow
+
+
+Borrow Elimination 2
+====================
+Purpose: Remove borrows of borrows that are never used again
+
+Overview
+--------
+- Find assign-once/use-once locals (ignoring drop)
+- If the source is a borrow of a deref of a local/argument, continue
+- If the source inner value is of the same type as the local, continue
+  - I.e. we're not re-borrowing a `&mut` as `&`
+- If the variable's type is not `&` (i.e. it's &mut or &uniq)
+  - Check the usage count for the source value, and only continue if this is
+    the only usage site of the source
+  - Ideally: This would only check forwards, but that's hard
+- If the variable's type is `&`, check the path between assignment and usage
+  for invalidation.
+- If no invalidation found, replace usage with inner of assignment and erase
+  assignment.
+
+
+Slice Transmute
+===============
+Purpose: Detect places where slices are created by transmuting from a (usize,usize) and replace with
+a dedicated MIR statement.
+
+Overview
+--------
+- Find transmute calls with a destination type of `&[T]`
+- Check the input type:
+  - `(usize,usize)` : good
+  - `struct Foo { usize, usize }` : good
+- Check if the input was a use/write once local
+- Locate assignment of input
+  - Check if it was a struct/tuple literal
+- Remove assignment, replace transmute with a `MAKEDST` op
diff --git a/Notes/MIR-PackedLValue.txt b/Notes/MIR-PackedLValue.txt
new file mode 100644
index 00000000..ce889aee
--- /dev/null
+++ b/Notes/MIR-PackedLValue.txt
@@ -0,0 +1,65 @@
+Problem statement:
+- MIR LValues are very common, and suffer from excessive indirection when
+  dereferences and field accesses are present
+- Many MIR analysis passes care most about the inner values
+- Pointer chasing ruins cache locality
+
+Solution: Replace the tagged union tree with a flattened structure
+
+Quirk: Indexing takes two LValues to produce one BUT one of those is of a
+lesser class, so doesn't need to be treated the same.
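+
+For orientation, a rough sketch of the shape change (written in Rust purely for
+illustration - mrustc itself is C++, and the real proposal below packs the tag
+bits into the pointer/index words):
+
+```rust
+// Current shape: a recursive tagged union, one heap node per deref/field level.
+enum TreeLValue {
+    Argument(u32),
+    Local(u32),
+    Deref(Box<TreeLValue>),
+    Field(Box<TreeLValue>, u32),
+    Downcast(Box<TreeLValue>, u32),
+    Index(Box<TreeLValue>, Box<TreeLValue>),
+}
+
+// Proposed shape: one root plus a flat, contiguous list of wrappers.
+enum Wrapper { Deref, Field(u32), Downcast(u32), Index(u32 /* local index, the simpler alternative below */) }
+struct FlatLValue { root: u32, wrappers: Vec<Wrapper> }
+
+fn main() {
+    // `(*arg0).0` in both representations:
+    let _tree = TreeLValue::Field(Box::new(TreeLValue::Deref(Box::new(TreeLValue::Argument(0)))), 0);
+    let _flat = FlatLValue { root: 0, wrappers: vec![Wrapper::Deref, Wrapper::Field(0)] };
+}
+```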
+
+
+
+Structure proposal:
+----
+A LValue is made up of:
+- A root value (referencing a local, argument, static, or the return value)
+- And a list of wrappers (dereference, field, downcast, index)
+
+Root values are encoded as a packed pointer/value and tag, with the tag stored in the low 2 bits of the pointer
+- This allows a 32-bit pointer to a word to be stored, using the alignment bits as tag
+- Arguments and locals are encoded with the argument/local index in the "data" bits
+- Return value is encoded as argument `-1` (all 1 bits in data)
+- Statics are encoded as a pointer to a `::HIR::Path`
+  - This adds a new pointer access and allocation vs the existing LValue structure
+  - HIR::Path already has a bunch of pointers in it, may not hurt that much (and may help by keeping the normal size
+    of the LValue down)
+
+Wrappers are stored as a vector of words, packed in a similar way to root values
+- Dereference is stored just as an entry (could pack multiple derefs into one, but that would make handling more
+  difficult)
+- Field/Downcast is stored with the field/variant index in the "data" bits
+- Indexing is stored as a pointer to a LValue
+  - ALTERNATIVE: Could require that indexing always uses a local, and just store the local index
+    - This would vastly reduce complexity in handling the Index wrapper, BUT would add a new statement in some cases
+    - A quick scan of the MMIR output of libstd crates shows that the vast majority of indexing cases use a local
+      directly.
+    - Doing so would simplify monomorph/clone/serialise (no need to clone the wrapper list, just copy it).
+    - Would also improve comparison times
+
+
+
+Usecase comparisons
+------
+(Using a 32-bit architecture)
+
+NOTES:
+- Existing `LValue` structure is 3 words long (1 tag, plus 2 pointers for largest variant)
+  - WRONG. It's actually far larger due to the ::HIR::Path embedded in it (estimate at least 8 pointers, very likely
+    more). That should be fixed given the number of LValue-s that exist
+  - Fixed now, had a slight improvement in compile times (and memory usage?)
+- New structure is 4 words long (root value, plus len/cap/ptr for vector)
+
+- Field access via `&self`
+  - Old: Field(Deref(Argument(0)), 0)
+    - 12 + 12 + 12 = 36 bytes w/ 2 pointers
+  - New: LValue( Argument(0), { Deref, Field(0) } )
+    - 16 + 8 = 24 bytes w/ 1 pointer
+
+- Array stored in `&self` (common in librand)
+  - `(*arg0).2[var16].0`
+  - Old: Field( Index(Field(Deref(Argument(0)), 2), Local(16)), 0 )
+    - 12 * 5 + 12 = 72 bytes w/ 5 pointers
+  - New: LValue( Argument(0), { Deref, Field(2), Index(16), Field(0) } )
+    - 16 + 16 = 32 bytes w/ 1 pointer
diff --git a/Notes/NOTES.txt b/Notes/NOTES.txt
new file mode 100644
index 00000000..40c81152
--- /dev/null
+++ b/Notes/NOTES.txt
@@ -0,0 +1,177 @@
+
+Interesting topics:
+- Evolution of the codebase (from the initial AST-to-C conversion, through to the current MIR-based codegen)
+- Typecheck pain. Complications from inference combined with coercions
+- Shortcuts taken: no borrowck, no privacy (surprisingly works), no coherence
+- MIR match generation
+- Design choices and deviation: MIR structure, proc macros
+- `macro_rules!`, aborted attempts
+- C backend pain (compiler bugs, standards quirks, workarounds)
+- Future Evolution: Borrowcheck, architectural changes (for consteval, expand/resolve interaction)
+
+
+Introduction
+============
+- What is it
+- Origin and original intention
+  - Always been the intention to convert rust into C
+  - Meant to break the bootstrap chain
+  - But also for fun
+  - Started as just a parser and AST, with the intention of just converting the AST straight into C
+- Frozen version
+  - Frozen to target rustc 1.19, because that was the stable version at the time
+  - The built rustc can then go and build 1.20 and bootstrap continues down the line.
+- Rewrites
+  - No SW engineering project gets things right first time, almost everything has been through at least one re-factor
+- Architecture
+  - Parsing, expansion, and name resolution
+  - HIR generated from name-resolved AST (simpler, no more relative paths)
+  - Consteval, typechecking, closure expansion
+  - MIR generated from the fully type-annotated HIR (high-level, rust-specialised assembly)
+  - MIR validated, optimised, validated again, monomorphised, optimised again
+  - MIR passed to the codegen backend (C generator usually)
+  - Module tree and MIR serialised and saved (for libraries)
+- Side-projects
+  - `minicargo` - Clone of cargo, but specialised for building rustc/cargo
+  - `standalone_miri` - Just-for-fun miri clone, with some very rough FFI support
+
+
+Evolution
+=========
+- Started as effectively just a parser back in 2014
+- Original intent was to convert the AST directly into C code
+- First attempt at typecheck was in AST, but AST has constructs that only apply before expand/resolve
+- HIR added to simplify post-expand passes (remove syntactic sugar, relative paths, ...)
+- HIR Typecheck originally worked directly with the syntax tree, worked up to a point
+- HIR Typecheck v2 switched to enumerating rules/inference variables from the source tree, and then resolved those until stability.
+  - Typecheck is still a pain in the rear, enough to make another article.
+- HIR went direct to MIR (this was after rustc gained a MIR)
+  - This isn't exactly necessary, as it should be possible to convert HIR to C.
+  - But, at this stage, mrustc was so close to a fully-fledged compiler, might as well go the full way
+- `macro_rules!` - Rewritten a few times, but that's its own article
+- MIR optimisation added not long after codegen, in an attempt to reduce time spent in the C compiler (it worked, but optimisation is expensive)
+
+
+Typecheck Pain
+==============
+- Rust has type inference interacting with coercions and trait resolution
+- Rustc's type resolution (... as far as I know) works directly on the AST (well, HIR/HAIR)
+- Mrustc did that to begin with, but it makes reasoning about inference interactions hard
+- Re-wrote type resolution to build up equality, coercion, and complex rules, then iterate over them to do type equalities
+  - The complex rules handle things like method calls, operators, casts, patterns with `..` in them, ...
+
+Inference on its own is relatively easy, and fixed coercion sets (with associated coercion points) are a little harder, but once trait bounds are introduced the system becomes extremely complex.
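+
+A small hand-written example (not from the mrustc sources) of that interaction: the
+vector's element type starts as an inference variable, each `Box::new(..)` is a
+coercion point to `Box<dyn Fn(u32) -> u32>`, and `.sum()` needs the `Sum` trait plus
+the iterator's item type - none of these can be decided in isolation.
+
+```rust
+fn main() {
+    let callbacks: Vec<Box<dyn Fn(u32) -> u32>> = vec![
+        Box::new(|x| x + 1), // unsized coercion: closure type -> dyn Fn
+        Box::new(|x| x * 2),
+    ];
+    let total: u32 = callbacks.iter().map(|f| f(10)).sum();
+    assert_eq!(total, 31); // (10+1) + (10*2)
+}
+```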
+
+
+Shortcuts
+========
+- Borrow checking (by design) doesn't impact codegen in any way
+- Privacy technically impacts name lookups (with autoderef), but isn't used in rustc/cargo
+- Coherence just exists to aid the reader/compiler, mrustc just searches every known impl (because it's simpler)
+
+MIR Match Generation
+====================
+Pattern matching is a complex part of rust, especially if you're interested in generating efficient code.
+
+The simplest model just handles match statements as chained `if let` statements (check each pattern, and fall on to the next one if it doesn't match).
+This is actually how `match` logically works (you can see this when using `if` guards on match arms, or by having a `_` pattern before other patterns), and in the degenerate case chained pattern checks are the only correct conversion.
+
+For efficiency though, you'd want to convert match into effectively a C switch statement (a jump table, binary search, or just an easy-to-predict chain of comparisons).
+
+Mrustc does this by first turning match arms into a list of pattern rules (simplified patterns that handle destructuring and value ranges), then checking if it's possible to re-order and combine the patterns into an efficient form. If possible, then the arms are sorted and parsed into a decision tree in MIR (using binary searches, switch-like constructs, and simple equality checks). If not possible (mostly if there are if-guarded arms), then the rules are turned into chained pattern checks and left with the hope that the backend can optimise them better.
+
+All said and done, MIR generation for match statements clocks in to be larger than the rest of MIR generation (by a significant margin), and that's not accounting for the sections it shares with the rest (scope handling is another aspect that is complicated by match statements and how they extend value lifetimes).
+
+
+Design Choices
+==============
+
+MIR
+---
+- Prefer a smaller generated MIR over conceptual simplicity
+- Doesn't follow SSA, instead re-using and de-duplicating variables wherever possible
+
+Procedural Macros
+----------------
+- Had a chance to do proc macros the "right" way
+- Entirely IPC-based, with each macro invocation spawning a fresh process
+- IPC is via basic serialisation over stdin/stdout
+
+C as the primary backend
+------------------------
+- A pain in the rear to avoid compiler bugs (and undefined behavior) BUT
+- C is everywhere, while LLVM (and to a lesser extent gcc) have a constrained target range
+- Even with LLVM-supported targets, building/linking LLVM is a non-trivial task.
+  - mrustc's primary goal is to allow building rustc with minimal third-party code (just a working C++ compiler, which nearly always includes a working C compiler)
+
+
+`macro_rules!` attempts
+----------------------
+
+At first, macro handling went the full C++ way - for each arm the parser would attempt to parse the arm, catch the syntax error exception, and move on to the next arm. This wasn't exactly efficient, or easy to debug, but it did ensure that the macro evaluation parser and the main parser were the same. The downside here was that, because the lexer has to create tokens to hand out to the parser, and it's not known when parsing if another arm might still be tried, expensive copies need to be made of every single captured variable (even if they only ever get used once).
+
+The second attempt went the complete opposite way, attempting to parse the entire set of macro patterns into a decision tree.
+Turns out that rust macros are very like rust match statements, they logically work by trying each arm in sequence and picking the first one that works. This turns out to make a simple decision tree impossible (sometimes macros will have ambiguous captures, e.g. one arm capturing `:ident` and another `:expr` - both could match).
+
+The third and final model uses a vastly simplified version of the main parser, one that just checks if an input token stream is valid for the specified syntax fragment. Combined with a "lexer" that doesn't hand out ownership of the captured items, this new model is both correct and efficient (only doing expensive copies when captures are used multiple times). The only downside is that now there are two parsers involved with macro expansion (and it's easy for them to get out of sync).
+
+C Pains
+=======
+
+- Zero-sized structures
+  - Strictly speaking not allowed in C, required a lot of work to remove
+- gcc/msvc featureset mismatches
+  - The C backend has hard-coded asm to intrinsic mappings for MSVC
+- Alignment quirks
+  - Turns out that `uint64_t` has an alignment of 4 on x86 gcc
+- Compiler bugs!
+  - Older versions of gcc treating zero-sized structures as UB (and mis-optimising because of it)
+  - x86 FPU overflow, still an outstanding issue
+- Slooooow
+  - Despite optimisation in mrustc, translating MIR directly to C is slow, and the C optimisation/codegen phase isn't fast either.
+
+
+Rustc Bootstrap
+==============
+
+Proc macros
+-----------
+1.29.0's bootstrap program uses proc macro derive, so ends up dynamically
+linking with rustc. This leads to an ABI disagreement if rustc was built with
+mrustc.
+
+Solution: Build rustc manually using itself, by patching minicargo to support
+running with rustc.
+OR: Build rustc using a manual cargo invocation.
+- If this uses proc macros... bootstrap is "broken" at 1.29
+
+
+Future Evolution
+================
+
+Upgrade target version
+----------------------
+
+Mrustc targets rustc 1.19 currently, which is over a year old at this point. Upgrading however isn't a simple task, as new compiler features have been added (notably MIR-based consteval, which requires doing MIR generation for some items before typecheck is possible on others).
+
+Borrow Check
+------------
+
+More than one MIR generation issue has led to bugs that could have been caught by a borrowcheck implementation. Compared to other parts of the compiler (looking at you, match generation), a borrow checker itself wouldn't be too hard. What is hard about adding a borrow checker is region inference (propagating lifetime annotations around the type checking phase).
+
+
+On-demand codegen
+-----------------
+
+As part of upgrading the target version, and just removing old and redundant code, it'd be nice to support full MIR constant evaluation (currently the consteval phase has two evaluators - one handling local HIR, the other handling MIR from external sources). Generating MIR (and doing typecheck of required items) during the constant evaluation phase would completely remove the HIR evaluator, and all of the type guessing performed to allow HIR constant evaluation (which routinely falls over when exposed to new code).
+
+Parser Re-write
+---------------
+
+The mrustc parser is ancient, with some functions dating back to the start of the project in 2014. On its own, this isn't a problem, but the parsing model itself makes both debugging and extending the parser challenging.
+The current model works with a push-pull lexer (which hands out tokens, and allows the caller to push the token back if not needed). More recent code (e.g. the MIRI parser) uses a peek/consume model, where the lexer "owns" the tokens until the token is actually consumed - much simpler to debug (via logging when tokens are consumed) and a closer match to how the lexer gets used in practice.
+
+
+
+
+<!-- vim: ft=markdown
+-->
+
diff --git a/Notes/SMiri-GenericFfi.md b/Notes/SMiri-GenericFfi.md
new file mode 100644
index 00000000..d43b44ad
--- /dev/null
+++ b/Notes/SMiri-GenericFfi.md
@@ -0,0 +1,46 @@
+Problem description:
+====================
+
+There's a collection of hard-coded FFI hooks in `miri.cpp` to handle both API
+rules (e.g. that the `write` syscall takes a count and a buffer of at least
+that valid size) and wrapping non-trivial or dangerous calls (e.g. the pthread
+ones).
+
+
+It would be more useful to have a runtime description/encoding of these APIs
+that runs safety checks and handles the magic.
+
+
+Requirements
+============
+
+- Validity checks (pointer validity, tags)
+  - These checks should provide useful error messages
+- Returned allocation tagging (valid size, opaque tags)
+  - Some returned allocations can be complex
+- Completely re-defining operations
+  - E.g. replacing pthread APIs with checked versions
+
+
+
+Design Ideas
+============
+
+Raw MIR?
+-------
+- Downside: How would it differ from actual MIR?
+
+
+Basic-alike instruction sequence
+--------------------------------
+- All lines start with a keyword
+  - `ASSIGN`
+  - `LET`
+  - `CALL`
+
+
+Simplified rust-ish language
+----------------------------
+- Basic type inference (uni-directional)
+- Full AST
+
diff --git a/Notes/TopLevelTypecheck.md b/Notes/TopLevelTypecheck.md
new file mode 100644
index 00000000..de7b945c
--- /dev/null
+++ b/Notes/TopLevelTypecheck.md
@@ -0,0 +1,17 @@
+Problem: Top-level typecheck is fragile, doesn't handle bounds referencing
+each other.
+
+
+Can generate a similar rule structure to the expression handling?
+
+
+Or, create a new typecheck helper that can handle partially-checked bounds
+
+
+
+Requirements:
+- UFCS resolution in bounds and impl headers
+  - Does this need to search other impls?
+  - A quick check with the playpen implies that non-generics require fully-qualified paths.
+  - But generics don't.
+
diff --git a/Notes/Typeck.txt b/Notes/Typeck.txt
index 06b392c0..3ff88c5b 100644
--- a/Notes/Typeck.txt
+++ b/Notes/Typeck.txt
@@ -27,5 +27,17 @@ Needs to be able to point to functions in:
 Maybe can use separate types for each usecase?
 
+
+
+# Hard Questions
+
+## libgit2 inference quirk
+Query on rust type inference rules (as designed/intended, not as implemented)
+
+Can the presence of a trait impl influence type inference for the Self type?
+
+- `*mut _ : Convert<*mut T>`, if there's only one type that implements that
+  trait, is it valid to infer `_` based on that impl?
+
 <!-- vim: ft=markdown
 -->
diff --git a/Notes/todo.txt b/Notes/todo.txt
index 7041c8b9..5b244a34 100644
--- a/Notes/todo.txt
+++ b/Notes/todo.txt
@@ -6,24 +6,62 @@ TODO:
 - Remove variables that are just assigned from arguments
 - Clean up AST
   - Almost done, just a few little niggles left
-- Split arg patterns and arg types up for cleaner serialisation
-  - Could be a good idea to do as part of changing HIR::TraitImpl to only contain the impl data
-  - May not be too useful due to argument monomorphisation.
 - Optimise typecheck.
+  - Mostly done with trait resolve cleanup
 
 ## Big change TODOs
 - Support MIR-only RLibs
+  - Defer C codegen until final binary generation?
+- Dylib support (requires trans API restructure)
+  - Will improve disk usage for tests (can dynamically link libstd)
 - Fix Span annotations
+  - Spans are missing on HIR items, sub-par reporting in typecheck and later
+  - Spans can be off by a line or two
 - Refactor parse to use a consume model lexer
 - Optimise optimise (and typecheck)
+  - Partially done with trait resolution optimisation. Needs further profiling
 - Complete structed C codegen
+  - Upside: It'll look cool
+  - Unknown: Will it be faster?
+- RTL/register-based SSA interim backend
+  - Convert MIR into a form that can be handed over to LLVM or Cranelift (or GIMPLE)
+  - Use alloca-s for non-pointer/integer/borrowed locals
 
 ## Smaller changes
-- Only generate destructors if needed (removes C warnings)
+- Make type ascription its own node type
+  - AST and HIR
+  - Then remove per-node type annotations from AST
 - Cache specialisation tree
-- Dependency files from mrustc
-  - Partally done, not complete
+  - TODO: Profile to determine if this is a slow point.
+- Delete HIR after MIR generation
+  - Problem: HIR is sometimes touched again after MIR gen
+  - May just be able to do the delete in the main MIR gen phase
+- Split types and patterns in HIR function arguments
+  - Upsides:
+    - Less data in the serialised .hir file
+      - Note: Patterns aren't actually stored in metadata
+    - Simpler logic post MIR generation
+      - Reduced use of .first/.second
+    - Memory usage reduction for external functions
+  - Downsides:
+    - Lots of code touched
+    - Extra complexity in typecheck and MIR lowering?
+- Sort trait impls in a similar way to type impls
+  - ```c++
+    template<T> struct ImplGroup {
+        std::map<SimplePath, T> named;
+        std::vector<T> primitives;
+        std::vector<T> generic;
+    }
+    std::map<SimplePath, ImplGroup<TraitImpl>> trait_impls;
+    ``` <!-- `-->
+  - Problem: Trait lookup on ivars?
+  - TODO: Profile a large crate again to see what the overhead of searching
+    trait impls is.
+  - Should help with widely implemented traits (e.g. Debug)
+  - Downside: Another SimplePath per impl cluster, plus the vector size - Not
+    too big but something to be aware of (7 pointers overhead plus indirection).
 
 ## Optimisations
@@ -32,4 +70,9 @@ TODO:
 - Dead assignment removal (Delete `<val> = Use(<val>)`
 - Tuple destructure removal
   - Spot `#1 = (...,)`, `#2 = (#1).n` where #1 is Write-once, borrow-none
+- Remove useless borrows (remove write-once &T lvalues if they're never used by
+  value - only used via deref)
+
+<!-- vim: ft=markdown
+-->